In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

In [2]:
movies=pd.read_csv('movies_metadata.csv',low_memory=False)

In [3]:
movies.drop(['adult','budget','homepage','imdb_id','original_language','release_date'	,'revenue'	,'runtime',	'spoken_languages'	,'status',	'tagline','video','popularity','poster_path','production_companies','production_countries','original_title','belongs_to_collection'],axis=1,inplace=True)

In [4]:
movies.head()

Unnamed: 0,genres,id,overview,title,vote_average,vote_count
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,5415.0
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,Jumanji,6.9,2413.0
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,Grumpier Old Men,6.5,92.0
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,6.1,34.0
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,Father of the Bride Part II,5.7,173.0


In [5]:
df=movies.copy()

In [6]:
df.genres=df.genres.apply(ast.literal_eval)
df=df.explode('genres', ignore_index=False)
df['genres'] = df['genres'].apply(lambda x: {'id': 0, 'name': np.nan} if pd.isna(x) else x)
df[['genre_id', 'genre']] =df['genres'].apply(pd.Series)
df =df.drop(columns=['genres','genre_id'])
df.index.name = 'index'
df.head(10)

Unnamed: 0_level_0,id,overview,title,vote_average,vote_count,genre
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,862,"Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,5415.0,Animation
0,862,"Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,5415.0,Comedy
0,862,"Led by Woody, Andy's toys live happily in his ...",Toy Story,7.7,5415.0,Family
1,8844,When siblings Judy and Peter discover an encha...,Jumanji,6.9,2413.0,Adventure
1,8844,When siblings Judy and Peter discover an encha...,Jumanji,6.9,2413.0,Fantasy
1,8844,When siblings Judy and Peter discover an encha...,Jumanji,6.9,2413.0,Family
2,15602,A family wedding reignites the ancient feud be...,Grumpier Old Men,6.5,92.0,Romance
2,15602,A family wedding reignites the ancient feud be...,Grumpier Old Men,6.5,92.0,Comedy
3,31357,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,6.1,34.0,Comedy
3,31357,"Cheated on, mistreated and stepped on, the wom...",Waiting to Exhale,6.1,34.0,Drama


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosim

In [8]:
genre_updated=df.groupby('id')['genre'].sum().reset_index()

In [9]:
merged_df = pd.merge(genre_updated,movies, on='id')
merged_df=merged_df.drop(columns=['genres'])


In [10]:
tv =TfidfVectorizer(max_features=1800, stop_words='english')
merged_df['overview'] = merged_df['overview'].fillna('')
vectors = tv.fit_transform(merged_df['overview']).toarray()

In [11]:
similarity=np.float32(cosim(vectors))
similarity

array([[1.        , 0.0725389 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0725389 , 1.        , 0.        , ..., 0.06210516, 0.        ,
        0.05444781],
       [0.        , 0.        , 1.        , ..., 0.        , 0.05511521,
        0.        ],
       ...,
       [0.        , 0.06210516, 0.        , ..., 1.        , 0.06606935,
        0.        ],
       [0.        , 0.        , 0.05511521, ..., 0.06606935, 1.        ,
        0.07135075],
       [0.        , 0.05444781, 0.        , ..., 0.        , 0.07135075,
        1.        ]], dtype=float32)

In [12]:
def recommend(Movie):
    index=merged_df[merged_df['title']==Movie].index[0]
    movie_list=sorted(list(enumerate(similarity[index])),reverse=True, key=lambda x: x[1])[1:10]
    for i in movie_list:
        print(merged_df.loc[i[0],'title'])

In [13]:
recommend('The Prestige')	

Night Moves
Nightmares
The Keeper
Dig Two Graves
Cold Heart
The Great Gatsby
Linsanity
Circumstance
Forbidden Photos of a Lady Above Suspicion


In [14]:
def bayesian_average(vote_average, vote_count, global_avg, total_count, C=50):
    return (C * global_avg + vote_count * vote_average) / (C + vote_count)

In [15]:
global_avg_rating=merged_df.vote_average.mean()
total_vote_count=merged_df.vote_count.sum()
merged_df['bayesian_average'] = merged_df.apply(lambda row: bayesian_average(
    row['vote_average'],
    row['vote_count'],
    global_avg_rating,
    total_vote_count
), axis=1)

In [16]:
new_df=pd.merge(df, merged_df[['id', 'bayesian_average']], on='id')

In [17]:
genre_recommendation = new_df.groupby('genre').agg(
    bayesian_avg_mean=('bayesian_average', 'mean'),
    vote_count=('title', 'count')
)
genre_recommendation=genre_recommendation.reset_index()
print(genre_recommendation)


                                    genre  bayesian_avg_mean  vote_count
0                                  Action           5.765853        6602
1                               Adventure           5.849150        3508
2                               Animation           6.009152        1942
3                                 Aniplex                NaN           0
4                               BROSTA TV                NaN           0
5                    Carousel Productions                NaN           0
6                                  Comedy           5.775823       13194
7                                   Crime           5.888704        4313
8                             Documentary           5.795815        3936
9                                   Drama           5.859813       20306
10                                 Family           5.821170        2776
11                                Fantasy           5.838848        2321
12                                Foreign          

In [18]:
def movie_genre(Genre,n=10):
    sample_df=pd.DataFrame({})
    sample_df['title']=[]
    sample_df['ratings']=[]
    #index_=genre_recommendation[genre_recommendation['genre']==Genre]
    mean=genre_recommendation[genre_recommendation['genre']==Genre]['bayesian_avg_mean']
    genre_df=merged_df[merged_df['genre']==Genre]
    for i in range(len(genre_df)):
        if genre_df.iloc[i]['bayesian_average']>=mean.values:
            sample_df.loc[len(sample_df)]=[genre_df.iloc[i]['title'],genre_df.iloc[i]['bayesian_average']]
    sample_df.sort_values(by='ratings', ascending=False, inplace=True)
    print(sample_df.head(n))
            
    
    

In [19]:
movie_genre('Romance',20)

                      title   ratings
10                   Grease  7.153007
1                Safe Haven  6.827989
6    Palm Trees in the Snow  6.802428
3   Yeh Jawaani Hai Deewani  6.530080
9                     Signs  6.257825
0      Under the Tuscan Sun  6.228554
4                    Hawaii  6.162644
7     Turn Left, Turn Right  5.944609
11              April Story  5.942349
2              Henry & June  5.921226
8                  Amor.com  5.918065
5                  No Entry  5.867383


In [20]:
import pickle

In [27]:
pickle.dump(merged_df,open('movies.pkl','wb'))

In [23]:
from joblib import dump
dump(similarity, 'similarity___.npz')


['similarity___.npz']

In [24]:
recommend('The Prestige')

Night Moves
Nightmares
The Keeper
Dig Two Graves
Cold Heart
The Great Gatsby
Linsanity
Circumstance
Forbidden Photos of a Lady Above Suspicion
