In [25]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [26]:
main_df = pd.read_csv('Data/TMDB_movie_dataset_v11.csv')

In [27]:
df = main_df[main_df['vote_average']!=0]
df.reset_index(inplace=True)

In [28]:
df.shape

(349257, 25)

In [29]:
df.columns

Index(['index', 'id', 'title', 'vote_average', 'vote_count', 'status',
       'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path',
       'budget', 'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [31]:
df = df.drop( ['id' , 'vote_count' , 'status' ,  'revenue' , 'backdrop_path',
              'budget','homepage','imdb_id','original_title' , 'overview','poster_path',
              'tagline' , 'production_companies','production_countries' ,'spoken_languages' ,'keywords'], axis=1)

In [32]:
df['org_title']=df['title']

In [33]:
df.isna().sum()

index                    0
title                    0
vote_average             0
release_date          6872
runtime                  0
adult                    0
original_language        0
popularity               0
genres               59073
org_title                0
dtype: int64

In [34]:
df['genres'] = df['genres'].fillna('unknown')


In [36]:
df = df.dropna()

In [37]:
df.isna().sum()

index                0
title                0
vote_average         0
release_date         0
runtime              0
adult                0
original_language    0
popularity           0
genres               0
org_title            0
dtype: int64

In [38]:
df.duplicated().sum()

0

In [39]:
dff= df.copy()

In [40]:
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

genre_l = dff['genres'].apply(lambda x: x.split(','))
genre_l = pd.DataFrame(genre_l)

In [41]:
genre_l['genres'] = genre_l['genres'].apply(lambda x :[ y.strip().lower().replace(' ','') for y in x] )

In [42]:
MLB = MultiLabelBinarizer()

genre_encoded = MLB.fit_transform(genre_l['genres'])

In [43]:
genre_encoded

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [44]:
genre_encoded_df = pd.DataFrame(genre_encoded, columns=MLB.classes_)
genre_encoded_df=genre_encoded_df.reset_index()

In [45]:
mod_df = dff.drop(['genres'],axis=1)
mod_df=mod_df.reset_index()

In [46]:
df = pd.concat([mod_df,genre_encoded_df],axis=1).drop('index',axis=1)

In [47]:
df.head()

Unnamed: 0,level_0,title,vote_average,release_date,runtime,adult,original_language,popularity,org_title,action,...,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,unknown,war,western
0,0,Inception,8.364,2010-07-15,148,False,en,83.952,Inception,1,...,0,0,0,0,1,0,0,0,0,0
1,1,Interstellar,8.417,2014-11-05,169,False,en,140.241,Interstellar,0,...,0,0,0,0,1,0,0,0,0,0
2,2,The Dark Knight,8.512,2008-07-16,152,False,en,130.643,The Dark Knight,1,...,0,0,0,0,0,1,0,0,0,0
3,3,Avatar,7.573,2009-12-15,162,False,en,79.932,Avatar,1,...,0,0,0,0,1,0,0,0,0,0
4,4,The Avengers,7.71,2012-04-25,143,False,en,98.082,The Avengers,1,...,0,0,0,0,1,0,0,0,0,0


In [48]:
df['title'] = df['title'].apply(lambda x :x.strip().lower().replace(' ','') )
df['original_language'] = df['original_language'].apply(lambda x :x.strip().lower().replace(' ','') )

In [49]:
df.loc[~( (df['original_language']=='en')|(df['original_language']=='fr')|(df['original_language']=='es')|(df['original_language']=='de')|(df['original_language']=='ja')),'original_language'] = 'else'

In [50]:
OHE = OneHotEncoder(sparse_output=False)

In [51]:
df['adult'] = df['adult'].astype('str')
adult_enc = OHE.fit_transform(df[['adult']])
adult_enc_df = pd.DataFrame(adult_enc,columns=OHE.get_feature_names_out())

In [52]:
adult_enc_df = adult_enc_df.drop('adult_True',axis=1)


In [53]:
lang_enc = OHE.fit_transform(df[['original_language']])
lang_enc_df = pd.DataFrame(lang_enc,columns=OHE.get_feature_names_out())

In [54]:
mod_df = df.drop(['adult','original_language'],axis=1)

In [55]:
df = pd.concat([mod_df,adult_enc_df,lang_enc_df],axis=1)

In [58]:
df.head()

Unnamed: 0,level_0,title,vote_average,runtime,popularity,org_title,action,adventure,animation,comedy,...,war,western,adult_False,original_language_de,original_language_else,original_language_en,original_language_es,original_language_fr,original_language_ja,release_year
0,0,inception,8.364,148,83.952,Inception,1,1,0,0,...,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2010
1,1,interstellar,8.417,169,140.241,Interstellar,0,1,0,0,...,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2014
2,2,thedarkknight,8.512,152,130.643,The Dark Knight,1,0,0,0,...,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2008
3,3,avatar,7.573,162,79.932,Avatar,1,1,0,0,...,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2009
4,4,theavengers,7.71,143,98.082,The Avengers,1,1,0,0,...,0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2012


In [57]:
# Converting 'release_date' to a datetime object and extract 
df['release_year'] = pd.to_datetime(df['release_date'], errors='coerce').dt.year
df.drop(columns=['release_date'], inplace=True)

In [59]:
from sklearn.preprocessing import StandardScaler
SC = StandardScaler()
df_norm = SC.fit_transform(df.drop(['title','org_title'],axis=1))
df_norm_df = pd.DataFrame(df_norm, columns=[x for x in df.columns if x not in ['title', 'org_title']])
df = pd.concat([df[['title','org_title']],df_norm_df],axis=1)
df.head()

Unnamed: 0,title,org_title,level_0,vote_average,runtime,popularity,action,adventure,animation,comedy,...,war,western,adult_False,original_language_de,original_language_else,original_language_en,original_language_es,original_language_fr,original_language_ja,release_year
0,inception,Inception,-1.716374,1.186226,1.183474,5.995571,3.441525,4.767489,-0.25647,-0.52945,...,-0.133758,-0.115183,0.244813,-0.214907,-0.591061,0.966384,-0.273582,-0.275403,-0.20339,0.47858
1,interstellar,Interstellar,-1.716364,1.213559,1.507313,10.146431,-0.290569,4.767489,-0.25647,-0.52945,...,-0.133758,-0.115183,0.244813,-0.214907,-0.591061,0.966384,-0.273582,-0.275403,-0.20339,0.629484
2,thedarkknight,The Dark Knight,-1.716354,1.262553,1.245158,9.438656,3.441525,-0.209754,-0.25647,-0.52945,...,-0.133758,-0.115183,0.244813,-0.214907,-0.591061,0.966384,-0.273582,-0.275403,-0.20339,0.403128
3,avatar,Avatar,-1.716344,0.778291,1.399367,5.699128,3.441525,4.767489,-0.25647,-0.52945,...,-0.133758,-0.115183,0.244813,-0.214907,-0.591061,0.966384,-0.273582,-0.275403,-0.20339,0.440854
4,theavengers,The Avengers,-1.716334,0.848945,1.106369,7.037545,3.441525,4.767489,-0.25647,-0.52945,...,-0.133758,-0.115183,0.244813,-0.214907,-0.591061,0.966384,-0.273582,-0.275403,-0.20339,0.554032


In [60]:
df = df.drop_duplicates(subset=['title'])
df=df.set_index(['title'])
df_fin=df.drop('org_title',axis=1)

In [61]:
movie_name = 'the dark knight'
movie_name=movie_name.strip().lower().replace(' ','')
new_df= df_fin.loc[[movie_name]]
new_df = new_df.values.reshape(1,-1)
from sklearn.metrics.pairwise import cosine_similarity
df_other = df_fin.loc[df_fin.index!=movie_name,:]
df_titles = df.loc[df.index!=movie_name,'org_title']
cosine_sim_matrix = cosine_similarity(new_df,df_other)
cosine_sim_df = pd.DataFrame(cosine_sim_matrix,index=[movie_name],columns=df_titles)

cosine_sim_df

org_title,Inception,Interstellar,Avatar,The Avengers,Deadpool,Avengers: Infinity War,Fight Club,Guardians of the Galaxy,Pulp Fiction,Forrest Gump,...,Stævnemøde,Kirby at War: La Guerre De Kirby,Rathnan Prapancha,Iron Deficiency,AVP The Monster Hydro Cup Day 2-1: Men’s Semi-Final 1 - Dalhausser and Lucena vs Tr. Crabb and Bourne,Hollow Water,Prison Circle,"Présidentielle, une épreuve d'artistes",Z Argentiny do Mexika,Don Gio
thedarkknight,0.590218,0.673101,0.502956,0.630106,0.627943,0.732171,0.777205,0.353011,0.885664,0.697927,...,-0.238872,-0.063554,-0.048936,-0.179562,-0.164487,-0.135371,-0.148806,-0.152222,-0.235339,-0.150057


In [62]:
sorted_row = cosine_sim_df.loc[movie_name].sort_values(ascending=False)[0:20]

In [64]:
sorted_row.index[:5]

Index(['Muzzle', 'The Equalizer', 'The Dark Knight Rises',
       'John Wick: Chapter 2', 'John Wick: Chapter 3 - Parabellum'],
      dtype='object', name='org_title')