In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import nltk
from nltk.stem.snowball import SnowballStemmer
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Show every column when displaying wide frames. Use the exact option key:
# 'display.max.columns' only worked because pandas treats the name as a regex.
pd.set_option('display.max_columns', None)

In [None]:
# Single place to change when the dataset lives somewhere else
# (the default is the Google Drive mount used so far).
DATA_DIR = '/content/drive/MyDrive/Dataset/data'

metadata = pd.read_csv(f'{DATA_DIR}/movies_metadata_small.csv')
keywords = pd.read_csv(f'{DATA_DIR}/keywords_small.csv')
credits = pd.read_csv(f'{DATA_DIR}/credits_small.csv')

In [None]:
# Peek at the first two rows to see the available columns and how the
# nested fields (genres, production_companies, ...) are stored as strings.
metadata.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [None]:
# Count missing values per column to decide what to drop and what to fill.
metadata.isnull().sum()

adult                       0
belongs_to_collection    7683
budget                      0
genres                      0
homepage                 7252
id                          0
imdb_id                     2
original_language           0
original_title              0
overview                   18
popularity                  0
poster_path                 3
production_companies        0
production_countries        0
release_date                2
revenue                     0
runtime                     1
spoken_languages            0
status                      2
tagline                  2303
title                       0
video                       0
vote_average                0
vote_count                  0
dtype: int64

In [None]:
# Distribution of the `adult` flag, to judge whether the column is worth keeping.
metadata['adult'].value_counts()

False    9542
Name: adult, dtype: int64

In [None]:
# Distribution of the `video` flag, to judge whether the column is worth keeping.
metadata['video'].value_counts()

False    9536
True        6
Name: video, dtype: int64

In [None]:
# Most frequent budget values; a dominant 0 presumably means "unknown"
# rather than a real zero budget (assumption - confirm against the data source).
metadata['budget'].value_counts().head()

0           4613
20000000     166
30000000     161
15000000     160
25000000     152
Name: budget, dtype: int64

In [None]:
# Most frequent revenue values; a dominant 0.0 presumably means "unknown"
# rather than a real zero revenue (assumption - confirm against the data source).
metadata['revenue'].value_counts().head()

0.0           4538
10000000.0      10
12000000.0      10
11000000.0      10
7000000.0        9
Name: revenue, dtype: int64

In [None]:
# Show a few rows where the original title differs from the display title.
title_differs = metadata['original_title'].ne(metadata['title'])
metadata.loc[title_differs, ['original_title', 'title']].head(5)

Unnamed: 0,original_title,title
28,La Cité des Enfants Perdus,The City of Lost Children
29,摇啊摇，摇到外婆桥,Shanghai Triad
52,Il postino,The Postman
60,Gazon maudit,French Twist
65,Les misérables,Les Miserables


In [None]:
# Discard the columns that are not used by any later step of this notebook.
metadata.drop(
    [
        # flags / identifiers / links
        'adult', 'video', 'status', 'imdb_id', 'homepage', 'poster_path',
        # financial and numeric fields
        'budget', 'revenue', 'runtime', 'vote_count',
        # language and production details
        'original_language', 'original_title', 'spoken_languages',
        'production_companies', 'production_countries',
        'belongs_to_collection',
    ],
    axis='columns',
    inplace=True,
)

In [None]:
# Re-check missing values on the columns that remain after the drop.
metadata.isnull().sum()

genres             0
id                 0
overview          18
popularity         0
release_date       2
tagline         2303
title              0
vote_average       0
dtype: int64

In [None]:
# Inspect the rows that have no release date before deciding how to handle them.
metadata[metadata['release_date'].isna()]

Unnamed: 0,genres,id,overview,popularity,release_date,tagline,title,vote_average
7555,[],367647,Documentary Follow James Burke through the hi...,0.035294,,,Connections,9.0
9464,[],409926,Astronomer Dr. Carl Sagan is host and narrator...,0.282584,,,Cosmos,9.1


In [None]:
# 1. Drop rows without a release date (both are unpopular movies).
# 2. Keep only the year, taken from the first four characters of the date string.
# 3. Remove very old movies (release year must be after 1960).
# 4. Fill the remaining gaps (overview, tagline) with empty strings.
metadata = (
    metadata
    .dropna(subset=['release_date'])
    .assign(release_date=lambda df: df['release_date'].map(lambda date: int(date[:4])))
    .loc[lambda df: df['release_date'] > 1960]
    .reset_index(drop=True)
    .fillna("")
)
metadata.isnull().sum()

genres          0
id              0
overview        0
popularity      0
release_date    0
tagline         0
title           0
vote_average    0
dtype: int64

In [None]:
def get_genres(genres):
    """Return the `name` of every genre record, joined by single spaces.

    `genres` is a list of dicts that each carry a 'name' key.
    """
    names = []
    for entry in genres:
        names.append(entry['name'])
    return ' '.join(names)
def get_cast(cast):
    """Return up to the first seven cast names as single tokens.

    Spaces and hyphens are stripped from each name so that one person
    becomes exactly one token (e.g. 'Tom Hanks' -> 'TomHanks').
    """
    leading_members = cast[:7]
    tokens = []
    for member in leading_members:
        tokens.append(member['name'].replace(' ', '').replace('-', ''))
    return tokens
def get_director(crew):
    """Return the first crew member whose job is 'Director' as a one-item list.

    The name has spaces and hyphens removed. An empty list is returned
    when no crew member has that job.
    """
    first_director = next((member for member in crew if member['job'] == 'Director'), None)
    if first_director is None:
        return []
    return [first_director['name'].replace(' ', '').replace('-', '')]
def get_people(x):
    """Build the space-separated 'people' string for one movie row.

    Parameters
    ----------
    x : mapping with keys 'cast' (list of name tokens), 'director'
        (list of name tokens) and 'genres' (string of genre names).

    Returns
    -------
    str
        The cast tokens followed by any director token not already
        present. For rows whose genres contain 'Animation' only the
        first three cast members are used.

    The cast list is copied before directors are appended, so the
    caller's `x['cast']` is never modified.
    """
    is_animation = x['genres'].find('Animation') != -1
    # list(...) makes a fresh list in both branches; without it the
    # non-animation branch would alias (and mutate) the row's cast list.
    people = list(x['cast'][:3]) if is_animation else list(x['cast'])
    for d in x['director']:
        if d not in people:
            people.append(d)
    return ' '.join(people)
def get_keywords(keywords):
    """Join the 'name' field of each keyword record into one space-separated string."""
    return ' '.join(map(lambda record: record['name'], keywords))

In [None]:
# Each of these columns holds a Python-literal string; parse it with
# literal_eval and reduce it with the matching helper in a single pass.
metadata['genres'] = metadata['genres'].map(lambda raw: get_genres(literal_eval(raw)))
credits['cast'] = credits['cast'].map(lambda raw: get_cast(literal_eval(raw)))
credits['director'] = credits['crew'].map(lambda raw: get_director(literal_eval(raw)))
keywords['keywords'] = keywords['keywords'].map(lambda raw: get_keywords(literal_eval(raw)))

In [None]:
# Join cast/director and keywords onto the movie metadata by movie id.
# Each table is first reduced to one row per id (keeping the first
# occurrence): a repeated id on either side of an inner merge would
# multiply rows and the same movie would then appear several times
# in the feature matrix.
info = pd.merge(
    credits[['id', 'cast', 'director']].drop_duplicates(subset='id'),
    keywords.drop_duplicates(subset='id'),
    how='inner',
    on='id',
)
metadata = pd.merge(metadata.drop_duplicates(subset='id'), info, how='inner', on='id')
metadata.head(2)

Unnamed: 0,genres,id,overview,popularity,release_date,tagline,title,vote_average,cast,director,keywords
0,Animation Comedy Family,862,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995,,Toy Story,7.7,"[TomHanks, TimAllen, DonRickles, JimVarney, Wa...",[JohnLasseter],jealousy toy boy friendship friends rivalry bo...
1,Adventure Fantasy Family,8844,When siblings Judy and Peter discover an encha...,17.015539,1995,Roll the dice and unleash the excitement!,Jumanji,6.9,"[RobinWilliams, JonathanHyde, KirstenDunst, Br...",[JoeJohnston],board game disappearance based on children's b...


In [None]:
# Free-text bag for each movie: title, overview, tagline and keywords,
# separated by single spaces.
metadata['keys'] = (
    metadata['title'] + " " + metadata['overview'] + " "
    + metadata['tagline'] + " " + metadata['keywords']
)
# Cast and director tokens collapsed into one string per movie.
metadata['people'] = metadata.apply(get_people, axis=1)
kept_columns = ['id', 'title', 'people', 'genres', 'keys', 'popularity', 'release_date', 'vote_average']
metadata = metadata.loc[:, kept_columns].copy()
metadata.head(2)

Unnamed: 0,id,title,people,genres,keys,popularity,release_date,vote_average
0,862,Toy Story,TomHanks TimAllen DonRickles JohnLasseter,Animation Comedy Family,"Toy Story Led by Woody, Andy's toys live happi...",21.946943,1995,7.7
1,8844,Jumanji,RobinWilliams JonathanHyde KirstenDunst Bradle...,Adventure Fantasy Family,Jumanji When siblings Judy and Peter discover ...,17.015539,1995,6.9


In [None]:
stemmer = SnowballStemmer(language='english')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Set copy of the stop-word list for constant-time membership tests.
_stopword_set = frozenset(stopwords)
# Translation table that deletes every ASCII punctuation character.
_punctuation_table = str.maketrans('', '', string.punctuation)

def get_keys(x):
    """Normalise a free-text string into space-separated stemmed tokens.

    Steps: strip ASCII punctuation, split on whitespace, drop stop words,
    stem what is left with the Snowball stemmer.

    A token is dropped when either its lower-cased raw form or its stem
    is a stop word. Checking only the stem lets through stop words whose
    stem is not itself in the list (for example 'because' stems to
    'becaus').
    """
    kept = []
    for token in x.translate(_punctuation_table).split():
        stem = stemmer.stem(token)  # stem once and reuse it
        if token.lower() in _stopword_set or stem in _stopword_set:
            continue
        kept.append(stem)
    return ' '.join(kept)

metadata['keys'] = metadata['keys'].apply(get_keys)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Popularity: map to a uniform [0, 1] distribution, then shift and scale
# to [-0.25, 0.25]. The array is assigned directly so the result does not
# depend on `metadata` having a default 0..n-1 index.
transformer = QuantileTransformer(output_distribution='uniform')
metadata['popularity'] = (transformer.fit_transform(metadata[['popularity']]).ravel() - 0.5) / 2

# vote_average: treat 0 as missing (presumably "no votes" - confirm) and
# impute with the mean of the remaining values. Only this column is
# filled; a frame-wide fillna would also write the mean rating into any
# other column that happens to contain NaN.
metadata['vote_average'] = metadata['vote_average'].replace(0, np.nan)
mean_average = metadata['vote_average'].mean()
metadata['vote_average'] = metadata['vote_average'].fillna(mean_average)
# Scale by 1/20 and shift by -0.25, so a rating of 5 maps to 0 and a
# rating of 10 to 0.25.
metadata['vote_average'] = (metadata['vote_average'] / 20) - 0.25

# Most popular first; later cells rely on this order and on the fresh index.
metadata = metadata.sort_values('popularity', ascending=False).reset_index(drop=True)
metadata.head(2)

Unnamed: 0,id,title,people,genres,keys,popularity,release_date,vote_average
0,211672,Minions,SandraBullock JonHamm MichaelKeaton KyleBalda,Family Animation Adventure Comedy,minion minion stuart kevin bob recruit scarlet...,0.25,2015,0.07
1,297762,Wonder Woman,GalGadot ChrisPine RobinWright DannyHuston Dav...,Action Adventure Fantasy,wonder woman amazon princess come world man be...,0.249674,2017,0.11


In [None]:
# Column vector of movie ids, in the same row order as `metadata`.
movie_id = metadata[['id']].to_numpy()
# First occurrence of each title (row labels are kept).
movie_title = metadata['title'].drop_duplicates(keep='first')
# title -> row label of that first occurrence
movie_ind = {title: row_label for row_label, title in movie_title.items()}

In [None]:
# People tokens: raw counts, kept only if the token occurs in at least
# 5 movies, then halved.
obj_cnt = CountVectorizer(min_df=5)  # lowercase=True, analyzer='word' are the defaults
cnt_val = 0.5 * obj_cnt.fit_transform(metadata['people']).toarray()

# Genre words: TF-IDF weights divided by 3.
obj_genre = TfidfVectorizer()
genre_val = np.divide(obj_genre.fit_transform(metadata['genres']).toarray(), 3)

# Free-text stems: TF-IDF weights, kept only if the stem occurs in at
# least 10 movies.
obj_tfidf = TfidfVectorizer(min_df=10)
tfidf_val = obj_tfidf.fit_transform(metadata['keys']).toarray()

# Scaled rating as an (n, 1) column.
num_val = metadata['vote_average'].to_numpy().reshape(-1, 1)

tuple(block.shape for block in (movie_id, num_val, cnt_val, genre_val, tfidf_val))

((8849, 1), (8849, 1), (8849, 3119), (8849, 22), (8849, 4696))

In [None]:
# Column labels in the same order as the horizontally stacked feature blocks.
col_names = [
    'vote_average',
    *obj_cnt.get_feature_names_out(),
    *obj_genre.get_feature_names_out(),
    *obj_tfidf.get_feature_names_out(),
]
# One row per movie: rating | people counts | genre tf-idf | text tf-idf.
features = np.hstack((num_val, cnt_val, genre_val, tfidf_val))
# Pairwise cosine similarity between all movie rows.
similarity = cosine_similarity(features)
len(col_names), features.shape, similarity.shape

(7838, (8849, 7838), (8849, 8849))

In [None]:
def get_similar_from_title(movie_name, n_candidates=20, top_n=10):
    """Recommend movies similar to `movie_name`.

    Parameters
    ----------
    movie_name : str
        Title to look up in `movie_ind`.
    n_candidates : int, default 20
        Number of most-similar movies taken from the similarity matrix
        before re-ranking.
    top_n : int, default 10
        Maximum number of titles returned.

    Returns
    -------
    pandas.DataFrame or None
        A one-column frame ('title') of at most `top_n` rows, ordered by
        `score + (popularity + vote_average) / 2` descending and limited
        to candidates whose scaled popularity and scaled vote_average are
        both positive. None (after printing a hint) when the title is
        not known.
    """
    try:
        index = movie_ind[movie_name]
    except (KeyError, TypeError):
        # Only an unknown (or unhashable) title is reported this way; any
        # other failure is a real bug and is allowed to propagate.
        print("Please enter correct Movie name")
        return None
    print(f"given movie index : {index}")

    row = similarity[index]
    # Stable descending order: ties keep their original row order.
    order = np.argsort(-row, kind='stable')
    # Remove the queried movie explicitly instead of assuming it sorts first.
    candidates = order[order != index][:n_candidates]

    df = metadata[['title', 'popularity', 'vote_average']].iloc[candidates].copy()
    df['score'] = row[candidates]
    df['total'] = df['score'] + (df['popularity'] + df['vote_average']) / 2
    df = df.sort_values('total', ascending=False)
    df = df[(df['popularity'] > 0) & (df['vote_average'] > 0)]
    return df[['title']].head(top_n)

In [None]:
# Example query: recommendations for 'The Avengers'.
get_similar_from_title('The Avengers')

given movie index : 17


Unnamed: 0,title
51,Avengers: Age of Ultron
12,Captain America: Civil War
287,Captain America: The Winter Soldier
27,Thor: Ragnarok
62,Thor: The Dark World
98,Thor
272,Iron Man 2
173,Iron Man
147,Iron Man 3
256,Captain America: The First Avenger


In [None]:
# Example query: recommendations for 'Batman Begins'.
get_similar_from_title('Batman Begins')

given movie index : 103


Unnamed: 0,title
15,The Dark Knight
205,The Dark Knight Rises
385,The Prestige
100,Inception
73,Interstellar
1324,"Batman: The Dark Knight Returns, Part 1"
1027,Quest for Camelot
1514,Zulu
611,A Walk Among the Tombstones
2208,Batman: Gotham Knight


In [None]:
# Example query: recommendations for 'Forrest Gump'.
get_similar_from_title('Forrest Gump')

given movie index : 34


Unnamed: 0,title
186,Cast Away
285,Apollo 13
732,The Polar Express
839,White Oleander
931,Philadelphia
1519,Fences
2180,Sleepless in Seattle
2177,You've Got Mail
2309,Uncommon Valor
2245,The Assassination of Richard Nixon


In [None]:
def why_two_movie_are_similar(m1, m2):
    """List the features that two movies share most strongly.

    `m1` and `m2` are row positions in `features`. The two rows are
    multiplied element-wise, and the ten largest products are returned as
    (column name, product) pairs in descending order, skipping any
    product that is not positive.
    """
    contributions = features[m1] * features[m2]
    ranked = sorted(enumerate(contributions), key=lambda pair: pair[1], reverse=True)
    shared = []
    for position, weight in ranked[:10]:
        if weight > 0:
            shared.append((col_names[position], weight))
    return shared

In [None]:
# Explain the match between rows 103 and 205: 103 is the index printed
# for 'Batman Begins', 205 the row label shown for 'The Dark Knight Rises'.
why_two_movie_are_similar(103,205)

[('christianbale', 0.25),
 ('christophernolan', 0.25),
 ('garyoldman', 0.25),
 ('michaelcaine', 0.25),
 ('gotham', 0.10157255609732047),
 ('batman', 0.09154618584915034),
 ('crime', 0.04671533885462725),
 ('crime', 0.03406942314834019),
 ('action', 0.03331964684595626),
 ('citi', 0.02951014336595438)]