In [None]:
import tensorflow_hub as hub
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [None]:
# Load pre-trained universal sentence encoder model (version 4) from TF Hub by URL.
# The returned callable `embed` is applied to the movie overviews further down.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
# Read the raw movie metadata. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
# NOTE(review): absolute Colab/Drive path — assumes Drive is mounted at /content/drive.
movies = pd.read_csv('/content/drive/MyDrive/movie_dataset/movies_metadata.csv', low_memory=False)

In [None]:
# Remove the three rows known to be damaged, then store ids as 64-bit integers.
damaged_rows = [19730, 29503, 35587]
movies = movies.drop(index=damaged_rows)
movies['id'] = movies['id'].astype('int64')

In [None]:
# Count missing values in each column.
movies.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   3
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      3
runtime                    260
spoken_languages             3
status                      84
tagline                  25051
title                        3
video                        3
vote_average                 3
vote_count                   3
dtype: int64

In [None]:
# Keep only the rows that actually have an overview text.
movies = movies.loc[movies['overview'].notna()]

In [None]:
# When several rows share the same overview, keep only the first occurrence.
movies = movies.loc[~movies.duplicated(subset='overview')]

In [None]:
# Drop overviews of 50 characters or fewer.
movies = movies.loc[movies['overview'].str.len().gt(50)]

In [None]:
# Keep non-adult titles; the column holds the literal string 'False', not a boolean.
movies = movies.loc[movies['adult'].eq('False')]

In [None]:
# Replace the index left over from filtering with a fresh 0..n-1 range (old labels are discarded).
movies = movies.reset_index(drop=True)

In [None]:
# Encode every overview with the sentence encoder; the bare `embeddings` line displays the result.
# NOTE(review): the whole column is passed in one call — confirm this fits in memory on the target runtime.
embeddings = embed(movies['overview'])
embeddings

<tf.Tensor: shape=(43911, 512), dtype=float32, numpy=
array([[ 0.0623784 , -0.00021258, -0.03214182, ..., -0.02816204,
         0.06704462,  0.03110415],
       [ 0.01738841, -0.040174  ,  0.00191935, ..., -0.00705751,
         0.08162364,  0.03460284],
       [ 0.00079044,  0.03408437, -0.03159757, ..., -0.02167954,
         0.01332099,  0.0119028 ],
       ...,
       [ 0.00288407,  0.02441899,  0.00214455, ..., -0.050444  ,
         0.0576336 , -0.02168424],
       [-0.03068469,  0.03163631, -0.0301782 , ..., -0.04808713,
         0.04038424,  0.03479053],
       [ 0.06090392,  0.00747778,  0.0035212 , ..., -0.04336387,
        -0.00491572,  0.02921174]], dtype=float32)>

In [None]:
# Cluster the overview embeddings into k groups with k-means.
k = 14
# random_state pins the k-means++ initialisation. Without it the cluster ids, and
# therefore the cluster/score sort order produced further down, differ on every run,
# so the exported dataset cannot be reproduced.
model = KMeans(n_clusters=k, init='k-means++', max_iter=10000, n_init=20, random_state=42)
model.fit(embeddings)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=10000,
       n_clusters=14, n_init=20, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [None]:
# Attach each movie's k-means cluster label as a new 'cluster' column.
movies = movies.assign(cluster=model.labels_)

# IMDB weighted average rating

In [None]:
# Calculate mean of vote average column
# (C is the value the weighted rating falls back towards for movies with few votes)
C = movies['vote_average'].mean()
print(C)

# Calculate the minimum number of votes required to be in the chart, m
# (taken here as the 90th percentile of vote_count)
m = movies['vote_count'].quantile(0.90)
print(m)

# IMDB-style weighted rating
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own average vote with the overall mean vote.

    x -- a row (or frame) exposing 'vote_count' and 'vote_average'
    m -- vote-count threshold; the larger it is, the more weight C gets
    C -- mean vote used as the fallback value

    Returns vote_count/(vote_count+m) * vote_average + m/(vote_count+m) * C.
    """
    votes, rating = x['vote_count'], x['vote_average']
    total = votes + m
    return (votes / total) * rating + (m / total) * C

5.65314400236834
168.0


In [None]:
# weighted_rating only uses column look-ups and arithmetic, so it can be applied to the
# whole frame in one vectorised call instead of once per row via apply(axis=1).
# The per-element arithmetic is unchanged; only the Python-level row loop is gone.
movies['score'] = weighted_rating(movies)

# Sort movies and export final dataset

In [None]:
# sort movies by cluster number (ascending), then by score (highest first) within each cluster
movies = movies.sort_values(by=['cluster','score'], ascending=[True, False])

In [None]:
# Spot check: show every row whose title is exactly 'The Dark Knight'.
movies[movies.title == 'The Dark Knight']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cluster,score
12388,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",http://thedarkknight.warnerbros.com/dvdsite/,155,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,123.167259,/1hRoyzDtpgMU7Dz4JF22RANzQO7.jpg,"[{'name': 'DC Comics', 'id': 429}, {'name': 'L...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2008-07-16,1004558000.0,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Why So Serious?,The Dark Knight,False,8.3,12269.0,2,8.264246
28152,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,72003,tt2258647,en,The Dark Knight,In a post-apocalyptic world ravaged by feuding...,0.077992,/kyjTDE5vldkUpJGErAvqYY6J92M.jpg,[],[],2011-07-11,0.0,86.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Dark Knight,False,6.3,2.0,10,5.660754


In [None]:
# Save the clustered, scored table to the working directory; the index is written as an unnamed first column.
# NOTE(review): the next section reads '/content/drive/MyDrive/movies.csv', not this path — confirm how the file gets there.
movies.to_csv('movies.csv')

# Process and filter data

In [None]:
from ast import literal_eval
from numpy import NaN

In [None]:
# Reload the clustered movies table and load the keywords table from Drive.
# NOTE(review): the export cell above writes 'movies.csv' to the working directory, not to this
# Drive path — confirm the Drive copy is the file produced by that cell.
movies = pd.read_csv('/content/drive/MyDrive/movies.csv', low_memory=False)
keywords = pd.read_csv('/content/drive/MyDrive/movie_dataset/keywords.csv')

In [None]:
# The CSV stores these columns as Python-literal text; parse each cell back into Python objects.
movies['genres'] = movies['genres'].map(literal_eval)
keywords['keywords'] = keywords['keywords'].map(literal_eval)

In [None]:
def dict_to_list(dict_):
  """Return the lower-cased 'name' value of every mapping in `dict_`, in order."""
  names = []
  for entry in dict_:
    names.append(entry['name'].lower())
  return names

In [None]:
# Reduce each parsed list of {'id', 'name'} records to a plain list of lower-cased names.
# Not idempotent: after one run the cells hold lists of strings, so re-running this cell fails.
movies['genres'] = movies['genres'].apply(dict_to_list)
keywords['keywords'] = keywords['keywords'].apply(dict_to_list)

In [None]:
# Row and column count of the movies table before joining.
movies.shape

(43911, 27)

In [None]:
# join movies & keywords on the TMDB id
# DataFrame.join(other, on='id') matches movies['id'] against the *index* of `other`.
# With keywords left on its default 0..n-1 index, each movie id was compared with a row
# position, so movies received the keywords of unrelated films (or none at all).
# Indexing keywords by its own 'id' column makes the lookup a real id-to-id match.
# drop_duplicates guards against repeated ids, which would otherwise duplicate movie rows.
# The old 'id_k' helper column is no longer produced; nothing after this cell uses it.
keywords_by_id = keywords.drop_duplicates(subset='id').set_index('id')
merged = movies.join(keywords_by_id, on='id')

In [None]:
# Preview the first rows of the joined table.
merged.head()

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cluster,score,id_k,keywords
0,1182,False,,16400000,[drama],,207,tt0097165,en,Dead Poets Society,"At an elite, old-fashioned boarding school in ...",19.905716,/3Ri2GReavqSHqWemlP6HYn8i2P9.jpg,"[{'name': 'Touchstone Pictures', 'id': 9195}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",1989-06-02,235860116.0,129.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,He was their inspiration. He made their lives ...,Dead Poets Society,False,8.1,2786.0,0,7.960842,65796.0,"[opium, loss of eyesight, biography, sioux, wi..."
1,22472,False,,30000000,"[comedy, drama]",,120467,tt2278388,en,The Grand Budapest Hotel,The Grand Budapest Hotel tells of a legendary ...,14.442048,/nX5XotM9yprCKarRH4fzOq1VM1J.jpg,"[{'name': 'Fox Searchlight Pictures', 'id': 43...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2014-02-26,174600318.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A perfect holiday without leaving home.,The Grand Budapest Hotel,False,8.0,4644.0,0,7.918065,,
2,11233,False,,90000000,"[drama, thriller, crime]",http://thedeparted.warnerbros.com/,1422,tt0407887,en,The Departed,"To take down South Boston's Irish Mafia, the p...",18.515448,/tGLO9zw5ZtCeyyEWgbYGgsFxC6i.jpg,"[{'name': 'Vertigo Entertainment', 'id': 829},...","[{'iso_3166_1': 'HK', 'name': 'Hong Kong'}, {'...",2006-10-05,289847354.0,151.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Lies. Betrayal. Sacrifice. How far will you ta...,The Departed,False,7.9,4455.0,0,7.818349,11844.0,"[male friendship, van, snack bar, soccer, unem..."
3,2708,False,,15000000,[drama],http://www.dreamworks.com/ab/,14,tt0169547,en,American Beauty,"Lester Burnham, a depressed suburban father in...",20.726578,/or1MP8BZIAjqWYxPdPX724ydKar.jpg,"[{'name': 'DreamWorks SKG', 'id': 27}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1999-09-15,356296601.0,122.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Look closer.,American Beauty,False,7.9,3438.0,0,7.795321,1408.0,"[exotic island, treasure, map, ship, scalp, pi..."
4,722,False,,1800000,"[drama, comedy, war]",,935,tt0057012,en,Dr. Strangelove or: How I Learned to Stop Worr...,Insane General Jack D. Ripper initiates a nucl...,9.80398,/tviJ68Wj4glQk3CPMvdvExYHxX.jpg,"[{'name': 'Hawk Films', 'id': 88}, {'name': 'C...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",1964-01-29,9440272.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The hot-line suspense comedy,Dr. Strangelove or: How I Learned to Stop Worr...,False,8.0,1472.0,0,7.75959,26378.0,"[detective, arizona, ranch, boxer, boxing matc..."


In [None]:
# List the columns available after the join.
merged.columns

Index(['Unnamed: 0', 'adult', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cluster', 'score', 'id_k', 'keywords'],
      dtype='object')

In [None]:
# change nan to empty list []
# Test for "is a list" instead of `k is NaN`: an identity check only works when the missing
# marker is the very same object as the imported NaN, which is not guaranteed for every
# float NaN. The isinstance test also removes this cell's dependence on the `NaN` name
# (an alias newer NumPy releases no longer export).
merged['keywords'] = merged['keywords'].apply(lambda k: k if isinstance(k, list) else [])

In [None]:
# Keep only the columns needed in the exported dataset, in this order.
keep_columns = [
    'id', 'imdb_id', 'title', 'release_date', 'score',
    'overview', 'genres', 'keywords', 'cluster',
]
merged = merged[keep_columns]

In [None]:
# Rename 'id' to 'tmdb_id' so it is clearly distinguished from 'imdb_id'.
merged = merged.rename(columns={'id': 'tmdb_id'})

In [None]:
# Parse release_date strings into datetime64 values; missing dates become NaT.
merged['release_date'] = pd.to_datetime(merged['release_date'])

### Convert genres & keywords lists to pipe-delimited strings

In [None]:
# Collapse each list of names into one '|'-separated string (an empty list becomes '').
pipe_join = '|'.join
merged['genres'] = merged['genres'].map(pipe_join)
merged['keywords'] = merged['keywords'].map(pipe_join)
merged.head()

Unnamed: 0,tmdb_id,imdb_id,title,release_date,score,overview,genres,keywords,cluster
0,207,tt0097165,Dead Poets Society,1989-06-02,7.960842,"At an elite, old-fashioned boarding school in ...",drama,opium|loss of eyesight|biography|sioux|wild bi...,0
1,120467,tt2278388,The Grand Budapest Hotel,2014-02-26,7.918065,The Grand Budapest Hotel tells of a legendary ...,comedy|drama,,0
2,1422,tt0407887,The Departed,2006-10-05,7.818349,"To take down South Boston's Irish Mafia, the p...",drama|thriller|crime,male friendship|van|snack bar|soccer|unemployment,0
3,14,tt0169547,American Beauty,1999-09-15,7.795321,"Lester Burnham, a depressed suburban father in...",drama,exotic island|treasure|map|ship|scalp|pirate,0
4,935,tt0057012,Dr. Strangelove or: How I Learned to Stop Worr...,1964-01-29,7.75959,Insane General Jack D. Ripper initiates a nucl...,drama|comedy|war,detective|arizona|ranch|boxer|boxing match|spo...,0


In [None]:
# Count missing values per column in the trimmed table.
merged.isna().sum()

tmdb_id          0
imdb_id         15
title            2
release_date    67
score            2
overview         0
genres           0
keywords         0
cluster          0
dtype: int64

In [None]:
# Show the rows that have no title.
merged[merged['title'].isna()]

Unnamed: 0,tmdb_id,imdb_id,title,release_date,score,overview,genres,keywords,cluster
22048,82663,tt0113002,,NaT,,British soldiers force a recently captured IRA...,action|thriller|drama,,6
32494,249260,tt2622826,,NaT,,A group of skiers are terrorized during spring...,tv movie|action|horror|science fiction,,10


In [None]:
# Fill in the two missing titles, locating each movie by its tmdb_id instead of a row label.
# Row labels come from the cluster/score sort order, which changes whenever clustering is
# re-run, so a hard-coded label could silently overwrite the title of a different movie.
merged.loc[merged['tmdb_id'] == 82663, 'title'] = 'Midnight Man'
merged.loc[merged['tmdb_id'] == 249260, 'title'] = 'Avalanche Sharks'

In [None]:
# Summarise dtypes and non-null counts to confirm the title fix.
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43911 entries, 0 to 43910
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   tmdb_id       43911 non-null  int64         
 1   imdb_id       43896 non-null  object        
 2   title         43911 non-null  object        
 3   release_date  43844 non-null  datetime64[ns]
 4   score         43909 non-null  float64       
 5   overview      43911 non-null  object        
 6   genres        43911 non-null  object        
 7   keywords      43911 non-null  object        
 8   cluster       43911 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 3.0+ MB


In [None]:
# Number of movies in each cluster, largest first.
merged['cluster'].value_counts()

12    4364
1     4177
13    4130
4     3833
6     3330
0     3281
11    2922
8     2911
2     2712
3     2637
9     2572
7     2551
10    2412
5     2079
Name: cluster, dtype: int64

In [None]:
# Ensure a clean 0..n-1 index before export (old labels are discarded).
merged = merged.reset_index(drop=True)

In [None]:
# Write the final table. This reuses the file name 'movies.csv' that the earlier export cell
# used for the intermediate table, so that file is overwritten in the working directory.
# NOTE(review): the index is written as an unnamed first column — confirm consumers expect it.
merged.to_csv('movies.csv')