In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df_credits=pd.read_csv('tmdb/tmdb_5000_credits.csv')
df_movies=pd.read_csv('tmdb/tmdb_5000_movies.csv')


In [3]:
df_movies=df_movies.drop(columns = ['homepage','budget','production_companies','production_countries','revenue','runtime','status','tagline','original_language','original_title'], axis = 1)
df_credits.columns=['id','title','cast','crew']
df_credits=df_credits.drop(columns=['title'])


In [4]:
df_movies=df_movies.merge(df_credits,on='id')
df_movies.head(1)

Unnamed: 0,genres,id,keywords,overview,popularity,release_date,spoken_languages,title,vote_average,vote_count,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",150.437577,2009-12-10,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Add imdb rating column 

In [5]:
# clac mean value
c=df_movies['vote_average'].mean()
# the minimum votes to display
m=df_movies['vote_count'].quantile(0.5)
print(c,m)

6.092171559442011 235.0


In [6]:
movies_list=df_movies.copy().loc[df_movies['vote_count']>=m]
movies_list.shape


(2407, 12)

In [8]:
def weighted_rating(x,m=m,c=c):
    v=x['vote_count']
    r=x['vote_average']
    return (v/(v+m)*r)+(m/(m+v)*c)

In [9]:
movies_list['score']=movies_list.apply(weighted_rating,axis=1)
movies_list=movies_list.sort_values('score',ascending=False)
movies_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2407 entries, 1881 to 1652
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genres            2407 non-null   object 
 1   id                2407 non-null   int64  
 2   keywords          2407 non-null   object 
 3   overview          2407 non-null   object 
 4   popularity        2407 non-null   float64
 5   release_date      2407 non-null   object 
 6   spoken_languages  2407 non-null   object 
 7   title             2407 non-null   object 
 8   vote_average      2407 non-null   float64
 9   vote_count        2407 non-null   int64  
 10  cast              2407 non-null   object 
 11  crew              2407 non-null   object 
 12  score             2407 non-null   float64
dtypes: float64(3), int64(2), object(8)
memory usage: 263.3+ KB


## Adjust columns format

In [10]:
from ast import literal_eval
features=['genres','keywords','spoken_languages','cast','crew']
for feature in features:
    movies_list[feature]=movies_list[feature].apply(literal_eval)

In [14]:
def get_director(x):
    for i in x:
        if i['job']=='Director':
            return i['name']
        return ''
def get_list(x):
    if isinstance(x,list):
        names=[i['name']for i in x]
        return names
    return []

In [15]:
movies_list['director']=movies_list['crew'].apply(get_director)
features=['genres','keywords','spoken_languages','cast']
for feature in features:
    movies_list[feature]=movies_list[feature].apply(get_list)


In [19]:
movies_list=movies_list.drop(columns=['overview','popularity','vote_average','vote_count','crew'])
movies_list.head(5)

Unnamed: 0,genres,id,keywords,release_date,spoken_languages,title,cast,score,director
1881,"[Drama, Crime]",278,"[prison, corruption, police brutality, prison ...",1994-09-23,[English],The Shawshank Redemption,"[Tim Robbins, Morgan Freeman, Bob Gunton, Clan...",8.432957,
3337,"[Drama, Crime]",238,"[italy, love at first sight, loss of father, p...",1972-03-14,"[English, Italiano, Latin]",The Godfather,"[Marlon Brando, Al Pacino, James Caan, Richard...",8.311498,Francis Ford Coppola
662,[Drama],550,"[support group, dual identity, nihilism, rage ...",1999-10-15,[English],Fight Club,"[Edward Norton, Brad Pitt, Meat Loaf, Jared Le...",8.246223,
3232,"[Thriller, Crime]",680,"[transporter, brothel, drug dealer, boxer, mas...",1994-10-08,"[English, Español, Français]",Pulp Fiction,"[John Travolta, Samuel L. Jackson, Uma Thurman...",8.240109,Quentin Tarantino
1818,"[Drama, History, War]",424,"[factory, concentration camp, hero, holocaust,...",1993-11-29,"[Deutsch, Polski, עִבְרִית, English]",Schindler's List,"[Liam Neeson, Ben Kingsley, Ralph Fiennes, Car...",8.186319,


# Clean strings

In [20]:
def clean(x):
    if isinstance(x,list):
        return [str.lower(i.replace(" ","")) for i in x]
    
features=['genres','keywords','spoken_languages','cast']
for feature in features:
    movies_list[feature]=movies_list[feature].apply(clean)
movies_list.head(1)

Unnamed: 0,genres,id,keywords,release_date,spoken_languages,title,cast,score,director
1881,"[drama, crime]",278,"[prison, corruption, policebrutality, prisonce...",1994-09-23,[english],The Shawshank Redemption,"[timrobbins, morganfreeman, bobgunton, clancyb...",8.432957,


# Combine features

In [23]:
def combine(row):
    return ' '.join(row['genres']) + ' ' +' '.join(row['keywords']) + ' ' +' '.join(row['spoken_languages']) + ' ' +' '.join(row['cast'])+' '+row['director']
movies_list['combined']=movies_list.apply(combine,axis=1)
movies_list['combined'].head(5)

1881    drama crime prison corruption policebrutality ...
3337    drama crime italy loveatfirstsight lossoffathe...
662     drama supportgroup dualidentity nihilism ragea...
3232    thriller crime transporter brothel drugdealer ...
1818    drama history war factory concentrationcamp he...
Name: combined, dtype: object

# Counter vector & similarity

In [33]:
count = CountVectorizer(stop_words='english')
count_mat=count.fit_transform(movies_list['combined'])

cosine_sim=cosine_similarity(count_mat,count_mat)
## reset indices
movies_list=movies_list.reset_index()
indices=pd.Series(movies_list.index,index=movies_list['title'])


# get recommendations

In [42]:
def get_rec(title,cosine_sim=cosine_sim):
    if title in movies_list.values :
        idx=indices[title]
        sim_score=list(enumerate(cosine_sim[idx]))
        sim_score=sorted(sim_score,key=lambda x:x[1],reverse =True)
        sim_score=sim_score[1:11]
        movies_indices=[i[0] for i in sim_score]
        return movies_list.iloc[movies_indices]
    return []

In [45]:
get_rec('Fight Club')['index']

738       45
19      1553
212      946
2084     224
288      421
954     1891
688     1512
880     3068
15      3057
355     3380
Name: index, dtype: int64

# Export the model


In [None]:
import pickle
