In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

pd.options.display.max_columns = None
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer #For tf-idf feature matrix
from sklearn.metrics.pairwise import linear_kernel #For pairwise product of two matrices

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Colab-only helper used to mount Google Drive inside the runtime.
from google.colab import drive
# Mount Drive at /content/drive. NOTE(review): the data path used later
# ('drive/MyDrive/rs_data/') is relative, so it presumably relies on the
# working directory being /content - confirm.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# English stop words from NLTK (used later when cleaning the titles)
stop = stopwords.words('english')

# Folder, relative to the working directory, that holds the TSV data files
path = 'drive/MyDrive/rs_data/'

# Read the four tab-separated tables with identical parser settings
name_basics, title_basics, title_ratings, title_crew = (
    pd.read_csv(path + filename, sep='\t', low_memory=False)
    for filename in (
        'name_basics.tsv',
        'title_basics.tsv',
        'title_ratings.tsv',
        'title_crew.tsv',
    )
)

In [None]:
# Preview the first five rows of the people table
name_basics.head(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0072308,tt0031983,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0038355,tt0037382,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0056404,tt0057345,tt0049189,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0078723,tt0080455,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0060827,tt0050986,tt0050976"


In [None]:
# Preview the first five rows of the titles table
title_basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [None]:
# Preview the first five rows of the ratings table
title_ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1702
1,tt0000002,6.1,210
2,tt0000003,6.5,1458
3,tt0000004,6.2,123
4,tt0000005,6.2,2260


In [None]:
# Preview the first five rows of the crew table
title_crew.head(n=5)

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [None]:
# Preprocessing data
#The raw files mark unknown values with the literal string '\N'; turn them into NaN
name_basics.replace(to_replace = r'\N', value = np.nan, inplace = True)
title_basics.replace(to_replace = r'\N', value = np.nan, inplace = True)
title_ratings.replace(to_replace = r'\N', value = np.nan, inplace = True)
title_crew.replace(to_replace = r'\N', value = np.nan, inplace = True)

#Merging title_basics and title_ratings on tconst, then dropping the columns
#originalTitle, isAdult, endYear and runtimeMinutes from the merged frame
title_new = pd.merge(title_basics, title_ratings, on='tconst').drop(
    columns=['originalTitle','isAdult','endYear','runtimeMinutes']
)

#Dropping birthYear and deathYear from name_basics.
#errors='ignore' keeps this cell re-runnable: once the columns are gone, a
#second run would otherwise raise KeyError.
name_basics = name_basics.drop(columns=['birthYear','deathYear'], errors='ignore')

title_new.head()

Unnamed: 0,tconst,titleType,primaryTitle,startYear,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,1894,"Documentary,Short",5.7,1702
1,tt0000002,short,Le clown et ses chiens,1892,"Animation,Short",6.1,210
2,tt0000003,short,Pauvre Pierrot,1892,"Animation,Comedy,Romance",6.5,1458
3,tt0000004,short,Un bon bock,1892,"Animation,Short",6.2,123
4,tt0000005,short,Blacksmith Scene,1893,"Comedy,Short",6.2,2260


In [None]:
#Discard rows that have no startYear or no genres, then attach the
#directors/writers columns from title_crew by joining on tconst
title_new = (
    title_new
    .dropna(subset=['startYear', 'genres'])
    .merge(title_crew, on='tconst')
)

In [None]:
title_new['startYear'] = title_new['startYear'].astype(int)

#Popularity = rating relative to the mean rating + vote count relative to the mean vote count
title_new['Popularity'] = (
    title_new['averageRating'] / title_new['averageRating'].mean()
    + title_new['numVotes'] / title_new['numVotes'].mean()
)

#Retain the titles whose popularity is at least the mean value, and remove
#those whose titleType is tvEpisode or short.
#.copy() makes the result an independent frame, so the column assignments
#below do not trigger SettingWithCopyWarning.
title_new = title_new[
    (title_new['Popularity'] >= title_new['Popularity'].mean())
    & ~title_new['titleType'].isin(['tvEpisode', 'short'])
].copy()

#Converting primaryTitle to lowercase and joining the remaining words which are not stopwords.
#A missing title becomes '' so that .split() never sees NaN; the set gives
#constant-time stop-word lookups.
stop_words = set(stop)
title_new['primaryTitle'] = (
    title_new['primaryTitle']
    .fillna('')
    .str.lower()
    .apply(lambda title: ' '.join(word for word in title.split() if word not in stop_words))
)

#Creating tags column based on primaryTitle, genres, directors and writers.
#Missing parts are replaced by '' first: concatenating a string with NaN
#would turn the whole tag into NaN, and str(NaN) would add a bogus 'nan' token.
title_new['tags'] = (
    title_new['primaryTitle']
    + ',' + title_new['genres'].fillna('')
    + ',' + title_new['directors'].fillna('')
    + ',' + title_new['writers'].fillna('')
)

#Dropping the 4 source columns now that they are folded into tags
title_new = title_new.drop(columns=['primaryTitle','genres','directors','writers'])
print(title_new)
title_new.shape

            tconst titleType  startYear  averageRating  numVotes  Popularity  \
971      tt0002130     movie       1911            7.0      2451    3.533768   
1158     tt0002844     movie       1913            7.0      2049    3.120299   
1188     tt0003014     movie       1913            7.0      1045    2.087653   
1193     tt0003037     movie       1913            7.0      1410    2.463067   
1213     tt0003165     movie       1913            7.0      1097    2.141136   
...            ...       ...        ...            ...       ...         ...   
1128791  tt9900782     movie       2019            8.5     14004   15.633431   
1128810  tt9902160     movie       2020            7.0      1923    2.990704   
1128976  tt9908860  tvSeries       2019            7.1      2308    3.401157   
1129071  tt9911196     movie       2020            7.6      1787    2.937638   
1129264  tt9916362     movie       2020            6.3      3218    4.221368   

                                       

(43568, 7)

In [None]:
#Persist the preprocessed frame as a tab-separated file, without the row index
title_new.to_csv('newframe.tsv', sep='\t', index=False)
print('-----------------Preprocessing done------------------')

-----------------Preprocessing done------------------


In [None]:
def main(test_title):
    """Print the ten titles most similar to ``test_title``.

    Loads the preprocessed frame from 'newframe.tsv', keeps the titles with
    averageRating > 6.5 and startYear > 2000, builds tf-idf vectors from the
    'tags' column and ranks every candidate by the dot product of its vector
    with the vector of ``test_title``. A lookup table for the candidates is
    written to 'title_lookup.csv' and the result is printed by
    ``post_process``.

    Parameters
    ----------
    test_title : str
        tconst of the title to base the recommendations on.

    Raises
    ------
    KeyError
        If ``test_title`` is not among the candidate titles.
    """
    print('----------------Running the recommendation engine-----------------')
    df = pd.read_csv('newframe.tsv',sep='\t')

    #Candidate pool: titles with averageRating > 6.5 and startYear > 2000,
    #re-indexed 0..n-1 so row positions match the tf-idf matrix rows
    new_df = df[(df.averageRating > 6.5) & (df.startYear > 2000)].reset_index(drop=True)

    #tconst -> row position lookup
    titles = new_df['tconst']
    indices = pd.Series(new_df.index, index=titles)
    if test_title not in indices.index:
        raise KeyError(
            f"{test_title!r} is not among the candidate titles "
            "(averageRating > 6.5 and startYear > 2000)"
        )
    idx = indices[test_title]

    #Computing the TF_IDF matrix based on the term frequency in the 'tags' column.
    #min_df=1 keeps every term, exactly like the former min_df=0, but is also
    #accepted by scikit-learn versions that validate the parameter range.
    tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(new_df['tags'].values.astype('U'))

    #Only the query title's row of the similarity matrix is needed, so compute
    #just that row instead of the full n x n product.
    scores = linear_kernel(tfidf_matrix[idx:idx + 1], tfidf_matrix).ravel()

    #Best matches first; the stable sort keeps ties in row order. The query
    #title is removed explicitly rather than assuming it sorts first.
    ranking = np.argsort(-scores, kind='stable')
    movie_indices = ranking[ranking != idx][:10]
    recommendations = titles.iloc[movie_indices]

    # Creating lookup table for post_processing.
    # drop() returns a new frame here, which avoids the SettingWithCopyWarning
    # an in-place drop on a filtered slice produces.
    title_basics = pd.read_csv(path + 'title_basics.tsv',sep='\t', low_memory=False)
    title_lookup = title_basics.loc[title_basics['tconst'].isin(new_df['tconst'])].drop(
        columns=['originalTitle','isAdult','endYear','runtimeMinutes']
    )
    title_lookup.to_csv('title_lookup.csv',index=False)

    post_process(recommendations)

In [None]:
def post_process(recommendations):
    """Print the lookup-table details of every recommended title.

    Reads 'title_lookup.csv' from the working directory and, for each tconst
    in ``recommendations``, prints the matching row(s) without the tconst
    column.

    Parameters
    ----------
    recommendations : iterable of str
        tconst identifiers to look up.
    """
    title_lookup = pd.read_csv('title_lookup.csv',low_memory=False)

    #Printing the rows of recommendations
    for r in recommendations:
        # drop() returns a new frame; keep the result so the tconst column
        # is actually left out of the printed row
        row = title_lookup.loc[title_lookup['tconst']==r].drop(columns=['tconst'])
        print('\n')
        print(row)

In [None]:
# Ask for the tconst to base the recommendations on. Surrounding whitespace
# (e.g. from a copy-paste) is stripped so it cannot break the tconst lookup.
test = input('Enter tconst of title: ').strip()

main(test)


Enter tconst of title: tt0069049
----------------Running the recommendation engine-----------------
tconst
tt0069049        0
tt0118926        1
tt0118983        2
tt0120679        3
tt0120681        4
             ...  
tt9900092    15347
tt9900782    15348
tt9902160    15349
tt9908860    15350
tt9911196    15351
Length: 15352, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,




          tconst titleType primaryTitle  startYear genres
13038  tt6143422     movie    Dark Wind       2017  Drama


         tconst titleType            primaryTitle  startYear         genres
7502  tt1757050  tvSeries  In the Eye of the Wind       2008  Drama,History


         tconst     titleType                 primaryTitle  startYear genres
9656  tt2806646  tvMiniSeries  That Winter, the Wind Blows       2013  Drama


          tconst titleType primaryTitle  startYear               genres
12310  tt5362988     movie   Wind River       2017  Crime,Drama,Mystery


         tconst titleType                     primaryTitle  startYear  \
2670  tt0460989     movie  The Wind that Shakes the Barley       2006   

         genres  
2670  Drama,War  


         tconst titleType       primaryTitle  startYear       genres
6400  tt1426374     movie  The Wind Journeys       2009  Drama,Music


         tconst titleType    primaryTitle  startYear  \
8163  tt2013293     movie  The Wind Rises  