In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.gaussian_process.kernels import RBF
from ast import literal_eval
import warnings; warnings.simplefilter('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Read every raw CSV export from the working directory.
mf = pd.read_csv('movies_metadata.csv')
rsf = pd.read_csv('ratings_small.csv')
kf = pd.read_csv('keywords.csv')
cf = pd.read_csv('credits.csv')
rf = pd.read_csv("ratings.csv")

# From the small links table keep only the non-null tmdbId values (a Series).
lsf = pd.read_csv('links_small.csv')
lsf = lsf.loc[lsf['tmdbId'].notnull(), 'tmdbId']

## Data Cleaning

In [3]:
# Discard metadata columns that are not used further down.
mf = mf.drop(
    columns=['belongs_to_collection', 'tagline', 'homepage', 'original_title', 'poster_path']
)
# Remove three rows by index label.
# NOTE(review): presumably the rows whose `id` cannot be cast to int in the
# next cell -- confirm against the CSV.
mf = mf.drop(index=[19730, 29503, 35587])

In [4]:
# Cast the movie id column to integers so it can be used as a merge key below.
mf["id"] = mf["id"].astype(int)

In [5]:
# Number of distinct movie ids before merging.
print(mf['id'].nunique())

# Attach credits and keywords by movie id (inner joins).
mf = mf.merge(cf, on='id').merge(kf, on='id')

# Rows whose id has already appeared earlier; only the first occurrence is kept.
duplicate = mf[mf['id'].duplicated()]
mf = mf.drop(index=duplicate.index)

# A movie without a title cannot be looked up later, so drop those rows.
mf = mf.dropna(subset=['title'])
mf.shape

45433


(45429, 22)

In [6]:
# Keep only the movies whose id appears in the small links table.
# `.copy()` makes `smf` an independent frame: the following cells add and
# overwrite columns on it, which would otherwise be assignments on a slice of
# `mf` (SettingWithCopyWarning, hidden here by the global warnings filter).
smf = mf[mf["id"].isin(lsf)].copy()
smf.shape

(9082, 22)

In [7]:
# Summarise column dtypes and non-null counts of the filtered frame.
smf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9082 entries, 0 to 41669
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 9082 non-null   object 
 1   budget                9082 non-null   object 
 2   genres                9082 non-null   object 
 3   id                    9082 non-null   int32  
 4   imdb_id               9082 non-null   object 
 5   original_language     9082 non-null   object 
 6   overview              9070 non-null   object 
 7   popularity            9082 non-null   object 
 8   production_companies  9082 non-null   object 
 9   production_countries  9082 non-null   object 
 10  release_date          9082 non-null   object 
 11  revenue               9082 non-null   float64
 12  runtime               9082 non-null   float64
 13  spoken_languages      9082 non-null   object 
 14  status                9080 non-null   object 
 15  title               

In [8]:
def _parse_literal(value):
    """Parse a stringified Python literal; return already-parsed values unchanged.

    Passing non-strings through keeps this cell re-runnable: on a second run
    the columns already hold lists, which `literal_eval` would reject.
    """
    return literal_eval(value) if isinstance(value, str) else value


# `genres` and `crew` are stored as stringified lists of dicts in the CSV.
smf['genres'] = smf['genres'].apply(_parse_literal)
smf['crew'] = smf['crew'].apply(_parse_literal)

# Number of genres attached to each movie.
smf['genre_size'] = smf['genres'].apply(len)
smf['genre_size']

0        3
1        3
2        2
3        3
4        1
        ..
40952    1
41172    2
41225    4
41391    5
41669    2
Name: genre_size, Length: 9082, dtype: int64

In [9]:
def get_genre(x):
    """Return the ``'name'`` of the first entry in ``x``, or NaN when ``x`` is empty.

    ``x`` is an iterable of dict-like entries that each expose a ``'name'`` key.
    Only the first entry is ever inspected.
    """
    genre_names = (entry['name'] for entry in x)
    return next(genre_names, np.nan)

In [10]:
# Reduce each movie to its first listed genre, normalised to one lowercase
# token without spaces and wrapped in a list so it can be concatenated with
# the other feature lists later.
# A movie without any genre gets an empty list. Casting the missing value to
# str would instead inject the literal token "nan", which every genre-less
# movie would then share as a spurious feature.
smf['genre'] = smf['genres'].apply(get_genre).apply(
    lambda name: [name.replace(" ", "").lower()] if isinstance(name, str) else []
)
smf['genre']

0          [animation]
1          [adventure]
2            [romance]
3             [comedy]
4             [comedy]
             ...      
40952          [drama]
41172       [thriller]
41225      [adventure]
41391         [action]
41669    [documentary]
Name: genre, Length: 9082, dtype: object

In [11]:
def get_director(x):
    """Return the ``'name'`` of the first entry in ``x`` whose ``'job'`` is ``'Director'``.

    ``x`` is an iterable of dict-like crew entries. NaN is returned when no
    entry has the job ``'Director'``.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [12]:
# Extract the director of each movie and normalise the name to one lowercase
# token without spaces, wrapped in a list for later concatenation.
# Movies without a director get an empty list rather than the literal token
# "nan" (which a str cast of the missing value would produce and which would
# make all such movies look related).
smf['director'] = smf['crew'].apply(get_director).apply(
    lambda name: [name.replace(" ", "").lower()] if isinstance(name, str) else []
)
smf['director']

0             [johnlasseter]
1              [joejohnston]
2             [howarddeutch]
3           [forestwhitaker]
4             [charlesshyer]
                ...         
40952        [greggchampion]
41172      [tinusureshdesai]
41225    [ashutoshgowariker]
41391          [hideakianno]
41669            [ronhoward]
Name: director, Length: 9082, dtype: object

In [13]:
# Wrap each overview text in a single-element list so it can be concatenated
# with the genre and director lists.
# A missing overview becomes an empty list instead of the text "nan".
# Values that are already lists are kept, so re-running the cell is harmless.
smf['overview'] = smf['overview'].apply(
    lambda text: text if isinstance(text, list) else ([text] if isinstance(text, str) else [])
)
smf['overview']

# Concatenate all the converted elements into a single string

0        [Led by Woody, Andy's toys live happily in his...
1        [When siblings Judy and Peter discover an ench...
2        [A family wedding reignites the ancient feud b...
3        [Cheated on, mistreated and stepped on, the wo...
4        [Just when George Banks has recovered from his...
                               ...                        
40952    [A man must cope with the loss of his wife and...
41172    [Rustom Pavri, an honourable officer of the In...
41225    [Village lad Sarman is drawn to big, bad Mohen...
41391    [From the mind behind Evangelion comes a hit l...
41669    [The band stormed Europe in 1963, and, in 1964...
Name: overview, Length: 9082, dtype: object

In [14]:
# Concatenate the three per-movie lists (genre, director, overview) and join
# their elements with single spaces into one text document per movie.
smf['overall'] = (smf['genre'] + smf['director'] + smf['overview']).apply(' '.join)
smf['overall']

0        animation johnlasseter Led by Woody, Andy's to...
1        adventure joejohnston When siblings Judy and P...
2        romance howarddeutch A family wedding reignite...
3        comedy forestwhitaker Cheated on, mistreated a...
4        comedy charlesshyer Just when George Banks has...
                               ...                        
40952    drama greggchampion A man must cope with the l...
41172    thriller tinusureshdesai Rustom Pavri, an hono...
41225    adventure ashutoshgowariker Village lad Sarman...
41391    action hideakianno From the mind behind Evange...
41669    documentary ronhoward The band stormed Europe ...
Name: overall, Length: 9082, dtype: object

In [15]:
# TF-IDF over unigrams and bigrams of the combined text, English stop words
# removed.
# min_df=1 keeps every term that occurs in at least one document, which is
# the same vocabulary the previous integer min_df=0 selected; recent
# scikit-learn releases validate parameters and reject an integer min_df
# below 1.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
    stop_words='english',
)
tfidf_matrix = tf.fit_transform(smf['overall'])

In [16]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix;
# entry [i, j] is the similarity between movie i and movie j.
cos_matrix1 = cosine_similarity(tfidf_matrix)
cos_matrix1

array([[1.        , 0.00707227, 0.        , ..., 0.        , 0.        ,
        0.00477122],
       [0.00707227, 1.        , 0.01674929, ..., 0.00262777, 0.00211113,
        0.00414342],
       [0.        , 0.01674929, 1.        , ..., 0.        , 0.00263269,
        0.        ],
       ...,
       [0.        , 0.00262777, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00211113, 0.00263269, ..., 0.        , 1.        ,
        0.00173603],
       [0.00477122, 0.00414342, 0.        , ..., 0.        , 0.00173603,
        1.        ]])

In [17]:
# Renumber rows 0..n-1 (the old index is kept as a column) so row positions
# match the rows/columns of the cosine-similarity matrix.
smf = smf.reset_index(drop=False)

# `titles`: position -> title; `indices`: title -> position.
titles = smf['title']
indices = pd.Series(data=smf.index, index=titles)
print(indices)

title
Toy Story                                                0
Jumanji                                                  1
Grumpier Old Men                                         2
Waiting to Exhale                                        3
Father of the Bride Part II                              4
                                                      ... 
The Last Brickmaker in America                        9077
Rustom                                                9078
Mohenjo Daro                                          9079
Shin Godzilla                                         9080
The Beatles: Eight Days a Week - The Touring Years    9081
Length: 9082, dtype: int64


In [18]:
def get_recommendations(title, smf):
    """Return the movies most similar in content to ``title``.

    Uses the module-level ``indices`` (title -> row position), ``titles``
    (row position -> title) and ``cos_matrix1`` (pairwise cosine similarity).

    Parameters
    ----------
    title : str
        Title of the reference movie; must be a label of ``indices``.
    smf : pandas.DataFrame
        Frame to pick the recommended rows from. It must contain the columns
        ``id``, ``title``, ``vote_average``, ``vote_count``, ``director`` and
        ``genre``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``smf`` whose title is among the 30 most similar titles,
        sorted by ``vote_average`` (descending) and restricted to the six
        columns listed above. The result is an independent copy.

    Raises
    ------
    KeyError
        If ``title`` is not known.
    """
    if title not in indices.index:
        raise KeyError(f"Unknown movie title: {title!r}")

    idx = indices[title]
    # Several movies can share one title, in which case the lookup yields a
    # Series of positions; use the first match so that `cos_matrix1[idx]`
    # stays a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank every movie by similarity to the reference movie.
    sim_scores = sorted(enumerate(cos_matrix1[idx]), key=lambda pair: pair[1], reverse=True)
    # Position 0 is expected to be the movie itself (similarity 1), so skip it
    # and keep the next 30.
    movie_indices = [position for position, _ in sim_scores[1:31]]
    similar_titles = list(titles.iloc[movie_indices])

    columns = ["id", "title", "vote_average", "vote_count", "director", "genre"]
    matches = smf[smf["title"].isin(similar_titles)]
    matches = matches.sort_values(by=['vote_average'], ascending=False)
    # Copy so callers can add columns without writing into a slice of `smf`.
    return matches[columns].copy()

In [19]:
# Example query: movies with content similar to 'Rustom', ordered by vote_average.
get_recommendations('Rustom',smf)

Unnamed: 0,id,title,vote_average,vote_count,director,genre
4745,26246,Incident at Oglala,8.2,3.0,[michaelapted],[documentary]
874,521,Dial M for Murder,7.9,539.0,[alfredhitchcock],[crime]
3814,269,Breathless,7.7,322.0,[jean-lucgodard],[drama]
3034,93,Anatomy of a Murder,7.7,207.0,[ottopreminger],[crime]
2460,24226,The Verdict,7.4,132.0,[sidneylumet],[drama]
5676,17801,The Letter,7.4,42.0,[williamwyler],[crime]
2005,1847,The Long Goodbye,7.3,112.0,[robertaltman],[thriller]
2351,32255,The Palm Beach Story,7.3,46.0,[prestonsturges],[comedy]
5887,17208,Paradise Lost 2: Revelations,7.2,26.0,[joeberlinger],[documentary]
5056,16227,Dark Passage,7.2,81.0,[delmerdaves],[crime]


## Collaborative Filtering

In [None]:
display(rf)

# Distinct movie ids that occur in the ratings table.
rated_movie_ids = rf['movieId'].unique()
print(len(rated_movie_ids))

# How many of those ids also occur in smf['id'].
# A vectorised membership test replaces a nested Python loop over both id
# sets (tens of thousands x thousands of comparisons) with the same result.
# NOTE(review): this compares rf['movieId'] with smf['id'] directly -- confirm
# that both columns use the same identifier scheme.
count = int(np.isin(rated_movie_ids, smf['id'].unique()).sum())
print(count)
display(rf['movieId'].nunique())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


45115


In [None]:
# mr=pd.DataFrame(columns =['movieId', 'ratings'], index = [x for x in range(len(rsf['movieId'].unique())-1)])
# i=0
# for mov in rsf['movieId'].unique():
#     mr['movieId'][i]=mov
#     mr['ratings'][i]=len(rsf.loc[rsf['movieId']==mov])
#     i+=1
# print(mr['ratings'].max())


In [None]:
# Ids of the movies kept in the content-based subset.
tsmf = smf['id']
print(tsmf.unique().size)

# Keep only ratings whose movieId appears among those ids.
rf = rf[rf['movieId'].isin(tsmf)]
print(rf['movieId'].unique().size)
rf.info()


In [None]:
# Number of ratings each movie received, one row per movie in order of first
# appearance in `rf`.
# A single groupby replaces the per-movie loop, which (a) pre-allocated one
# row too few, so the last movie was silently lost, (b) relied on chained
# assignment, and (c) scanned the whole ratings table once per movie.
movie_ratings = (
    rf.groupby('movieId', sort=False)
      .size()
      .reset_index(name='ratings')
)

# Keep only movies with more than 20 ratings.
movie_ratings = movie_ratings[movie_ratings['ratings'] > 20]
display(movie_ratings)

In [None]:
# Ids of the movies that passed the rating-count threshold.
col_movie_ratings = movie_ratings['movieId']

# Keep ratings for those movies given by users with userId below 10000.
rf = rf[rf['movieId'].isin(col_movie_ratings) & rf['userId'].lt(10000)]

# Drop movies that no longer have any rating after the user filter.
movie_ratings = movie_ratings[movie_ratings['movieId'].isin(rf['movieId'])]
display(rf)

In [None]:
# User x movie rating matrix: one row per userId, one column per movieId,
# NaN where the user has not rated the movie.
u_m_matrix = rf.pivot(index='userId', columns='movieId', values='rating')
display(u_m_matrix)

# Same data with userId as a regular column.
u_m_df = u_m_matrix.reset_index()

# Compact summary of the movie columns instead of printing every id on its
# own line (one line per movie floods the cell output).
print(f"{u_m_matrix.shape[1]} movie columns; first ids: {list(u_m_matrix.columns[:10])}")

In [None]:
# Centre every user's ratings on that user's own mean rating.
nu_m_matrix = u_m_matrix.sub(u_m_matrix.mean(axis='columns'), axis='index')
display(nu_m_matrix)

In [None]:
# Pearson correlation between users: transpose so users become columns, then
# correlate the columns pairwise.
similarity = nu_m_matrix.transpose().corr(method='pearson')
display(similarity.loc[:, 1])

In [None]:
# Users whose correlation with user 1 exceeds 0.3, most similar first.
similar_users = similarity.loc[similarity[1] > 0.3, 1].sort_values(ascending=False)
display(similar_users)

# Two-column frame: userId, similarity.
su_df = similar_users.rename('similarity').reset_index()
display(su_df)

# Attach each similar user's ratings. The rating rows are looked up by userId
# and renumbered to su_df's 0..k-1 index before joining. Assigning them by
# their original row labels would align them with the wrong users.
neighbour_ratings = u_m_matrix.reindex(su_df['userId']).reset_index(drop=True)
su_df = pd.concat([su_df, neighbour_ratings], axis=1).fillna(0)

# Row 0 is the most similar entry, i.e. user 1 compared with themselves.
su_df = su_df.drop(index=0)
display(su_df)



In [None]:
def getrating(target_user):
    """Return ``target_user``'s rating row with unrated movies filled by predictions.

    Uses the module-level ``similarity`` (user x user correlation),
    ``u_m_matrix`` (user x movie ratings, NaN = unrated) and ``u_m_df``
    (the same matrix with ``userId`` as a column).

    For every movie the user has not rated, the prediction is the
    similarity-weighted average of the ratings given by users whose
    similarity to ``target_user`` exceeds 0.3. Movies that none of those
    users rated get 0. Movies the user has rated keep their rating.

    Parameters
    ----------
    target_user
        A userId present in ``similarity`` / ``u_m_matrix``.

    Returns
    -------
    pandas.DataFrame
        One row with a ``userId`` column followed by one column per movie.

    Raises
    ------
    KeyError
        If ``target_user`` is not present in the similarity matrix.
    """
    if target_user not in similarity.columns:
        raise KeyError(f"user {target_user!r} is not present in the similarity matrix")

    # Similar users and their similarity weights, indexed by userId.
    user_sims = similarity[target_user]
    user_sims = user_sims[user_sims > 0.3]

    # Ratings of the similar users, selected BY userId so that each rating row
    # is multiplied with that same user's similarity (0 marks "not rated").
    neighbour_ratings = u_m_matrix.loc[user_sims.index].fillna(0)
    rated_mask = (neighbour_ratings != 0).astype(float)

    weighted_sum = neighbour_ratings.mul(user_sims, axis=0).sum(axis=0)
    # Only users who actually rated a movie contribute weight for that movie.
    weight_total = rated_mask.mul(user_sims, axis=0).sum(axis=0)
    predicted = weighted_sum.div(weight_total).where(weight_total != 0, 0)

    # Keep the user's own ratings; fill only the unrated movies.
    own_ratings = u_m_matrix.loc[target_user].fillna(0)
    completed = own_ratings.where(own_ratings != 0, predicted)

    # Rebuild the one-row frame in the layout of `u_m_df` (same row label,
    # `userId` first, then the movie columns).
    row_label = u_m_df.index[u_m_df['userId'] == target_user]
    iumdf = pd.DataFrame([completed.to_numpy()], index=row_label, columns=u_m_matrix.columns)
    iumdf.insert(0, 'userId', target_user)
    return iumdf
            

In [None]:
# Example: rating row returned by getrating for user 5575.
df3 = getrating(5575)
display(df3)

In [None]:
# Keep only movies that are still present in `movie_ratings`.
smf = smf[smf['id'].isin(movie_ratings['movieId'])]

# Spot check: title stored for id 2058, and the number of remaining movie ids.
display(smf.loc[smf['id'] == 2058, 'title'])
print(movie_ratings['movieId'].nunique())


In [None]:
# Scratch cell: fetch the rating row of user 1 and the content-based
# recommendations for 'Avatar', then add an empty `user_ratings` column.
df1 = getrating(1)
display(movie_ratings)
df = get_recommendations('Avatar' , smf)
df['user_ratings']=0
display(df)
i=0
# NOTE(review): this looks up the row LABEL 2058 of `df`, not the movie id
# 2058 -- confirm which one is intended.
print(df['user_ratings'][2058])
# for index in df.index:
    

In [None]:
# Show df1, the frame assigned from getrating(1) in an earlier cell.
display(df1)

In [None]:
# Write user 1's value for movie id 140 into the recommendation table.
# The row is selected by the `id` column: `df` is indexed by row label, not by
# movie id, and chained `df[...][...] = ...` assignment may write to a
# temporary copy. `df1[140]` is a one-row Series, so take its scalar.
df['user_ratings'] = df['user_ratings'].astype(float)
df.loc[df['id'] == 140, 'user_ratings'] = df1[140].iloc[0]
display(df)

## Hybrid system

In [None]:
def user_recommender (user_id,title):
    """Combine content-based recommendations with a user's (predicted) ratings.

    Parameters
    ----------
    user_id
        User passed to ``getrating``.
    title : str
        Reference movie passed to ``get_recommendations`` (evaluated against
        the module-level ``smf``).

    Returns
    -------
    pandas.DataFrame
        The recommendation table with an extra ``user_ratings`` column holding
        the value from ``getrating(user_id)`` for each movie ``id``; 0 when
        the id has no column in that rating row.
    """
    # One-row rating frame -> plain {column label: value} mapping.
    ratings_by_movie = getrating(user_id).iloc[0].to_dict()

    recs = get_recommendations(title, smf).copy()
    # Look ratings up by the `id` column. Indexing `recs['user_ratings'][movie]`
    # would address row labels (not movie ids) through chained assignment, so
    # the values would not reliably reach the returned frame.
    recs['user_ratings'] = [ratings_by_movie.get(movie_id, 0) for movie_id in recs['id']]
    return recs

In [None]:
# Rating row (own ratings plus filled-in values) for user 1.
getrating(1)

In [None]:
# Hybrid example: recommendations for 'Avatar' annotated with user 1's ratings.
user_recommender(1, 'Avatar')