In [3]:
import pandas as pd
data_credits= pd.read_csv("credits.csv")
metadata= pd.read_csv("movies_metadata.csv",low_memory=False)
metadata['id'] = metadata['id'].apply(pd.to_numeric, errors='coerce')
combined= pd.merge(metadata,data_credits,on='id',how='inner')
combined.head()
combined['overview'].isnull().any()

True

In [4]:

import pandas as pd
import json
from ast import literal_eval
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df=combined
columns=['genres','cast','crew']
def  get_value(x):
    l = []
    if isinstance(x,list):
        for i in x:
               l.append(i['name'])
        return ' '.join(l)
    return []

for column in columns:
    df[column] = df[column].apply(literal_eval)
    df[column] = df[column].apply(get_value)

#converting the columns from json format to string format
use_columns=['title','overview','genres','cast','crew']
df2=pd.DataFrame(df,columns=use_columns)
df2['combined'] = df2[df2.columns[1:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1)

In [5]:
combined['overview'] = combined['overview'].fillna("")

In [6]:
#using TfidfVectorizer for extracting the text features
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(analyzer='word',min_df=0,stop_words='english')
tf_idf_matrix= tf_idf.fit_transform(df2['combined'])
from sklearn.metrics.pairwise import linear_kernel
cosine= linear_kernel(tf_idf_matrix, tf_idf_matrix)
ids = pd.Series(df2.index, index=df2['title'])
ids= ids.drop_duplicates()

In [7]:
#content_based_recommendation
def content_based_recommendation(title, cosine=cosine):
    indexes = ids[title]
    similarity = list(enumerate(cosine[indexes]))
    similarity = sorted(similarity, key=lambda x: x[1], reverse=True)
    movie_ids = [i[0] for i in similarity]
    result= df2['title'].iloc[movie_ids]
    result=result[1:100]
    return result

In [8]:
#collaborative filtering with SVD
import pandas as pd
ratings= pd.read_csv("ratings_small.csv")
import surprise
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

ratings=ratings.drop(['timestamp'],axis=1)
reader=Reader()
algo = SVD()
data = Dataset.load_from_df(ratings,reader=reader)
training_data = data.build_full_trainset()
algo.fit(training_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x125b67ed0>

In [18]:
#Hybrid recommendation 
def hybrid_recommender(user_id,title):
    #obtaining the similar movies from content_based_recommendation
    similar_movies=content_based_recommendation(title)
    
    #creating a dataframe from similar_movies
    movie_id=pd.Series(similar_movies.index.values.tolist())
    title=pd.Series(similar_movies.values)
    df = { 'movieId':movie_id , 'title': title } 
    df_movies = pd.DataFrame(df)
    
    #merging the movies from similar_movies with ratings
    similar_movies_id=pd.merge(df_movies,ratings,how='inner',on='movieId')
    
    #Identifying movies which have not been rated by the current user
    user_new_movies=similar_movies_id[similar_movies_id['userId']!=user_id]
    
    #selecting the unique movies from user_new_movies
    unique_movies= user_new_movies.drop_duplicates('movieId')
    
    colab_result=[]
    
    #For each movieId in unique_movies and for the current user, we predict the estimated ratings from collaborative filtering
    #with SVD
    for i in unique_movies['movieId']:
        colab_result.append(algo.predict(user_id,i)[3])
    
    movie_ratings=pd.DataFrame(colab_result)
    unique_movies.reset_index(drop=True, inplace=True)
    
    #We concatenate the estimated user ratings with the unique_movies to obtain hybrid_rating
    hybrid_rating=pd.concat([unique_movies,movie_ratings],ignore_index=True,axis=1)
    hybrid_rating.columns=['movieId','title','userId','rating','user_rating']
    
    #We sort the hybrid_ratings based on the estimated user ratings
    hybrid_rating.sort_values('user_rating',ascending=False,inplace=True)
    
    #Finally we return the remmondations from hybris recommendation
    return hybrid_rating[['title','user_rating']][:10]

print("The top 10 recommendations of the hybrid reommendation system are")
print(hybrid_recommender(1,'GoldenEye'))
    

The top 10 recommendations of the hybrid reommendation system are
                                                title  user_rating
19                                         The Jackal     3.369538
37                                   The Elephant Man     3.178567
41                              From Russia with Love     3.023502
40  Pirates of the Caribbean: The Curse of the Bla...     3.021528
10                                 Return of the Jedi     2.953069
9            Harry Potter and the Philosopher's Stone     2.948985
23           Harry Potter and the Prisoner of Azkaban     2.918307
15                               The Spy Who Loved Me     2.902927
45                      The First Great Train Robbery     2.808555
22                                         Die Hard 2     2.805611
