To download dataset go [here](https://grouplens.org/datasets/movielens/25m/)

In [1]:
#Dataframe manipulation library
import pandas as pd

#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt

from tqdm import tqdm
import numpy as np
import pickle 

In [2]:
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv('data1/movies.csv')

#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv('data1/ratings.csv', usecols=['userId', 'movieId'])

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId
0,1,169
1,1,2471
2,1,48516
3,2,2571
4,2,109487


In [4]:
# Find the number of samples using shape function
ratings_df.shape

(22884377, 2)

In [5]:
#Using regular expressions to find a year stored between parentheses
# Include the parantheses so that we don't remove movies titles that have years in them
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)

# extract the year without parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)

# Remove the years from the 'title' column using regular expressions
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')

#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [6]:
#Every genre is separated by a | so we simply have to call the split function on |
movies_df['genres'] = movies_df.genres.str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [7]:
#Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
moviesWithGenres_df = movies_df.copy()

#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movies_df.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
        
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Numbers of users
len(ratings_df.userId.unique())

247753

In [9]:
ratings_df.userId.max()

247753

In [10]:
# Number of movies
len(ratings_df.movieId.unique())

33670

In [11]:
# Calculate the views and view_ratio (views divided by max views)
view_count = pd.DataFrame(ratings_df.groupby('movieId').size())
view_count.columns = ['counts']
view_count['ratio'] = view_count['counts']/(view_count['counts'].max())
view_count.reset_index(inplace=True)
view_count.head(2)

Unnamed: 0,movieId,counts,ratio
0,1,60424,0.743259
1,2,23950,0.294602


In [12]:
view_ratio_dict = {}
for index, row in view_count.iterrows():
    view_ratio_dict[row['movieId']] = row['ratio']

In [13]:
# Create a copy of views info and map the corresponding view ratio
mod_views = ratings_df.copy()

In [14]:
movieid = np.array(mod_views['movieId'])

In [15]:
view_ratio = np.array([view_ratio_dict[i] for i in movieid])

In [16]:
mod_views['view_ratio'] = view_ratio
mod_views.head()

Unnamed: 0,userId,movieId,view_ratio
0,1,169,0.048428
1,1,2471,0.072525
2,1,48516,0.225522
3,2,2571,0.797456
4,2,109487,0.125578


In [17]:
ratings_df = ratings_df[:20000000]

In [18]:
user_to_movie_hashmap = {}

In [19]:
#### Mapping the user ids to movie ids to find out the details of movies viewed by each user

for user in tqdm(np.array(ratings_df.userId.unique())):
    user_to_movie_hashmap[user] = list(ratings_df[ratings_df['userId']==user]['movieId'].values)

  1%|▊                                                                         | 2520/216052 [00:47<1:06:29, 53.52it/s]


KeyboardInterrupt: 

In [None]:
# Ignore
# pd.Series([ratings_df[ratings_df['userId']==user]['movieId'].values for user in np.array(ratings_df.userId.unique())], 
#            index=ratings_df.userId.unique())

In [65]:
PYTHONHASHSEED=0 

In [66]:
# d = user_to_movie_hashmap
# split = len(d)//2
# d1 = dict(list(d.items())[:split])
# d2 = dict(list(d.items())[split:])

In [67]:
# with open('user_movie_hash_20mil_partA.pickle', 'wb') as handle:
#     pickle.dump(d1, handle)

In [68]:
# with open('user_movie_hash_20mil_partB.pickle', 'wb') as handle:
#     pickle.dump(d2, handle)

In [None]:
len(d)

In [69]:
with open('user_movie_hash_20mil_partA.pickle', 'rb') as handle:
    dump1 = pickle.load(handle)

In [70]:
with open('user_movie_hash_20mil_partB.pickle', 'rb') as handle:
    dump2 = pickle.load(handle)

In [71]:
# Merging two dictionaries.  Using this we first pass all the elements of the first dictionary into the
# third one and then pass the second dictionary into the third. This will replace the duplicate keys of 
# the first dictionary. 
user_to_movie_hashmap = {**dump1, **dump2}

In [72]:
len(user_to_movie_hashmap)

216052

In [73]:
userid = int(input("Enter the user id :"))

Enter the user id :1


In [74]:
# User2
#userid = 250
movie_watched = user_to_movie_hashmap[userid]

In [35]:
#movie_watched = [1246, 2739, 3408, 8880, 3010, 3011]

In [75]:
#Filtering out the movies from the input
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(movie_watched)]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
167,169,Free Willy 2: The Adventure Home,"[Adventure, Children, Drama]",1995,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2387,2471,Crocodile Dundee II,"[Action, Adventure, Comedy]",1988,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11372,48516,"Departed, The","[Crime, Drama, Thriller]",2006,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
#Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)

#Dropping unnecessary columns to save memory, computation and to avoid issues
userGenreTable = userMovies.drop(['movieId','title','genres','year'], 1)
userGenreTable.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Set post id as index and drop the title column
table_matrix = moviesWithGenres_df.set_index('movieId')
table_matrix.drop(columns=['title'], inplace=True)
table_matrix.head(2)

Unnamed: 0_level_0,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Generate the user profile on basis of movies watched
user_movie_view_count = view_count[view_count['movieId'].isin(movie_watched)]['ratio']
user_movie_view_count = np.array(user_movie_view_count)
#user_post_view_count = user_post_view_count[:,np.newaxis]

# Take a dot product to get the weights for each category i.e. UserProfile
user_profile = userGenreTable.transpose().dot(user_movie_view_count)
user_profile

Adventure             0.120953
Animation             0.000000
Children              0.048428
Comedy                0.072525
Fantasy               0.000000
Romance               0.000000
Drama                 0.273950
Action                0.072525
Crime                 0.225522
Thriller              0.225522
Horror                0.000000
Mystery               0.000000
Sci-Fi                0.000000
IMAX                  0.000000
Documentary           0.000000
War                   0.000000
Musical               0.000000
Western               0.000000
Film-Noir             0.000000
(no genres listed)    0.000000
dtype: float64

In [79]:
# Multipy the table_matrix by weights in User Profile and take the weighted average
recommendation_table = (user_profile*table_matrix).sum(axis=1)/user_profile.sum()
# Sort the values in descending order
recommendation_table = recommendation_table.sort_values(ascending=False)
# Have a glance
recommendation_table.head()

movieId
122787    0.953409
81132     0.953409
64645     0.953409
4620      0.883635
115333    0.883635
dtype: float64

In [80]:
# Movies recommended for the given user by content based filtering
result1 = movies_df.loc[movies_df['movieId'].isin(recommendation_table.head(50).keys())]
result1.reset_index(drop=True, inplace=True)
result1.head(5)

Unnamed: 0,movieId,title,genres,year
0,20,Money Train,"[Action, Comedy, Crime, Drama, Thriller]",1995
1,145,Bad Boys,"[Action, Comedy, Crime, Drama, Thriller]",1995
2,459,"Getaway, The","[Action, Adventure, Crime, Drama, Romance, Thr...",1994
3,1034,Freeway,"[Comedy, Crime, Drama, Thriller]",1996
4,1432,Metro,"[Action, Comedy, Crime, Drama, Thriller]",1997


In [81]:
# Actual movies viewed by the user for comparison
actual_users_movies = movies_df.loc[movies_df['movieId'].isin(movie_watched)]
actual_users_movies.head()

Unnamed: 0,movieId,title,genres,year
167,169,Free Willy 2: The Adventure Home,"[Adventure, Children, Drama]",1995
2387,2471,Crocodile Dundee II,"[Action, Adventure, Comedy]",1988
11372,48516,"Departed, The","[Crime, Drama, Thriller]",2006


## Collaborative filtering

In [82]:
#Filtering out users that have watched movies that the input user has watched and storing it
usersubset = mod_views[mod_views['movieId'].isin(movie_watched)]
usersubset.head(2)

Unnamed: 0,userId,movieId,view_ratio
0,1,169,0.048428
1,1,2471,0.072525


In [83]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
usersubsetgroup = usersubset.groupby('userId')

In [84]:
#Sorting it so users with more movies in common with the input will have priority
usersubsetgroup = sorted(usersubsetgroup, key= lambda x:len(x[1]), reverse=True)

In [85]:
# Find out the required details of input users from the given movie ids
input_group = mod_views[mod_views['userId'] == userid]
input_group = input_group.sort_values(by='view_ratio')
input_group.head()

Unnamed: 0,userId,movieId,view_ratio
0,1,169,0.048428
1,1,2471,0.072525
2,1,48516,0.225522


In [86]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}
for name, group in usersubsetgroup:
    n_views = len(group)
    group = group.sort_values(by='view_ratio') 
    temp_df = input_group[input_group['movieId'].isin(group['movieId'].tolist())]
    temp_ratio = temp_df['view_ratio'].tolist()
    group_ratio = group['view_ratio'].tolist()
    
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in temp_ratio]) - pow(sum(temp_ratio),2)/float(n_views)
    Syy = sum([i**2 for i in group_ratio]) - pow(sum(group_ratio),2)/float(n_views)
    Sxy = sum( i*j for i, j in zip(temp_ratio, group_ratio)) - sum(temp_ratio)*sum(group_ratio)/float(n_views)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0  

In [None]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarity_index']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

In [None]:
topUsers = pearsonDF.sort_values(by='similarity_index', ascending=False)
topUsersRating = topUsers.merge(mod_views, left_on='userId', right_on='userId', how='inner')
topUsersRating.head(2)

In [None]:
#Multiplies the similarity by the view_ratio
topUsersRating['weighted_rating'] = topUsersRating['similarity_index']*topUsersRating['view_ratio']
topUsersRating.head(2)

In [None]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarity_index','weighted_rating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head(3)

In [None]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.reset_index(drop=True, inplace=True)
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head()

In [None]:
# Movies recommended to given user by Collaborative filtering
result2 = movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'].head(50).tolist())]
result2.reset_index(drop=True, inplace=True)
result2.head(5)

In [None]:
# Hybrid systems overcomes much of the disadvantages of the above two
movies_to_recommend = result2.head(5).append(result1.head(5))
movies_to_recommend.reset_index(drop=True, inplace=True)
movies_to_recommend

In [None]:
# Actual movies viewed by the user for comparison
actual_users_movies = movies_df.loc[movies_df['movieId'].isin(movie_watched)]
actual_users_movies