In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import string
from scipy.sparse import csr_matrix
pd.set_option('display.max_rows', None)
np.set_printoptions(edgeitems=100)
np.core.arrayprint._line_width = 200
from sklearn.neighbors import NearestNeighbors
import datetime
from datetime import datetime 
import regex as re




## Collaborative filtering ##

We will first develop the **item-based filtering system**.
1. Data **exploration**
2. Data **cleaning**
3. Features **plotting**
4. System builder

### Data exploration ###

Uploading data set with ratings and movie info

In [2]:
ratings=pd.read_csv("Datasets/u.data",sep="\t",header=None,engine="python")
movies_raw=pd.read_csv("Datasets/u.item",sep="\|",header=None,engine="python")

Giving names to columns in both datasets.

In [3]:
rat_columns=["user_id","movie_id","rating","timestamp"]
ratings.columns=rat_columns
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
movies_colummns=["movie_id","title"]
movies=movies_raw[[0,1]]
movies.columns=movies_colummns
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


**Cleaning** title column by getting rid of the **release date**

In [5]:
movies["title"].replace(".\d+.*","",regex=True,inplace=True)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [6]:
#Checking that the regex has been inplaced correctly
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story
1,2,GoldenEye
2,3,Four Rooms
3,4,Get Shorty
4,5,Copycat


**Merging** the two dataframes to have movies and ratings in the same table,dropping missing values in title and changing timestamp to datetime

In [7]:
movies_with_users=movies.merge(ratings,how="left",on="movie_id")
movies_with_users["timestamp"]=pd.to_datetime(movies_with_users["timestamp"], unit='s')
movies_with_users["title"]=movies_with_users["title"].dropna()
movies_with_users["title"]=movies_with_users["title"].str.strip()

movies_with_users.head()

Unnamed: 0,movie_id,title,user_id,rating,timestamp
0,1,Toy Story,308,4,1998-02-17 17:28:52
1,1,Toy Story,287,5,1997-09-27 04:21:28
2,1,Toy Story,148,4,1997-10-16 16:30:11
3,1,Toy Story,280,4,1998-04-04 14:33:46
4,1,Toy Story,66,3,1997-12-31 20:48:44


#### Explore a little bit and check *popularity* and average rating

In [8]:
movies_with_ratings=pd.DataFrame(round(movies_with_users.groupby(["movie_id","title"])["rating"].mean()))
movies_with_ratings["total number of ratings"]=movies_with_users.groupby(["movie_id","title"])["rating"].count()

In [9]:
movies_with_ratings.reset_index(inplace=True)

In [10]:
movies_with_ratings.head()

Unnamed: 0,movie_id,title,rating,total number of ratings
0,1,Toy Story,4.0,452
1,2,GoldenEye,3.0,131
2,3,Four Rooms,3.0,90
3,4,Get Shorty,4.0,209
4,5,Copycat,3.0,86


### Split the data into train and test set

In [11]:
from sklearn.model_selection import train_test_split

movies_with_users_train, movies_with_users_test = train_test_split(movies_with_users,test_size=0.2)

# training datasets to DataFrame again to manipulate them
movies_with_users_train = pd.DataFrame(movies_with_users_train, columns= movies_with_users.columns)
movies_with_users_test = pd.DataFrame(movies_with_users_test, columns= movies_with_users.columns)

In [28]:
len(list(movies_with_users_test["user_id"].unique()))

942

 ### Make a movie matrix that with the training set in order to use it for the models

In [12]:
movies_matrix = movies_with_users_train.pivot_table(index='user_id',columns='title',values='rating').fillna(0)
movies_matrix.head()


title,Unnamed: 1_level_0,'Til There Was You,1,2 Days in the Valley,3 Ninjas: High Noon At Mega Mountain,8,8 Heads in a Duffel Bag,8 Seconds,A Chef in Love,Above the Rim,...,Yankee Zulu,Year of the Horse,You So Crazy,Young Frankenstein,Young Guns,Young Guns II,"Young Poisoner's Handbook, The",Zeus and Roxanne,unknown,Á köldum klaka (Cold Fever)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


Build a **sparse matrix** and initialize **nearest neighbors model**

In [13]:
movies_matrixT=movies_matrix.T
movies_sparse_matrix=csr_matrix(movies_matrixT)

In [14]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(movies_sparse_matrix)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

### Building the recommender system functions ###

Get recommended movies using **correlation** between them

In [15]:

def top_recommendations_correlation (movie): 
    user_rating = movies_matrix[movie] #get the rating by each user for the movie input

    
    #get the correlation with respect to other movies and drop NaN values
    similar_movie = movies_matrix.corrwith(user_rating)
    corr_with_movie = pd.DataFrame(similar_movie,columns=["Correlation"])
    corr_with_movie.dropna(inplace=True)
    corr_with_movie.reset_index(inplace=True)
    
    #consider only movies that have more than 50 ratings
    corr_with_movie = corr_with_movie.merge(movies_with_ratings,how="left",on="title")
    corr_with_movie = corr_with_movie[corr_with_movie["total number of ratings"]>50].sort_values("Correlation",ascending=False)
    
    #return the top 10 correlated movies
    print("Top recommendations for you based on the movie inputed based on correlation:\n ")
    for i,j in zip(list(corr_with_movie.head(11)["title"]),list(corr_with_movie.head(11)["Correlation"])):
        print(i +" with correlation "+ str(j))
        
    
        
             
   

In [16]:
top_recommendations_correlation("Cinderella")

Top recommendations for you based on the movie inputed based on correlation:
 
Cinderella with correlation 0.9999999999999998
Snow White and the Seven Dwarfs with correlation 0.5219765619156463
Dumbo with correlation 0.5020560618712248
Pinocchio with correlation 0.48333511875362345
Fantasia with correlation 0.47390461072514
Beauty and the Beast with correlation 0.4159099071098829
Lion King, The with correlation 0.3970525335700653
Alice in Wonderland with correlation 0.3872118581553545
E.T. the Extra-Terrestrial with correlation 0.3861766313247582
Mary Poppins with correlation 0.37695022567800107
Sword in the Stone, The with correlation 0.3748025627449714


Get recommended movies using collaborative filtering using **nearest neighbours**

In [17]:
def top_recommendations_Knn(movie):
    
    # Find similar movies (nearer to the selected movie) using kneighbors
    distances, indices = model_knn.kneighbors(movies_matrixT[movies_matrixT.index==movie].values.reshape(1, -1), n_neighbors=11)

    # Convert 'distances' array into 1-D array with flatten
    print("Top recommendations for you based on the movie inputed based on nearest neighbours system: \n ")
    for i in range(0, len(distances.flatten())):
        print("{0}: {1}, with distance of {2}:".format(i, movies_matrixT.index[indices.flatten()[i]], distances.flatten()[i]))

In [24]:
top_recommendations_Knn("Cinderella")

Top recommendations for you based on the movie inputed based on nearest neighbours system: 
 
0: Cinderella, with distance of 0.0:
1: Snow White and the Seven Dwarfs, with distance of 0.4196504700951331:
2: Dumbo, with distance of 0.44702332882393647:
3: Fantasia, with distance of 0.4641670912756507:
4: Pinocchio, with distance of 0.47011082129529547:
5: Beauty and the Beast, with distance of 0.5076315862143317:
6: E.T. the Extra-Terrestrial, with distance of 0.5219999572190519:
7: Lion King, The, with distance of 0.5229808538873402:
8: Mary Poppins, with distance of 0.545702115516484:
9: Jurassic Park, with distance of 0.5465780495756186:
10: Sound of Music, The, with distance of 0.5572192481188428:


## Evaluating the models

In order to evaluate both models, we will try to recommend 5 movies to a sample of 5 users based on the **first movie they rated** and check if the have seen and rated in the **future**

First, we create a dataframe with the **sorted** users and timestamp of their ratings

In [19]:
sorted_movies=movies_with_users.sort_values(by=["user_id","timestamp"])


Now we take a sample of the **test set**

In [20]:
users_sample=movies_with_users_test

Let's define the functions to evaluate both systems

1. Correlation system

In [21]:
def evaluate_correlation (user, movie): 
    user_rating = movies_matrix[movie] #get the rating by each user for the movie input

    
    #get the correlation with respect to other movies and drop NaN values
    similar_movie = movies_matrix.corrwith(user_rating)
    corr_with_movie = pd.DataFrame(similar_movie,columns=["Correlation"])
    corr_with_movie.dropna(inplace=True)
    corr_with_movie.reset_index(inplace=True)
    
    #consider only movies that have more than 50 ratings
    corr_with_movie = corr_with_movie.merge(movies_with_ratings,how="left",on="title")
    corr_with_movie = corr_with_movie[corr_with_movie["total number of ratings"]>50].sort_values("Correlation",ascending=False)
    
    #return the top 10 correlated movies
    recommended_movies_corr=list(corr_with_movie.head(10)["title"])
    
    movies_user_seen=list(movies_with_users[movies_with_users["user_id"]==user]["title"])

    matched_movies_corr = []
    for movie in recommended_movies_corr:
        if movie in movies_user_seen:
            matched_movies_corr.append(movie)
    
    #print(len(matched_movies_corr))
       
    return matched_movies_corr
    #print(movies_user_seen)
    
    
    #c=0
    #for i,j in zip(list(corr_with_movie.head(10000)["title"]),list(corr_with_movie.head(10000)["Correlation"])):
    #    if c==0:
    #        c=1
    #        print("\n Recommendations for user " + str(user) + " based on his first rated movie " + i + " using correlation:\n")   
    #    else:
    #        
    #        print(i +" with correlation "+ str(j))

2. Nearest neighbours system

In [22]:
def evaluate_Knn(user,movie):
    
    # Find similar movies (nearer to the selected movie) using kneighbors
    distances, indices = model_knn.kneighbors(movies_matrixT[movies_matrixT.index==movie].values.reshape(1, -1), n_neighbors=11)

    recommended_movies_knn=movies_matrixT.index[indices.flatten()]  # Convert 'distances' array into 1-D array with flatten

    movies_user_seen=list(movies_with_users[movies_with_users["user_id"]==user]["title"])

    
    matched_movies_knn = []
    for movie in recommended_movies_knn:
        if movie in movies_user_seen:
            matched_movies_knn.append(movie)
    #print(len(matched_movies_knn))

    return matched_movies_knn

    
    #for i in range(0, len(distances.flatten())):
    #    if i == 0:
    #        print("\n Recommendations for user " + str(user) + " based on his first rated movie " + movie + " using nearest neighbours model:\n")   

    #    else:
    #        print("{0}: {1}, with distance of {2}:".format(i, movies_matrixT.index[indices.flatten()[i]], distances.flatten()[i]))

#### Now let's create the loop to evaluate the whole test set

In [23]:
matches_corr= []
matches_knn=[]

for user in users_sample["user_id"][:10]:
    
    # We identify  the **first rated** movie of this users
    
    matched_movies_corr=evaluate_correlation(sorted_movies[sorted_movies["user_id"]==user].iloc[0]["user_id"],sorted_movies[sorted_movies["user_id"]==user].iloc[0]["title"])
   
    matched_movies_knn=evaluate_Knn(sorted_movies[sorted_movies["user_id"]==user].iloc[0]["user_id"],sorted_movies[sorted_movies["user_id"]==user].iloc[0]["title"])
    
    
    for match_corr in matched_movies_corr:
        matches_corr.append(len(matched_movies_corr))
    for match_knn in matched_movies_knn:
        matches_knn.append(len(matched_movies_knn))
            
matches_corr=np.array(matches_corr)
matches_knn=np.array(matches_knn)
print(len(matches_corr)) #Number of users that matched!!
print(len(matches_knn))  #Number of users that matched!!
total_matches_corr=sum(matches_corr) 
total_matches_knn=sum(matches_knn)
print(total_matches_corr) #Total matched movies
print(total_matches_knn)  #Total matched movies
    


43
59
243
425
