In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [191]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.metrics import mean_squared_error

In [42]:
from math import sqrt
from scipy import sparse

In [32]:
df = pd.read_csv('./datasets/merged_users+movies.csv')
movies = pd.read_csv('./datasets/movies.csv')

## Let's start with building some basic Recommendation Engines

1. Content-Based

2. Collaborative Filtering

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


By using the TfidfVectorizer, we skip having to normalize the scores for each `feature_name` once we put our text through the vectorizer. Instead, we can pass that directly over into our `cosine_similarity` method to get the similarity scores, which can be treated as the Pearson Correlation Coefficient.

In [5]:
tvec = TfidfVectorizer(lowercase=True, # using vectorizer to set all words to lowercase
                      analyzer='word', # we will vectorize based on words
                      stop_words=None, # every single genre should be important, 
                                       # in the next iteration we can try using 
                                       # stopwords
                      ngram_range=(1, 1), # considering that each word in the genre
                                          # was meant to be used as 1 word, we will
                                          # do the same
                      min_df=0         # we want to avoid missing out on any word in
                                       # the genre tagged to the movie
                      )

In [6]:
# vectorizing the genres in movies
tvec_genres = tvec.fit_transform(movies['genres'])
tvec_genres.shape

(9742, 24)

In [7]:
movies.shape

(9742, 3)

In [8]:
dense_matrix = pd.DataFrame(
    tvec_genres.todense(),
    columns=tvec.get_feature_names_out(),
    index=movies['title'],
)

In [9]:
sim_matrix = cosine_similarity(dense_matrix)
movies_sim = pd.DataFrame(
    sim_matrix,
    columns=dense_matrix.index,
    index=dense_matrix.index)

Now let's take a look at the first 5 rows of our similarity matrix. 

In [10]:
movies_sim.head()

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.0,0.813578,0.152769,0.135135,0.267586,0.0,0.152769,0.654698,0.0,0.262413,...,0.360397,0.465621,0.196578,0.516225,0.0,0.680258,0.755891,0.0,0.421037,0.267586
Jumanji (1995),0.813578,1.0,0.0,0.0,0.0,0.0,0.0,0.804715,0.0,0.322542,...,0.0,0.0,0.0,0.0,0.0,0.341376,0.379331,0.0,0.0,0.0
Grumpier Old Men (1995),0.152769,0.0,1.0,0.884571,0.570915,0.0,1.0,0.0,0.0,0.0,...,0.162848,0.0,0.419413,0.0,0.0,0.181883,0.202105,0.0,0.0,0.570915
Waiting to Exhale (1995),0.135135,0.0,0.884571,1.0,0.505015,0.0,0.884571,0.0,0.0,0.0,...,0.144051,0.201391,0.68744,0.0,0.0,0.160888,0.178776,0.466405,0.0,0.505015
Father of the Bride Part II (1995),0.267586,0.0,0.570915,0.505015,1.0,0.0,0.570915,0.0,0.0,0.0,...,0.28524,0.0,0.734632,0.0,0.0,0.318581,0.354002,0.0,0.0,1.0


## Explore the Linear Kernel method to quickly compute cosine similarity, since we have used Tfidf instead of cvec.

In [11]:
# cosine_sim = linear_kernel(tvec_genres, tvec_genres)
# cosine_sim

In [12]:
demo = movies_sim['Toy Story (1995)'].sort_values(ascending=False)
demo[demo>0.95]

title
Toy Story (1995)                                             1.000000
Toy Story 2 (1999)                                           1.000000
Tale of Despereaux, The (2008)                               1.000000
Asterix and the Vikings (Astérix et les Vikings) (2006)      1.000000
Shrek the Third (2007)                                       1.000000
Turbo (2013)                                                 1.000000
Monsters, Inc. (2001)                                        1.000000
The Good Dinosaur (2015)                                     1.000000
Antz (1998)                                                  1.000000
Emperor's New Groove, The (2000)                             1.000000
Moana (2016)                                                 1.000000
Adventures of Rocky and Bullwinkle, The (2000)               1.000000
Wild, The (2006)                                             1.000000
Inside Out (2015)                                            0.970795
Atlantis: The 

Let's create a function that will return the top `n` number of movies that are similar to the movie that has been input.

In [13]:
def movie_genre_recommender(title, n):
    """This function returns a table, 
    with the recommended movie titles 
    and their respective similarity 
    scores to the input movie.
    
    Accepts a string title, and n, the 
    number of movies to be recommneded."""
    
    if title in movies_sim.index:
        reco_series = movies_sim[title].sort_values(ascending=False).head(n+1)
        df = pd.DataFrame({
            'title':reco_series.index,
            'similarity_score': reco_series.values
        })
        df = df[df['title'] != title]
        df.reset_index(inplace=True, drop=True)
        return df.style.format({'similarity_score':"{:.1%}"})
    else:
        print('Please input a movie title that is in the available list of movies.')

With this, we have a simple, working content based recommendation system. Let's test a few use cases, with varying number of recommended movies.

In [15]:
movie_genre_recommender('Toy Story (1995)', 15)

Unnamed: 0,title,similarity_score
0,Toy Story 2 (1999),100.0%
1,"Tale of Despereaux, The (2008)",100.0%
2,Asterix and the Vikings (Astérix et les Vikings) (2006),100.0%
3,Shrek the Third (2007),100.0%
4,Turbo (2013),100.0%
5,"Monsters, Inc. (2001)",100.0%
6,The Good Dinosaur (2015),100.0%
7,Antz (1998),100.0%
8,"Emperor's New Groove, The (2000)",100.0%
9,Moana (2016),100.0%


In [16]:
movie_genre_recommender('Shrek the Third (2007)', 15)

Unnamed: 0,title,similarity_score
0,Toy Story (1995),100.0%
1,Toy Story 2 (1999),100.0%
2,"Tale of Despereaux, The (2008)",100.0%
3,Asterix and the Vikings (Astérix et les Vikings) (2006),100.0%
4,Turbo (2013),100.0%
5,"Monsters, Inc. (2001)",100.0%
6,The Good Dinosaur (2015),100.0%
7,Antz (1998),100.0%
8,"Emperor's New Groove, The (2000)",100.0%
9,Moana (2016),100.0%


In [21]:
movie_genre_recommender('Matrix, The (1999)', 10)

Unnamed: 0,title,similarity_score
0,"One, The (2001)",100.0%
1,Outland (1981),100.0%
2,Paycheck (2003),100.0%
3,Surrogates (2009),100.0%
4,Firefox (1982),100.0%
5,Garm Wars: The Last Druid (2014),100.0%
6,Chronicle (2012),100.0%
7,Déjà Vu (Deja Vu) (2006),100.0%
8,Hangar 18 (1980),100.0%
9,Insurgent (2015),100.0%


## Some qualitative evalutation of the recommender system.
Pros<br>
Cons<br>
Let's take a look at a simple implementation of the collaborative filtering recommendation system.

# Collaborative Filtering Recommendation System
1. Memory-Based Collaborative Filtering
2. Model-Based Collaborative Filtering

We will be focusing on **Memory-Based** in this notebook.
1. User-to-User Collaborative Filtering
2. Item-to-Item Collaborative Filtering

For this, we are able to use 3 different types of distance similarity metrics. 
1. Jaccard Similarity
2. Cosine Similarity
3. Pearson Similarity

For this simple system, we will continue to use the **cosine similarity**.

Let us start of by first building our User-to-User System.<br>
We will start by dropping a few columns that we will not be needing.
*Since the computing power*

In [87]:
collab_df = df.drop(columns=['timestamp', 'genres', 'movieId'])
collab_df

Unnamed: 0,userId,rating,title
0,1,4.0,Toy Story (1995)
1,5,4.0,Toy Story (1995)
2,7,4.5,Toy Story (1995)
3,15,2.5,Toy Story (1995)
4,17,4.5,Toy Story (1995)
...,...,...,...
100831,610,2.5,Bloodmoon (1997)
100832,610,4.5,Sympathy for the Underdog (1971)
100833,610,3.0,Hazard (2005)
100834,610,3.5,Blair Witch (2016)


In [205]:
train_collab.

Unnamed: 0,userId,rating,title
80568,275,5.0,Body Heat (1981)
50582,295,4.5,"Godfather: Part II, The (1974)"
8344,140,3.0,"Terminator, The (1984)"
99603,606,4.0,8 Women (2002)
71701,182,3.0,Rounders (1998)
...,...,...,...
6265,249,4.0,Reservoir Dogs (1992)
54886,75,3.0,Inside Man (2006)
76820,132,4.5,"Girl, Interrupted (1999)"
860,17,4.5,Braveheart (1995)


In [206]:
test_collab

Unnamed: 0,userId,rating,title,user_based_preds
3957,1,4.0,Dances with Wolves (1990),1.066625
5709,1,5.0,"Adventures of Robin Hood, The (1938)",0.062081
12368,1,4.0,¡Three Amigos! (1986),0.181760
11006,1,4.0,"Honey, I Shrunk the Kids (1989)",0.433476
1146,1,5.0,Desperado (1995),0.399030
...,...,...,...,...
47973,610,4.0,Edge of Tomorrow (2014),0.492656
55577,610,3.5,American Gangster (2007),0.416662
78450,610,5.0,Down by Law (1986),0.082207
93432,610,2.0,Diary of the Dead (2007),0.023544


In [209]:
train_collab, test_collab = train_test_split(collab_df,
                                             test_size=0.2,
                                             random_state=42
                                            )

We will first create a pivot table that we will then use to calculate the cosine similarity for our recommender.

In [210]:
user_pivot = pd.pivot_table(train_collab, index='userId', columns='title', values='rating')
user_pivot.fillna(0, inplace=True)
user_pivot.head()

title,'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [211]:
sparse_user_pivot = sparse.csr_matrix(user_pivot)

In [212]:
user_sim = cosine_similarity(sparse_user_pivot)
user_sim

array([[1.        , 0.01969851, 0.02485907, ..., 0.22149217, 0.07562784,
        0.11881372],
       [0.01969851, 1.        , 0.        , ..., 0.02354623, 0.03895262,
        0.08886015],
       [0.02485907, 0.        , 1.        , ..., 0.00754662, 0.        ,
        0.0178608 ],
       ...,
       [0.22149217, 0.02354623, 0.00754662, ..., 1.        , 0.11617707,
        0.24121963],
       [0.07562784, 0.03895262, 0.        , ..., 0.11617707, 1.        ,
        0.0384606 ],
       [0.11881372, 0.08886015, 0.0178608 , ..., 0.24121963, 0.0384606 ,
        1.        ]])

In [213]:
user_reco_df = pd.DataFrame(data=user_sim, 
                           columns=user_pivot.index,
                           index=user_pivot.index)

In [214]:
user_reco_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.019699,0.024859,0.159338,0.092765,0.080738,0.120346,0.084747,0.048005,0.01036,...,0.031876,0.10716,0.180013,0.061534,0.11126,0.129181,0.191271,0.221492,0.075628,0.118814
2,0.019699,1.0,0.0,0.005192,0.023794,0.015804,0.00927,0.038378,0.0,0.09063,...,0.145743,0.023278,0.007792,0.0,0.0,0.023658,0.018858,0.023546,0.038953,0.08886
3,0.024859,0.0,1.0,0.0026,0.005958,0.003324,0.0,0.005766,0.0,0.0,...,0.005868,0.005595,0.028617,0.0,0.0,0.00834,0.019832,0.007547,0.0,0.017861
4,0.159338,0.005192,0.0026,1.0,0.07563,0.076561,0.084837,0.047112,0.0,0.023463,...,0.067781,0.105625,0.268185,0.052697,0.066748,0.1694,0.094664,0.112981,0.004269,0.083538
5,0.092765,0.023794,0.005958,0.07563,1.0,0.240535,0.065772,0.389419,0.0,0.012978,...,0.044689,0.404571,0.078286,0.205423,0.132733,0.073095,0.174297,0.137727,0.254366,0.042499


In [237]:
test_collab.sort_values('userId', inplace=True)

test_collab['user_based_preds'] = np.nan

row_num = 0

while row_num < len(test_collab):
    user_id = test_collab.iloc[row_num,0]
    user_i_sim = user_reco_df[user_id].drop(user_id)
    user_i_sim = user_i_sim[user_i_sim > 0]
    
    user_i_weights = user_i_sim.values/np.sum(user_i_sim.values)
    user_i_weights
    
    get_ratings_useri = user_pivot.T
    get_ratings_useri = get_ratings_useri[get_ratings_useri[user_id] == 0]
    get_ratings_useri = get_ratings_useri.drop(user_id, axis=1)
    get_ratings_useri = get_ratings_useri[user_i_sim.index]
    
    ratings_useri = np.dot(get_ratings_useri.fillna(0).values, user_i_weights)
    ratings_useri_df = pd.DataFrame(ratings_useri, index=get_ratings_useri.index, columns=['rating'])
    
    temp_df = test_collab[test_collab['userId'] == user_id]
    
    for _ in range(0, len(temp_df)):
        if row_num < len(test_collab):
            try:
                movie_title = test_collab.iloc[row_num, 2]
                test_collab.iloc[row_num, 3] = ratings_useri_df.loc[movie_title, 'rating']
                row_num += 1
            except KeyError:
                test_collab.iloc[row_num, 3] = 0
                row_num += 1

In [238]:
sqrt(mean_squared_error(test_collab['user_based_preds'], test_collab['rating']))

3.2250661900359954

Now let's build the item to item similarity matrix.

We will first create a pivot table that we will then use to calculate the cosine similarity for our recommender.

In [217]:
item_pivot = pd.pivot_table(train_collab, columns='userId', index='title', values='rating')
item_pivot.fillna(0, inplace=True)
item_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Tis the Season for Love (2015),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [247]:
sparse_item_pivot = sparse.csr_matrix(item_pivot)
# sparse_item_pivot = sparse.csr_matrix(mean_center_rows(item_pivot))

In [248]:
item_sim = cosine_similarity(sparse_item_pivot)
item_sim

array([[ 1.        ,  1.        , -0.00164204, ..., -0.00283428,
        -0.00708751, -0.00164204],
       [ 1.        ,  1.        , -0.00164204, ..., -0.00283428,
        -0.00708751, -0.00164204],
       [-0.00164204, -0.00164204,  1.        , ..., -0.00283428,
        -0.00708751, -0.00164204],
       ...,
       [-0.00283428, -0.00283428, -0.00283428, ...,  1.        ,
        -0.01223357, -0.00283428],
       [-0.00708751, -0.00708751, -0.00708751, ..., -0.01223357,
         1.        , -0.00708751],
       [-0.00164204, -0.00164204, -0.00164204, ..., -0.00283428,
        -0.00708751,  1.        ]])

In [249]:
# def mean_center_rows(df):
#     return (df.T - df.mean(axis=1)).T

In [250]:
item_reco_df = pd.DataFrame(data=item_sim, 
                           columns=item_pivot.index,
                           index=item_pivot.index)

In [251]:
item_reco_df.head()

title,'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...All the Marbles (1981),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Hellboy': The Seeds of Creation (2004),1.0,1.0,-0.001642,-0.002254,-0.001642,-0.005363,-0.001642,-0.009714,-0.003657,-0.001642,...,-0.001642,-0.003217,-0.001642,-0.001642,-0.001642,-0.006513,-0.006925,-0.002834,-0.007088,-0.001642
'Round Midnight (1986),1.0,1.0,-0.001642,-0.002254,-0.001642,-0.005363,-0.001642,-0.009714,-0.003657,-0.001642,...,-0.001642,-0.003217,-0.001642,-0.001642,-0.001642,-0.006513,-0.006925,-0.002834,-0.007088,-0.001642
'Salem's Lot (2004),-0.001642,-0.001642,1.0,0.857269,-0.001642,-0.005363,-0.001642,-0.009714,-0.003657,-0.001642,...,-0.001642,-0.003217,-0.001642,-0.001642,-0.001642,-0.006513,-0.006925,-0.002834,-0.007088,-0.001642
'Til There Was You (1997),-0.002254,-0.002254,0.857269,1.0,-0.002254,-0.007364,-0.002254,-0.013337,-0.005021,-0.002254,...,-0.002254,-0.004417,-0.002254,-0.002254,-0.002254,-0.008942,-0.009507,-0.003891,-0.009731,-0.002254
'Tis the Season for Love (2015),-0.001642,-0.001642,-0.001642,-0.002254,1.0,-0.005363,-0.001642,-0.009714,-0.003657,-0.001642,...,-0.001642,-0.003217,-0.001642,-0.001642,-0.001642,-0.006513,-0.006925,-0.002834,-0.007088,-0.001642


In [252]:
test_collab.sort_values('title', inplace=True)

In [253]:
test_collab.sort_values('title', inplace=True)

test_collab['item_based_preds'] = np.nan

row_num = 0

while row_num < len(test_collab):
    try:
        item_name = test_collab.iloc[row_num,2]
        item_name_sim = item_reco_df[item_name].drop(item_name)
        item_name_sim = item_name_sim[item_name_sim > 0]

        item_name_weights = item_name_sim.values/np.sum(item_name_sim.values)

        get_ratings_item_name = item_pivot.T
        get_ratings_item_name = get_ratings_item_name[get_ratings_item_name[user_id] == 0]
        get_ratings_item_name = get_ratings_item_name.drop(item_name, axis=1)
        get_ratings_item_name = get_ratings_item_name[item_name_sim.index]

        ratings_item_name = np.dot(get_ratings_item_name.fillna(0).values, item_name_weights)
        ratings_item_name_df = pd.DataFrame(ratings_item_name, index=get_ratings_item_name.index, columns=['rating'])

        temp_df = test_collab[test_collab['title'] == item_name]

        for _ in range(0, len(temp_df)):
            if row_num < len(test_collab):
                try:
                    user_id = test_collab.iloc[row_num, 0]
                    test_collab.iloc[row_num, 4] = ratings_item_name_df.loc[user_id, 'rating']
                    row_num += 1
                except KeyError:
                    test_collab.iloc[row_num, 4] = 0
                    row_num += 1
    except KeyError:
        test_collab.iloc[row_num, 4] = 0
        row_num += 1

In [246]:
sqrt(mean_squared_error(test_collab['item_based_preds'], test_collab['rating']))

3.6521818368988557

In [254]:
# with mean centering
sqrt(mean_squared_error(test_collab['item_based_preds'], test_collab['rating']))

3.6521818368988557

ABSOLUTELY NO EFFECT OF MEAN CENTERING.

---

In [None]:
# # Function to predict ratings
# def predict(ratings, similarity, type='user'):
#     if type == 'user':
#         mean_user_rating = ratings.mean(axis=1)
#         # Use np.newaxis so that mean_user_rating has same format as ratings
#         ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
#         pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
#     elif type == 'item':
#         pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
#     return pred

In [None]:
# Predict ratings on the training data with both similarity score
# user_prediction = predict(train_data_matrix, user_correlation, type='user')
# item_prediction = predict(train_data_matrix, item_correlation, type='item')

In [25]:
# small_data = df.sample(frac=0.2)[['userId', 'movieId', 'rating']]
# small_data.head()

Unnamed: 0,userId,movieId,rating
71508,125,1120,4.5
93747,606,2473,2.5
99693,517,165489,2.5
64941,45,3972,5.0
62222,249,2605,3.5


In [28]:
# small_data.isnull().sum()

userId     0
movieId    0
rating     0
dtype: int64