# <center>Recommendation System on the MovieLens Dataset</center>

In [2]:
# Necessary imports

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings; warnings.simplefilter('ignore')

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` now lives in `sklearn.model_selection`. The old location is kept
# only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from math import sqrt
from ast import literal_eval



## Collaborative Filtering

In [3]:
# Load the reduced ratings extract; the read of the full-size file is left disabled:
# ratings = pd.read_csv('data_full/ratings.csv')
ratings_small_path = 'data_full/ratings_small.csv'
ratings_small = pd.read_csv(ratings_small_path)

In [4]:
# Give the four CSV columns snake_case names, then discard the timestamp column
# (none of the cells shown in this notebook read it).
renamed_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_small.columns = renamed_columns
ratings_small = ratings_small.drop('timestamp', axis=1)
ratings_small.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [5]:
# Replace missing user/movie ids with the placeholder value 0
for id_column in ('user_id', 'movie_id'):
    ratings_small[id_column] = ratings_small[id_column].fillna(0)

In [6]:
# Impute missing ratings with the mean of the ratings that are present
mean_rating = ratings_small['rating'].mean()
ratings_small['rating'] = ratings_small['rating'].fillna(mean_rating)

In [7]:
# Preview the first rows again after the NaN handling in the previous cells
ratings_small.head()

Unnamed: 0,user_id,movie_id,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [8]:
# (rows, columns) of the cleaned ratings table
print(ratings_small.shape)

(100004, 3)


**Note:** Python throws a `MemoryError` when I use the full dataset. Hence, I pick 25% of the dataset and perform collaborative filtering on it.

In [9]:
# Randomly sample 25% of the ratings dataset.
# A fixed random_state makes the sample (and everything derived from it) repeatable
# across kernel restarts; change the seed to draw a different sample.
small_data = ratings_small.sample(frac=0.25, random_state=42)

In [10]:
# Check the sample info.
# DataFrame.info() prints its summary itself and returns None, so it is called bare;
# wrapping it in print() adds a stray "None" line after the summary.
small_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25001 entries, 27052 to 85797
Data columns (total 3 columns):
user_id     25001 non-null int64
movie_id    25001 non-null int64
rating      25001 non-null float64
dtypes: float64(1), int64(2)
memory usage: 781.3 KB
None


In [11]:
# Hold out 20% of the sampled ratings for testing. The shuffle is seeded so that the
# same rows land in each split on every run.
train_data, test_data = train_test_split(small_data, test_size=0.2, random_state=42)

In [12]:
# Test and Train data matrix
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0; selecting the
# columns and taking .values yields the same NumPy array on old and new pandas alike.
# NOTE(review): each row is still a single (user_id, movie_id, rating) record, not a
# user x movie grid -- confirm that this layout is what the similarity cells expect.
rating_columns = ['user_id', 'movie_id', 'rating']
train_data_matrix = train_data[rating_columns].values
test_data_matrix = test_data[rating_columns].values

In [13]:
# Peek at the training split: one row per (user_id, movie_id, rating) record
train_data.head()

Unnamed: 0,user_id,movie_id,rating
85067,571,34338,4.5
11050,73,6338,3.0
25459,187,586,3.5
18462,120,4306,2.5
20890,144,253,3.0


In [14]:
# First four rows of the training array; columns are user_id, movie_id, rating
train_data_matrix[:4]

array([[5.7100e+02, 3.4338e+04, 4.5000e+00],
       [7.3000e+01, 6.3380e+03, 3.0000e+00],
       [1.8700e+02, 5.8600e+02, 3.5000e+00],
       [1.2000e+02, 4.3060e+03, 2.5000e+00]])

**Idea behind user and item similarity:**

User similarity can be calculated by measuring 'pairwise distances' between the rows of the ratings dataset.

However, to calculate the 'item similarity', we have to transpose the 'ratings' data and then calculate the pairwise distances.

In [15]:
# User Similarity Matrix
# Computed from train_data_matrix (the same array the item-similarity cell transposes)
# rather than from the DataFrame, so both similarity cells share one input.
# NOTE(review): every row of that array is one (user_id, movie_id, rating) record, so
# this is a records x records correlation, not a users x users one -- confirm whether
# a user x movie pivot was intended. Its size grows with the square of the row count.
user_correlation = 1 - pairwise_distances(train_data_matrix, metric='correlation')
# Correlation is undefined (NaN) for constant rows; treat those pairs as uncorrelated
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation[:4, :4])

[[1.         0.99998855 0.95570253 0.9999543 ]
 [0.99998855 1.         0.9542832  0.9998971 ]
 [0.95570253 0.9542832  1.         0.9584729 ]
 [0.9999543  0.9998971  0.9584729  1.        ]]


In [16]:
# Item Similarity Matrix: correlate the columns by transposing the training array.
# NOTE(review): the array has three columns (user_id, movie_id, rating), so the result
# is 3 x 3 -- confirm whether a movies x movies matrix was intended.
item_distances = pairwise_distances(train_data_matrix.T, metric='correlation')
item_correlation = 1 - item_distances
# Undefined correlations come back as NaN; map them to 0
item_correlation = np.where(np.isnan(item_correlation), 0, item_correlation)
print(item_correlation[:4, :4])

[[ 1.          0.00139149  0.00804213]
 [ 0.00139149  1.         -0.02296115]
 [ 0.00804213 -0.02296115  1.        ]]


In [17]:
# Training rows in their original orientation (compare with the transpose in the next cell)
train_data.head()

Unnamed: 0,user_id,movie_id,rating
85067,571,34338,4.5
11050,73,6338,3.0
25459,187,586,3.5
18462,120,4306,2.5
20890,144,253,3.0


In [18]:
# Transposed view: the three columns become rows, with one column per rating record
train_data.T.head()

Unnamed: 0,85067,11050,25459,18462,20890,84247,17671,90016,45030,39909,...,51616,68364,65475,31402,81420,78115,26061,19625,56566,74465
user_id,571.0,73.0,187.0,120.0,144.0,564.0,118.0,598.0,316.0,292.0,...,380.0,475.0,466.0,227.0,553.0,544.0,190.0,130.0,407.0,518.0
movie_id,34338.0,6338.0,586.0,4306.0,253.0,2870.0,3256.0,1.0,1196.0,1672.0,...,5617.0,6586.0,1370.0,2316.0,102123.0,4914.0,2762.0,2951.0,3751.0,1958.0
rating,4.5,3.0,3.5,2.5,3.0,4.0,4.0,4.5,4.0,4.0,...,3.5,2.5,3.0,4.0,4.5,5.0,5.0,4.0,5.0,5.0


In [19]:
# First row of the row-to-row correlation matrix (one entry per training record)
user_correlation[:1]

array([[1.        , 0.99998855, 0.95570253, ..., 0.99972598, 0.99651712,
        0.97092856]])

In [20]:
# First row of the column-to-column correlation matrix (user_id, movie_id, rating)
item_correlation[:1]

array([[1.        , 0.00139149, 0.00804213]])

In [21]:
# Function to predict ratings
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted combination of the given ratings.

    Parameters
    ----------
    ratings : array-like, shape (n_rows, n_cols)
        Rating matrix. Anything ``np.asarray`` accepts (ndarray, nested list,
        DataFrame) is allowed.
    similarity : array-like
        Square similarity matrix: ``(n_rows, n_rows)`` for ``type='user'``,
        ``(n_cols, n_cols)`` for ``type='item'``.
    type : {'user', 'item'}, default 'user'
        Which axis the similarity matrix describes.

    Returns
    -------
    numpy.ndarray
        Predictions with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    A row of ``similarity`` whose absolute values sum to zero produces a
    division by zero (NaN/inf in the output), exactly as before.
    """
    ratings = np.asarray(ratings)
    similarity = np.asarray(similarity)

    if type == 'user':
        # Centre every row on its own mean so that rows with a generally high or
        # low level are compared on deviations rather than raw values.
        mean_user_rating = ratings.mean(axis=1, keepdims=True)
        ratings_diff = ratings - mean_user_rating
        # One normaliser per row: total absolute similarity to all rows.
        weight_totals = np.abs(similarity).sum(axis=1, keepdims=True)
        return mean_user_rating + similarity.dot(ratings_diff) / weight_totals

    if type == 'item':
        # Row sums are broadcast as per-column normalisers; this equals the column
        # sums only when `similarity` is symmetric.
        weight_totals = np.abs(similarity).sum(axis=1)
        return ratings.dot(similarity) / weight_totals

    # Previously an unknown value fell through to `return pred` and failed with an
    # UnboundLocalError; fail with a message that names the problem instead.
    raise ValueError("type must be 'user' or 'item', got {!r}".format(type))

In [22]:
# Predict ratings on the training data with both similarity scores
# NOTE(review): train_data_matrix holds (user_id, movie_id, rating) columns, so these
# "predictions" are weighted combinations over the id columns as well as the ratings --
# confirm whether a user x movie pivot was intended here.
user_prediction = predict(train_data_matrix, user_correlation, type = 'user')
item_prediction = predict(train_data_matrix, item_correlation, type = 'item')

In [23]:
# Function to calculate RMSE
def rmse(pred, actual):
    """Root-mean-squared error over the positions where ``actual`` is non-zero.

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted values; indexed with the non-zero positions of ``actual``.
    actual : numpy.ndarray
        Reference values. Entries equal to zero are skipped (presumably zeros
        mark missing ratings -- confirm against the caller).

    Returns
    -------
    float
        Square root of the mean squared difference over the kept positions.
    """
    # Keep only the non-zero entries of `actual`; compute the index once and
    # reuse it for both arrays so they stay aligned.
    observed = actual.nonzero()
    pred = pred[observed].flatten()
    actual = actual[observed].flatten()
    # scikit-learn's signature is mean_squared_error(y_true, y_pred)
    return sqrt(mean_squared_error(actual, pred))

In [24]:
# RMSE of the predictions against the test array.
# NOTE(review): both predictions were computed from train_data_matrix, while
# test_data_matrix holds a different set of rows and includes the id columns --
# confirm that comparing them position by position is meaningful.
user_rmse = rmse(user_prediction, test_data_matrix)
item_rmse = rmse(item_prediction, test_data_matrix)
print('User-based CF RMSE: ' + str(user_rmse))
print('Item-based CF RMSE: ' + str(item_rmse))

User-based CF RMSE: 17677.833472568193
Item-based CF RMSE: 21050.47348294261


## Potential Next Steps:

**Suggestions for Content-Based filtering from other data scientists I met during the meet-up:**

1. Use weighted average on each movie:
    - How about multiplying `rating count` and `average rating`?
    - For a linear column, there can be huge variance. [Try normalizing and standardizing]
<br>
2. Use a metadata tf-idf matrix (cosine similarity) rather than just the movies.
    - Use 'word2vec'
<br>
3. For collaborative filtering - try 'movie-movie' similarity and 'user-user' similarity (Computationally Expensive)
<br>
4. Try to build a Hybrid Recommender