# Recommendation System
- User based recommendation
- User based prediction
- Item based recommendation
- Item based prediction
- Evaluation

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the ratings table from the ratings CSV.
ratings = pd.read_csv(
    filepath_or_buffer='../input/movie-lens-dataset/ratings.csv',
    encoding='latin-1',
)

In [None]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

## Dividing the dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rating rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    ratings,
    test_size=0.30,
    random_state=31,
)

In [None]:
# Report (rows, columns) of the train split, then of the test split.
for subset in (train, test):
    print(subset.shape)

In [None]:
# Reshape the training ratings into a userId x movieId matrix,
# then replace the missing (unrated) cells with 0.
df_movie_features = train.pivot(index='userId', columns='movieId', values='rating')
df_movie_features = df_movie_features.fillna(0)

In [None]:
# Preview the first five users of the zero-filled rating matrix.
df_movie_features.head(n=5)

### Copy train and test dataset
These datasets will be used for prediction and evaluation.
- Dummy train will be used later to predict ratings for the movies that have not been rated by the user. To ignore the movies already rated by the user, we mark them as 0 during prediction. The movies not rated by the user are marked as 1 for prediction.
- Dummy test will be used for evaluation. To evaluate, we only make predictions for the movies rated by the user, so these are marked as 1 (and unrated movies as 0). This is just the opposite of dummy_train.

In [None]:
# Work on independent copies so the masks built later do not alter train/test.
dummy_train, dummy_test = train.copy(), test.copy()

In [None]:
# Turn the rating column into a rated / not-rated flag.
#   dummy_train: 0 where the user rated the movie, 1 otherwise
#   dummy_test:  1 where the user rated the movie, 0 otherwise
# Any positive rating counts as "rated". The earlier `x >= 1` threshold
# classified ratings below 1 (e.g. 0.5 on a half-star scale) as unrated,
# so those movies leaked into recommendations and were dropped from evaluation.
# Zero, negative and NaN ratings are still treated as "not rated".
dummy_train['rating'] = (~(dummy_train['rating'] > 0)).astype(int)
dummy_test['rating'] = (dummy_test['rating'] > 0).astype(int)

In [None]:
# Both masks are reshaped to the same userId x movieId layout.
pivot_spec = dict(index='userId', columns='movieId', values='rating')

# Prediction mask: (user, movie) pairs with no row in train are filled with 1.
dummy_train = dummy_train.pivot(**pivot_spec).fillna(1)

# Evaluation mask: (user, movie) pairs with no row in test are filled with 0.
dummy_test = dummy_test.pivot(**pivot_spec).fillna(0)

In [None]:
# Preview the first five rows of the prediction mask.
dummy_train.head(n=5)

In [None]:
# Preview the first five rows of the evaluation mask.
dummy_test.head(n=5)

# User Similarity Matrix

## Using Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# User-user similarity: 1 minus the cosine distance between the
# zero-filled rating rows of every pair of users.
user_correlation = np.subtract(
    1, pairwise_distances(df_movie_features, metric='cosine')
)
# Undefined similarities (NaN) are treated as "no similarity".
np.putmask(user_correlation, np.isnan(user_correlation), 0)
print(user_correlation)

In [None]:
# Dimensions of the user-user similarity matrix.
np.shape(user_correlation)

## Using adjusted Cosine 

### Here, not removing the NaN values and calculating the mean only for the movies rated by the user

In [None]:
# userId x movieId rating matrix; unrated cells are left as NaN on purpose
# so that the per-user mean only covers rated movies.
movie_features = train.pivot(index='userId', columns='movieId', values='rating')

In [None]:
# Preview the first five rows of the NaN-preserving rating matrix.
movie_features.head(n=5)

### Normalising each user's movie ratings around a mean of 0

In [None]:
# Mean rating of each row (user), ignoring NaN cells.
mean = np.nanmean(movie_features, axis=1)
# Subtract each row's mean from that row (row-wise broadcast).
df_subtracted = movie_features.sub(mean, axis=0)

In [None]:
# Preview the first five rows of the mean-centred ratings.
df_subtracted.head(n=5)

### Finding cosine similarity

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# User-user similarity on the mean-centred ratings (adjusted cosine).
# Unrated cells are filled with 0 before the distance is computed.
user_correlation = np.subtract(
    1, pairwise_distances(df_subtracted.fillna(0), metric='cosine')
)
# Undefined similarities (NaN) are treated as "no similarity".
np.putmask(user_correlation, np.isnan(user_correlation), 0)
print(user_correlation)

## Prediction

We make predictions using only the users who are positively correlated with the current user, because we are interested in the users who are most similar to the current user. Negatively correlated users are ignored, so correlation values less than 0 are set to 0.

In [None]:
# Zero out negative similarities in place so only positively
# correlated users contribute to the prediction.
np.putmask(user_correlation, user_correlation < 0, 0)
user_correlation

The rating predicted for a user (for movies rated as well as not rated) is the sum of the other users' ratings of the movie (as present in the rating dataset), weighted by their correlation with that user.

In [None]:
# Similarity-weighted sum of ratings: (users x users) times (users x movies).
# Unrated cells contribute 0 to the sum.
user_predicted_ratings = np.matmul(
    user_correlation,
    movie_features.fillna(0).to_numpy(),
)
user_predicted_ratings

In [None]:
# Dimensions of the predicted-rating matrix.
np.shape(user_predicted_ratings)

Since we are interested only in the movies not rated by the user, we will ignore the movies rated by the user by making it zero. 

In [None]:
# Element-wise product with the prediction mask: entries where dummy_train
# is 0 become 0, entries where it is 1 keep their predicted score.
user_final_rating = user_predicted_ratings * dummy_train
user_final_rating.head(n=5)

### Finding the top 5 recommendation for the user 1 

In [None]:
# Five highest-scoring movies for userId 1.
# The frame is indexed by userId, so select by label with `.loc`;
# `.iloc[1]` selected the second row by position rather than user 1.
user_final_rating.loc[1].sort_values(ascending=False).head(5)

# Item Based Similarity

Using Correlation

Taking the transpose of the rating matrix to normalize the ratings around the mean for each movie ID. In the user-based similarity, we had taken the mean for each user instead of each movie.

In [None]:
# Transposed rating matrix: one row per movieId, one column per userId.
# Unrated cells stay NaN.
movie_features = train.pivot(index='userId', columns='movieId', values='rating').T

movie_features.head(n=5)

Normalising the movie rating for each movie

In [None]:
# Mean rating of each row (here: each movie), ignoring NaN cells.
mean = np.nanmean(movie_features, axis=1)
# Subtract each row's mean from that row (row-wise broadcast).
df_subtracted = movie_features.sub(mean, axis=0)

In [None]:
# Preview the first five rows of the mean-centred movie ratings.
df_subtracted.head(n=5)

Finding the cosine similarity. Note that since the data is normalised, both the cosine metric and correlation metric will give the same value. 

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Item Similarity Matrix: 1 minus the cosine distance between the
# mean-centred, zero-filled rating rows of every pair of movies.
item_correlation = np.subtract(
    1, pairwise_distances(df_subtracted.fillna(0), metric='cosine')
)
# Undefined similarities (NaN) are treated as "no similarity".
np.putmask(item_correlation, np.isnan(item_correlation), 0)
print(item_correlation)

Filtering the correlation only for which the value is greater than 0. (Positively correlated)

In [None]:
# Zero out negative similarities in place; only positively correlated
# movies contribute to the prediction.
np.putmask(item_correlation, item_correlation < 0, 0)
item_correlation

# Prediction

In [None]:
# (users x movies) ratings times (movies x movies) similarities.
# movie_features is movies x users here, hence the transpose.
item_predicted_ratings = np.matmul(
    movie_features.fillna(0).T.to_numpy(),
    item_correlation,
)
item_predicted_ratings

In [None]:
# Dimensions of the item-based prediction matrix.
np.shape(item_predicted_ratings)

In [None]:
# Dimensions of the prediction mask, for comparison with the predictions.
np.shape(dummy_train)

### Filtering the rating only for the movies not rated by the user for recommendation

In [None]:
# Element-wise product with the prediction mask: entries where dummy_train
# is 0 become 0, entries where it is 1 keep their predicted score.
item_final_rating = item_predicted_ratings * dummy_train
item_final_rating.head(n=5)

### Top 5 predictions for user 1

In [None]:
# Five highest-scoring movies for userId 1 (item-based).
# The frame is indexed by userId, so select by label with `.loc`;
# `.iloc[1]` selected the second row by position rather than user 1.
item_final_rating.loc[1].sort_values(ascending=False).head(5)

# Evaluation

Evaluation will be the same as what you have seen above for the prediction. The only difference is that you will evaluate on the movies already rated by the user instead of predicting for the movies not rated by the user.

## Using User Similarity

In [None]:
# userId x movieId matrix of the held-out ratings (NaN where unrated).
test_movie_features = test.pivot(index='userId', columns='movieId', values='rating')
# Centre each user's ratings on that user's mean over rated movies.
mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = test_movie_features.sub(mean, axis=0)

# User Similarity Matrix
test_user_correlation = np.subtract(
    1, pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
)
# Undefined similarities (NaN) are treated as "no similarity".
np.putmask(test_user_correlation, np.isnan(test_user_correlation), 0)
print(test_user_correlation)

In [None]:
# Keep only positive similarities, then take the similarity-weighted
# sum of the (zero-filled) test ratings.
np.putmask(test_user_correlation, test_user_correlation < 0, 0)
test_user_predicted_ratings = np.matmul(
    test_user_correlation,
    test_movie_features.fillna(0).to_numpy(),
)
test_user_predicted_ratings

### Doing prediction for the movies rated by the user

In [None]:
# Element-wise product with the evaluation mask: only entries where
# dummy_test is 1 keep their predicted score.
test_user_final_rating = test_user_predicted_ratings * dummy_test

In [None]:
# Preview the first five rows of the masked test predictions.
test_user_final_rating.head(n=5)

### Calculating the RMSE for only the movies rated by user. For  RMSE, normalising the rating to (1,5) range. 

In [None]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

# Keep only the positive predictions; every other cell becomes NaN.
X = test_user_final_rating.copy()
X = X.where(X > 0)

# Rescale the predictions to the (1, 5) range.
# NOTE(review): MinMaxScaler appears to scale each column (movie)
# independently - confirm that per-movie scaling is what is intended.
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = scaler.transform(X)

print(y)

In [None]:
# Actual held-out ratings as a userId x movieId matrix (NaN where unrated).
test_ = test.pivot(index='userId', columns='movieId', values='rating')

In [None]:
# Number of scaled predictions that are not NaN
# (all cells minus the NaN cells).
total_non_nan = y.size - np.count_nonzero(np.isnan(y))

In [None]:
# Root-mean-squared error between the actual test ratings and the scaled
# predictions. Cells where either side is NaN are skipped by nansum.
# This replaces `sum(sum(...))`, which only worked because an earlier
# `from numpy import *` shadowed the builtin `sum`, and which depended on
# the default axis of `np.sum` applied to a DataFrame.
squared_error = ((test_ - y) ** 2).to_numpy()
rmse = (np.nansum(squared_error) / total_non_nan) ** 0.5
print(rmse)

## Using Item similarity

In [None]:
# movieId x userId matrix of the held-out ratings (NaN where unrated).
test_movie_features = test.pivot(index='userId', columns='movieId', values='rating').T

# Centre each movie's ratings on that movie's mean over the users who rated it.
mean = np.nanmean(test_movie_features, axis=1)
test_df_subtracted = test_movie_features.sub(mean, axis=0)

# Item-item similarity; NaN and negative similarities are set to 0.
test_item_correlation = np.subtract(
    1, pairwise_distances(test_df_subtracted.fillna(0), metric='cosine')
)
np.putmask(test_item_correlation, np.isnan(test_item_correlation), 0)
np.putmask(test_item_correlation, test_item_correlation < 0, 0)

In [None]:
# Dimensions of the item-item similarity matrix built from the test split.
np.shape(test_item_correlation)

In [None]:
# Dimensions of the movieId x userId test rating matrix.
np.shape(test_movie_features)

In [None]:
# (movies x movies) similarities times (movies x users) ratings, transposed
# back to users x movies so it lines up with dummy_test.
test_item_predicted_ratings = np.matmul(
    test_item_correlation,
    test_movie_features.fillna(0).to_numpy(),
).T
# Only entries where dummy_test is 1 keep their predicted score.
test_item_final_rating = test_item_predicted_ratings * dummy_test
test_item_final_rating.head(n=5)

In [None]:
# Actual held-out ratings as a userId x movieId matrix (NaN where unrated).
test_ = test.pivot(index='userId', columns='movieId', values='rating')

In [None]:
from sklearn.preprocessing import MinMaxScaler
from numpy import *

# Keep only the positive predictions; every other cell becomes NaN.
X = test_item_final_rating.copy()
X = X.where(X > 0)

# Rescale the predictions to the (1, 5) range.
# NOTE(review): MinMaxScaler appears to scale each column (movie)
# independently - confirm that per-movie scaling is what is intended.
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = scaler.transform(X)


# Actual held-out ratings as a userId x movieId matrix (NaN where unrated).
test_ = test.pivot(index='userId', columns='movieId', values='rating')

# Finding total non-NaN value (all cells minus the NaN cells)
total_non_nan = y.size - np.count_nonzero(np.isnan(y))

### Finding RMSE

In [None]:
# Root-mean-squared error between the actual test ratings and the scaled
# item-based predictions. Cells where either side is NaN are skipped by nansum.
# This replaces `sum(sum(...))`, which only worked because an earlier
# `from numpy import *` shadowed the builtin `sum`, and which depended on
# the default axis of `np.sum` applied to a DataFrame.
squared_error = ((test_ - y) ** 2).to_numpy()
rmse = (np.nansum(squared_error) / total_non_nan) ** 0.5
print(rmse)

## Conclusion: Based on the evaluation, we can conclude that user similarity performs better in terms of recommendation.