# RMSE and MAE using Collaborative Filtering

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


##Importing Libraries

In [2]:
%pylab inline
import warnings
warnings.filterwarnings('ignore')
import os
os.chdir('/content/drive/My Drive/YelpDataset')

Populating the interactive namespace from numpy and matplotlib


In [0]:
# citation: https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import cross_validate as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error

In [0]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def mae(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return mean_absolute_error(prediction, ground_truth)


def collaborativeFiltering(reviews_source):
    reviews = pd.read_csv(reviews_source)
    reviews['text'] = reviews['text'].str[2:-2]

    
    print("The Dataset is Undersampled as it is a large Dataset to prevent runtime Crashing")
    
    #Balancing Dataset by undersampling
    review1 = reviews[reviews['stars'] == 1][0:2000]
    review2 = reviews[reviews['stars'] == 2][0:2000]
    review3 = reviews[reviews['stars'] == 3][0:2000]
    review4 = reviews[reviews['stars'] == 4][0:2000]
    review5 = reviews[reviews['stars'] == 5][0:2000]
    frames = [review1, review2, review3,review4,review5]
    reviews = pd.concat(frames)
    
    print("The Undersampling is completed")
    
    # Converting Business ID and User ID for the matrix
    reviews['user_id'] = pd.factorize(reviews.user_id)[0]
    reviews['business_id'] = pd.factorize(reviews.business_id)[0]
    
    # Generating unique users and restaurants from review dataset
    unique_users = reviews.user_id.unique().shape[0]
    unique_restaurants = reviews.business_id.unique().shape[0]
    
    #splitting the dataset
    train_data, test_data = train_test_split(reviews, test_size=0.20)

    #Creating two User-Item matrix for training and testing
    train_data_matrix = np.zeros((unique_users, unique_restaurants))
    
    print("User-Item matrix creation has started")
    
    # Training user-item matrix
    for line in train_data.itertuples():
         train_data_matrix[line[3], line[2]] = line[5]
            
    # Testing user-item matrix
    test_data_matrix = np.zeros((unique_users, unique_restaurants))
    for line in test_data.itertuples():
        test_data_matrix[line[3], line[2]] = line[5]
    
    print("User-Item matrix creation has been completed")
    
    print("Similarity matrix creation has begun")
    
   # User-User similarity is calculated using cosine similarity
    user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
    # Item-Item similarity is calculated using cosine similarity
    item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
    
    print("Similarity matrix creation has been completed")
    
    
    print("Prediction matrix creation based on Item and User has been started")
    
    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')
    
    print("Prediticon matrix creation has been completed")
    
    print('The Root Mean Square Error and Mean Absolute Error is being generated' + '\n')
    
    if reviews_source == 'reviews_restaurants_text.csv':
        rating_type = 'biased rating'
    elif reviews_source == 'reviews_restaurants_text_unbiased_svm.csv':
        rating_type = 'unbiased rating from Linear SVM'
    else:
        rating_type = 'unbiased rating from Naive Bayes'
    print ('The Root mean Square Error for Item-based and User-based similarity with' + rating_type)
    print ('The User-Based Collaborative Filtered Root Mean Square Error is: ' + str(rmse(user_prediction, test_data_matrix)))
    print ('The Item-Based Collaborative Filtered Root Mean Square Error is:' + str(rmse(item_prediction, test_data_matrix)) + '\n')

    print ('The Root mean Square Error for Item-based and User-based similarity with' + rating_type)
    print ('The User-Based Collaborative Filtered Root Mean Square Error is:' + str(rmse(user_prediction, train_data_matrix)))
    print ('The Item-Based Collaborative Filtered Root Mean Square Error is:' + str(rmse(item_prediction, train_data_matrix)) + '\n')
    
    print ('The Mean Absolute for Item-based and User-based similarity with' + rating_type)
    print ('The User-Based Collaborative Filtered Mean Absolute Error is:' + str(mae(user_prediction, test_data_matrix)))
    print ('The Item-Based Collaborative Filtered Root Mean Square Error is:' + str(mae(item_prediction, test_data_matrix)) + '\n')

    print ('The Mean Absolute for Item-based and User-based similarity with' + rating_type)
    print ('The User-Based Collaborative Filtered Mean Absolute Error is:' + str(mae(user_prediction, train_data_matrix)))
    print ('The Item-Based Collaborative Filtered Root Mean Square Error is:' + str(mae(item_prediction, train_data_matrix)) + '\n')   

## Filtering based on Biased Ratings

In [6]:
collaborativeFiltering('reviews_restaurants_text.csv')

The Dataset is Undersampled as it is a large Dataset to prevent runtime Crashing
The Undersampling is completed
User-Item matrix creation has started
User-Item matrix creation has been completed
Similarity matrix creation has begun
Similarity matrix creation has been completed
Prediction matrix creation based on Item and User has been started
Prediticon matrix creation has been completed
The Root Mean Square Error and Mean Absolute Error is being generated

The Root mean Square Error for Item-based and User-based similarity withbiased rating
The User-Based Collaborative Filtered Root Mean Square Error is: 3.315758126267929
The Item-Based Collaborative Filtered Root Mean Square Error is:3.3174030361627893

The Root mean Square Error for Item-based and User-based similarity withbiased rating
The User-Based Collaborative Filtered Root Mean Square Error is:3.313777500890382
The Item-Based Collaborative Filtered Root Mean Square Error is:3.3151086767609956

The Mean Absolute for Item-based 

## Filtering Based on UnBiased Ratings

In [7]:
collaborativeFiltering('reviews_restaurants_text_unbiased_svm.csv')

The Dataset is Undersampled as it is a large Dataset to prevent runtime Crashing
The Undersampling is completed
User-Item matrix creation has started
User-Item matrix creation has been completed
Similarity matrix creation has begun
Similarity matrix creation has been completed
Prediction matrix creation based on Item and User has been started
Prediticon matrix creation has been completed
The Root Mean Square Error and Mean Absolute Error is being generated

The Root mean Square Error for Item-based and User-based similarity withunbiased rating from Linear SVM
The User-Based Collaborative Filtered Root Mean Square Error is: 3.309791216966111
The Item-Based Collaborative Filtered Root Mean Square Error is:3.311251452365215

The Root mean Square Error for Item-based and User-based similarity withunbiased rating from Linear SVM
The User-Based Collaborative Filtered Root Mean Square Error is:3.3153240448613337
The Item-Based Collaborative Filtered Root Mean Square Error is:3.316637883112645