## Imports

In [None]:
!pip install surprise
import os
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
# scikit-learn's splitter is aliased to `tts`; the bare name `train_test_split` below is surprise's.
from sklearn.model_selection import train_test_split as tts
from sklearn.pipeline import Pipeline
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [2]:
from surprise import prediction_algorithms

## Cleaning the Dataset, creating train, test CSVs (Do Not Run)

In [None]:
# Download the gzipped Musical_Instruments_5 review archive; wget saves it into the current working directory.
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Musical_Instruments_5.json.gz

In [52]:
# Decompress the archive: -d decompresses, -k keeps the original .gz next to the extracted .json.
# NOTE(review): assumes the download above landed in /content — confirm the working directory.
!gzip -dk /content/Musical_Instruments_5.json.gz

In [54]:
# Read the JSON Lines review file (one JSON object per line, hence lines=True)
# and discard the columns that are not needed afterwards.
df = (
    pd.read_json('/content/Musical_Instruments_5.json', lines=True)
    .drop(columns=[
        'verified',
        'reviewTime',
        'style',
        'reviewerName',
        'summary',
        'unixReviewTime',
        'vote',
        'image',
    ])
)

In [82]:
# Persist the trimmed review table as CSV, without writing the positional index column.
# NOTE(review): the loading cells further down read 'dataset.csv' from this folder,
# not 'train.csv' — confirm which file this cell is meant to produce.
df.to_csv(
    '/content/drive/MyDrive/INFORMATION RETRIEVAL/HW2 RECOMMENDATION SYSTEM/train.csv',
    index=False,
)

## Loading Dataset

In [103]:
# Load the review table from CSV.
# NOTE(review): the path is presumably on a mounted Google Drive — confirm it is mounted before running.
dataset = pd.read_csv(
    '/content/drive/MyDrive/INFORMATION RETRIEVAL/HW2 RECOMMENDATION SYSTEM/dataset.csv'
)

In [104]:
# Keep only the (user, item, rating) columns, in that order.
dataset = dataset.loc[:, ['reviewerID', 'asin', 'overall']]

In [105]:
# Wrap the ratings frame in a surprise Dataset; the Reader declares ratings on a 1-to-5 scale.
# From here on `dataset` is a surprise Dataset rather than a DataFrame.
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df=dataset, reader=reader)

In [106]:
# train = pd.read_csv('/content/drive/MyDrive/INFORMATION RETRIEVAL/HW2 RECOMMENDATION SYSTEM/train.csv')
# test = pd.read_csv('/content/drive/MyDrive/INFORMATION RETRIEVAL/HW2 RECOMMENDATION SYSTEM/test.csv')

In [107]:
# train = train[['reviewerID','asin', 'overall']]
# test = test[['reviewerID','asin', 'overall']]

## Collaborative Filtering - Item Based Recommendation

In [63]:
# reader = Reader(rating_scale=(1, 5))
# dataset = Dataset.load_from_df(dataset, reader)
# train = Dataset.load_from_df(train, reader)
# test = Dataset.load_from_df(test, reader)

In [108]:
# Hold out 20% of the ratings for evaluation; the fixed seed makes the split repeatable.
trainset, testset = train_test_split(
    dataset,
    random_state=10,
    test_size=0.2,
)

In [109]:
# KNN-with-means using 5 neighbours and Pearson-baseline similarity;
# user_based=False makes the similarities item-to-item.
knn = KNNWithMeans(
    k=5,
    sim_options=dict(name='pearson_baseline', user_based=False),
)
knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7ffaf7583b50>

In [110]:
# Predict a rating for every held-out (user, item) pair with the fitted KNN model.
test_pred = knn.test(testset=testset)

In [111]:
# Root-mean-squared error of the predictions; verbose=True also prints the value.
accuracy.rmse(predictions=test_pred, verbose=True)

RMSE: 0.9614


0.9614017609355584

In [112]:
# Mean absolute error of the predictions; verbose=True also prints the value.
accuracy.mae(predictions=test_pred, verbose=True)

MAE:  0.6009


0.600927166802767

## Matrix Factorization
Using SVD on a sparse matrix to predict ratings.

In [113]:
# Re-create the 80/20 split with the same seed as the KNN section.
trainset, testset = train_test_split(
    dataset,
    random_state=10,
    test_size=0.2,
)

In [None]:
# SVD matrix factorisation trained for 50 epochs (verbose=True prints the epoch progress).
# The factors are randomly initialised, so the RNG is seeded (same seed as the split)
# to make the reported RMSE/MAE reproducible across runs.
matrix_fact = prediction_algorithms.matrix_factorization.SVD(
    n_epochs=50,
    random_state=10,
    verbose=True,
)
matrix_fact.fit(trainset)

In [115]:
# Predict a rating for every held-out (user, item) pair with the fitted SVD model.
test_pred = matrix_fact.test(testset=testset)

In [116]:
# Root-mean-squared error of the SVD predictions; verbose=True also prints the value.
accuracy.rmse(predictions=test_pred, verbose=True)

RMSE: 0.8909


0.8909225698878386

In [117]:
# Mean absolute error of the SVD predictions; verbose=True also prints the value.
accuracy.mae(predictions=test_pred, verbose=True)

MAE:  0.5886


0.5886068209262666

## Custom Algorithm, Using Machine Learning

In [118]:
# Reload the review table as a plain DataFrame; `dataset` was a surprise Dataset in the sections above.
dataset = pd.read_csv(
    '/content/drive/MyDrive/INFORMATION RETRIEVAL/HW2 RECOMMENDATION SYSTEM/dataset.csv'
)

In [119]:
# Discard every row that has a missing value in any column.
dataset = dataset.dropna(axis=0, how='any')

In [120]:
# Group the 'overall' ratings by reviewer; the aggregation happens in the next cell.
reviews_groupby_users_Ratings = dataset.groupby(by='reviewerID')['overall']

In [121]:
# Number of (non-null) ratings per reviewer, as a one-column DataFrame indexed by reviewerID.
reviews_groupby_users_Ratings = reviews_groupby_users_Ratings.count().to_frame()

In [122]:
# Reviewer IDs with at least 50 ratings ...
user_list_min50_ratings = reviews_groupby_users_Ratings.loc[
    reviews_groupby_users_Ratings['overall'].ge(50)
].index
# ... and only the reviews written by those reviewers.
dataset = dataset.loc[dataset['reviewerID'].isin(user_list_min50_ratings)]

In [123]:
# dataset

In [124]:
# Build the model's text feature: "<reviewerID> <asin>" for every review.
# `dataset` comes from a boolean-mask selection, so assigning a column in place
# would raise pandas' SettingWithCopyWarning; `assign` returns a new frame instead
# and keeps the cell safe to re-run.
dataset = dataset.assign(
    idprod=dataset['reviewerID'].astype(str) + " " + dataset["asin"].astype(str)
)

In [125]:
# 80/20 split of (id string -> rating), stratified on the reviewer ID
# and seeded so the split is repeatable.
X_train, X_test, y_train, y_test = tts(
    dataset['idprod'],
    dataset["overall"],
    test_size=0.2,
    random_state=10,
    stratify=dataset['reviewerID'],
)

In [126]:
# Rating regressor over the "<reviewerID> <asin>" strings:
#   1. CountVectorizer  - token counts of the id string
#   2. TfidfTransformer - re-weights the counts with TF-IDF
#   3. RandomForestRegressor - predicts the numeric rating (the step keeps its
#      original name 'classifier' so the pipeline's parameter names are unchanged)
# The forest is seeded (same seed as the split) so the reported RMSE/MAE are
# reproducible; n_jobs=-1 uses all cores, verbose=True prints progress.
regressor = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestRegressor(n_jobs=-1, verbose=True, random_state=10)),
])

In [127]:
# Fit the whole pipeline on the training ids and ratings; the result of fit() is kept as `custom_recommender`.
custom_recommender = regressor.fit(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   13.7s finished


In [128]:
# Predicted ratings for the held-out id strings.
y_predicted = regressor.predict(X=X_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished


In [129]:
# RMSE on the held-out ratings. Arguments follow scikit-learn's (y_true, y_pred)
# order; the value is unchanged because squared error is symmetric.
sqrt(mean_squared_error(y_test, y_predicted))

0.8614226509572163

In [130]:
# MAE on the held-out ratings. Arguments follow scikit-learn's (y_true, y_pred)
# order; the value is unchanged because absolute error is symmetric.
mean_absolute_error(y_test, y_predicted)

0.504825363068179

## Custom Algorithm, Regressor Using Review Text

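This method is not strictly a recommender system, because it needs the review text at inference time. Its predictions nevertheless achieve a lower RMSE and MAE than the earlier methods. Note that it is trained and evaluated on only the first 10,000 rows of the dataset, so its numbers are not directly comparable with those above.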

### Imports

In [131]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, ndcg_score, accuracy_score, mean_squared_error, mean_absolute_error

In [132]:
# Reload the full review table (including the review text) from CSV.
dataset = pd.read_csv(
    '/content/drive/MyDrive/INFORMATION RETRIEVAL/HW2 RECOMMENDATION SYSTEM/dataset.csv'
)

In [133]:
# Work on the first 10,000 rows only (all columns).
dataset = dataset.head(10000)

In [134]:
# Discard every row that has a missing value in any column.
dataset = dataset.dropna(axis=0, how='any')

In [135]:
# 80/20 split of (review text -> rating), seeded for repeatability.
# The bare name `train_test_split` is bound to surprise's splitter by the top import
# cell and only rebound to scikit-learn's by the import cell of this section, so the
# `tts` alias (always scikit-learn's) is used to make the call independent of cell order.
X_train, X_test, y_train, y_test = tts(
    dataset["reviewText"],
    dataset["overall"],
    random_state=42,
    test_size=0.2,
)

In [136]:
# Rating regressor over the review text (the variable and step keep the name
# 'classifier' because later cells and the pipeline's parameter names refer to it):
#   1. CountVectorizer  - token counts with English stop words removed
#   2. TfidfTransformer - re-weights the counts with TF-IDF
#   3. RandomForestRegressor - predicts the numeric rating
# The forest is seeded (same seed as the split) so the reported RMSE/MAE are
# reproducible; n_jobs=-1 uses all cores, verbose=True prints progress.
classifier = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestRegressor(n_jobs=-1, verbose=True, random_state=42)),
])

In [137]:
# Fit the text pipeline on the training reviews; the result of fit() is kept as `model`.
model = classifier.fit(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.9min finished


In [138]:
# Predicted ratings for the held-out review texts.
y_predicted = model.predict(X=X_test)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished


In [141]:
# RMSE on the held-out ratings. Arguments follow scikit-learn's (y_true, y_pred)
# order; the value is unchanged because squared error is symmetric.
sqrt(mean_squared_error(y_test, y_predicted))

0.7361102074504452

In [140]:
# MAE on the held-out ratings. Arguments follow scikit-learn's (y_true, y_pred)
# order; the value is unchanged because absolute error is symmetric.
mean_absolute_error(y_test, y_predicted)

0.4610654468073699