### Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
from sklearn.model_selection import train_test_split as train_test_split
from surprise import Reader
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')

### Reading and understanding the data-set

In [1]:
# The ratings file ships without a header row, so the column names are supplied here.
cols = ['userId', 'productId', 'Rating', 'timestamp']
data = pd.read_csv(
    "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Electronics.csv",
    header=None,
    names=cols,
)

#### Checking the head of the data-set

In [1]:
data.head(10)

#### ● userId   : Every user identified with a unique id
#### ● productId : Every product identified with a unique id
#### ● Rating : Rating of the corresponding product by the corresponding user
#### ● timestamp : Time of the rating

#### Checking the data-types of the data

In [1]:
data.dtypes

##### userId and productId are objects, whereas Rating and timestamp are integers

#### The timestamp is not needed for this recommendation approach, so we drop it

In [1]:
# Drop the unused timestamp column. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
data = data.drop(columns=['timestamp'], errors='ignore')

#### Checking the information of the data

In [1]:
data.info()

##### There are about 7.8 million (7,824,482) records in the dataset; we will reduce the number of records for our analysis

#### Check for any null values in the data

In [1]:
# Count the missing entries in every column and show the per-column totals.
na_values = data.isnull().sum()
print(na_values)

##### There are no null values in this data-set

In [1]:
data.describe().T

##### Mean Rating is around 4 with a standard deviation of 1.4

In [1]:
# Report the lowest and highest rating values present in the data.
print('Minimum rating is: %d' % data['Rating'].min())
print('Maximum rating is: %d' % data['Rating'].max())

### Exploratory Data Analytics

#### Visualizing the Ratings attribute

In [1]:
# Name the column explicitly: recent seaborn releases treat the first positional
# argument as `data`, so the plotted variable must be passed via `x=`.
sns.countplot(x='Rating', data=data)

##### More users have given products a rating of 5 than any other rating
##### Ratings are in the scale of 1-5

### Taking a subset of the dataset to make it less sparse/denser

#### Unique Users and products

In [1]:
# Summary of the full data-set. Series.nunique() counts distinct ids directly,
# avoiding the full sort of millions of string ids that np.unique performs.
print("Total number of records with unique users and products")
print("*"*100)
print("The total number of records in the data-set are:", data.shape[0])
print("The total number of unique users in the data-set are:", data['userId'].nunique())
print("The total number of unique products in the data-set are:", data['productId'].nunique())

#### Analyzing the rating

In [1]:
# Number of ratings submitted by each user, most active users first.
data_no_of_ratings_userId = (
    data.groupby('userId')['Rating']
    .count()
    .sort_values(ascending=False)
)

In [1]:
data_no_of_ratings_userId.head()

##### User IDs sorted in descending order of the number of ratings they have provided

In [1]:
# Number of ratings received by each product, most-rated products first.
data_no_of_ratings_productId = (
    data.groupby('productId')['Rating']
    .count()
    .sort_values(ascending=False)
)

In [1]:
data_no_of_ratings_productId.head()

##### Product IDs sorted in descending order of the number of ratings they have received

#### Creating a new data-frame with the following conditions
##### - Only users who have given at least 50 ratings
##### - Only products which have received at least 100 ratings (from those users)

In [1]:
# Keep only the rows written by users who have submitted at least 50 ratings.
counts = data['userId'].value_counts()
data_subset = data.loc[data['userId'].isin(counts.loc[counts.ge(50)].index)]

In [1]:
# Within the user-filtered rows, keep only products with at least 100 ratings.
counts = data_subset['productId'].value_counts()
data_subset = data_subset.loc[data_subset['productId'].isin(counts.loc[counts.ge(100)].index)]


In [1]:
data_subset.head()

In [1]:
data_subset.shape

##### We now have a data-set with just 6234 records

In [1]:
# Same summary as for the full data, now for the filtered subset.
print("Total number of records with unique users and products")
print("*"*100)
print("The total number of records in the data-set are:", data_subset.shape[0])
print("The total number of unique users in the data-set are:", data_subset['userId'].nunique())
print("The total number of unique products in the data-set are:", data_subset['productId'].nunique())

In [1]:
# Wide user x product table; user/product pairs without a rating are filled with 0.
final_ratings_matrix = (
    data_subset
    .pivot(index='userId', columns='productId', values='Rating')
    .fillna(0)
)
print('Shape of final_ratings_matrix: ', final_ratings_matrix.shape)

# Density = share of matrix cells holding a non-zero (i.e. actual) rating.
given_num_of_ratings = np.count_nonzero(final_ratings_matrix)
print('given_num_of_ratings = ', given_num_of_ratings)
possible_num_of_ratings = final_ratings_matrix.size
print('possible_num_of_ratings = ', possible_num_of_ratings)
density = (given_num_of_ratings / possible_num_of_ratings) * 100
print ('density: {:4.2f}%'.format(density))

#### Final Ratings Matrix

In [1]:
final_ratings_matrix.head()

##### The above shows that it is a sparse matrix

### Simple Popularity-Based Recommendation System

In [1]:
data_PBR=data_subset

#### Mean Ratings for Each Product

In [1]:
data_PBR.groupby('productId')['Rating'].mean().head(10)

#### Sorting the products by mean rating in descending order

In [1]:
data_PBR.groupby('productId')['Rating'].mean().sort_values(ascending=False).head(10)

#### Sorting the products by number of ratings in descending order

In [1]:
data_PBR.groupby('productId')['Rating'].count().sort_values(ascending=False).head(10)

In [1]:
# One row per product holding its mean rating (kept as a DataFrame so columns can be added).
mean_count_ratings = data_PBR.groupby('productId')[['Rating']].mean()

In [1]:
# Attach the number of ratings behind each product's mean.
ratings_by_product = data_PBR.groupby('productId')['Rating']
mean_count_ratings['Rating counts'] = ratings_by_product.count()

#### Identifying the popular products with a mean rating greater than 4.5 and more than 50 ratings

In [1]:
# Popular products: mean rating above 4.5 backed by more than 50 ratings.
recommended_products = mean_count_ratings.loc[
    mean_count_ratings['Rating'].gt(4.5)
    & mean_count_ratings['Rating counts'].gt(50)
]

In [1]:
recommended_products

##### Above are some of the popular products available

#### Splitting the data

In [1]:
# Randomly split the subset 70:30 into train and test; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    data_subset,
    test_size=0.3,
    random_state=0,
)
train_data.head()

In [1]:
print('Shape of training data: ',train_data.shape)
print('Shape of testing data: ',test_data.shape)

### Building a proper Popularity Recommendation Model

In [1]:
# Popularity score of a product = number of training ratings it received.
train_data_grouped = (
    train_data
    .groupby('productId')
    .agg({'userId': 'count'})
    .reset_index()
    .rename(columns={'userId': 'score'})
)
train_data_grouped.head()

In [1]:
# Order products by score (highest first), breaking ties by productId (ascending).
train_data_sort = train_data_grouped.sort_values(
    by=['score', 'productId'],
    ascending=[False, True],
)

# Rank 1 is the most-rated product; method='first' gives tied scores distinct ranks.
train_data_sort['rank'] = train_data_sort['score'].rank(method='first', ascending=False)

# The five best-ranked products are the popularity recommendations.
popularity_recommendations = train_data_sort.head(5)
popularity_recommendations

#### Use popularity based recommender model to make predictions

In [1]:
def recommend(user_id):
    """Return the popularity-based recommendations labelled for one user.

    The popularity model is not personalised, so every user receives the same
    rows from the module-level ``popularity_recommendations`` frame; only the
    leading ``userId`` column differs.

    Parameters
    ----------
    user_id
        Identifier to place in the ``userId`` column of the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``userId`` as the first column followed by the columns
        of ``popularity_recommendations``.
    """
    # Work on a copy: assigning into the shared frame would permanently add a
    # 'userId' column to popularity_recommendations on every call.
    user_recommendations = popularity_recommendations.copy()

    # Add the user_id column for which the recommendations are being generated
    user_recommendations['userId'] = user_id

    # Bring the userId column to the front, keeping the remaining column order
    cols = ['userId'] + [c for c in user_recommendations.columns if c != 'userId']
    return user_recommendations[cols]

In [1]:
# Arbitrary example user ids; the popularity model returns the same products for each.
find_recom = [11, 123, 290]
for user_id in find_recom:
    print("The list of recommendations for the userId: %d\n" % user_id)
    print(recommend(user_id))
    print("\n")

##### Since, this is a Popularity recommender model, all the three users are given the same recommendations. Here, we predict the products based on the popularity. It is not personalized to particular user. It is a non-personalized recommender system.

### Surprise Library - Collaborative Filtering Recommendation Model: Item-Item Recommendation

#### Initializing the reader

In [1]:
reader = Reader(rating_scale=(1, 5))

#### Creating the data-set


In [1]:
# Wrap the (user, item, rating) columns of the subset in a Surprise Dataset.
data_CFBR = Dataset.load_from_df(
    data_subset[['userId', 'productId', 'Rating']],
    reader,
)

#### Splitting the data

In [1]:
# 70:30 split for the Surprise models. A fixed random_state makes the split, and
# therefore the RMSE values reported below, reproducible across kernel restarts.
trainset, testset = surprise_train_test_split(data_CFBR, test_size=.3, random_state=0)

#### Creating model and fitting the trainset

In [1]:
# Item-item KNN (user_based=False) using the Pearson-baseline similarity.
CF_Model = KNNWithMeans(
    k=50,
    sim_options={'name': 'pearson_baseline', 'user_based': False},
)
CF_Model.fit(trainset)

In [1]:
#Prediction on the testset
test_pred = CF_Model.test(testset)

In [1]:
test_pred

In [1]:
# RMSE of the item-based model. test_pred holds predictions for the held-out
# test set, so the label says "Test Set" (it previously said "Train Set").
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)


##### RMSE score obtained is 0.89

### Surprise Library - Collaborative Filtering Recommendation Model: User-User Recommendation

#### Creating the model and fitting the train-set

In [1]:
# User-user KNN (user_based=True) using the Pearson-baseline similarity.
CF_Model = KNNWithMeans(
    k=50,
    sim_options={'name': 'pearson_baseline', 'user_based': True},
)
CF_Model.fit(trainset)

In [1]:
#Getting predictions
test_pred = CF_Model.test(testset)

In [1]:
test_pred

In [1]:
# RMSE of the user-based model on the held-out test set. The label previously
# read "Item-based Model : Train Set", which described neither the model nor the data.
print("User-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True)


##### RMSE score obtained is 0.95

### Model-Based Collaborative Filtering (SVD)
#### Matrix with row per 'user' and column per 'item' 


In [1]:
# User x product rating table for the SVD model; missing ratings become 0.
pivot_df = (
    data_subset
    .pivot(index='userId', columns='productId', values='Rating')
    .fillna(0)
)
pivot_df.head()

In [1]:
print('Shape of the pivot table: ', pivot_df.shape)

In [1]:
# Give every user row a consecutive integer position (0, 1, 2, ...).
pivot_df['user_index'] = np.arange(len(pivot_df))
pivot_df.head()

In [1]:
# Replace the userId index with the integer user_index column.
pivot_df = pivot_df.set_index('user_index')
# Actual ratings given by users
pivot_df.head()

### Singular Value Decomposition

In [1]:
# Truncated SVD of the rating table, keeping 10 singular values/vectors.
U, sigma, Vt = svds(pivot_df.to_numpy(), k=10)

In [1]:
print('Left singular matrix: \n',U)

In [1]:
print('Sigma: \n',sigma)

##### Sigma is returned as a 1-D array of singular values rather than a diagonal matrix, so we need to convert it

In [1]:
# Construct diagonal array in SVD
# NOTE: `sigma` is rebound from the vector of singular values to a square matrix.
# Run this cell exactly once after the svds cell: np.diag applied to a 2-D array
# extracts its diagonal, so a second run would turn sigma back into a 1-D vector.
sigma = np.diag(sigma)
print('Diagonal matrix: \n',sigma)

In [1]:
print('Right singular matrix: \n',Vt)

In [1]:
# Rebuild the rating table from the three SVD factors: U . sigma . Vt
all_user_predicted_ratings = U.dot(sigma).dot(Vt)
# Label the predictions with the same product columns as the input table.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=pivot_df.columns)
preds_df.head()

#### Function to Recommend the items to the users with the highest predicted ratings

In [1]:

def recommend_items(userID, pivot_df, preds_df, num_recommendations):
    """Print the top predicted items that a user has not rated yet.

    Parameters
    ----------
    userID : int
        1-based position of the user's row in ``pivot_df`` / ``preds_df``
        (row ``userID - 1`` is read from both frames).
    pivot_df : pandas.DataFrame
        Actual ratings, one row per user and one column per item; a value of 0
        is treated as "not rated".
    preds_df : pandas.DataFrame
        Predicted ratings with the same row order and item columns as ``pivot_df``.
    num_recommendations : int
        Number of rows to print.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    IndexError
        If ``userID`` is not in ``1..len(pivot_df)``. Without this check a value
        of 0 or below would be turned into a negative position and silently
        return the recommendations of a different user.
    """
    n_users = len(pivot_df)
    if not 1 <= userID <= n_users:
        raise IndexError(
            'userID must be between 1 and {} (got {})'.format(n_users, userID)
        )

    # userID is 1-based, positional indexing is 0-based
    user_idx = userID - 1

    # Actual and predicted ratings of this user, each ordered from highest to lowest
    sorted_user_ratings = pivot_df.iloc[user_idx].sort_values(ascending=False)
    sorted_user_predictions = preds_df.iloc[user_idx].sort_values(ascending=False)

    # Align both series on the item labels, side by side
    temp = pd.concat([sorted_user_ratings, sorted_user_predictions], axis=1)
    temp.index.name = 'Recommended Items'
    temp.columns = ['user_ratings', 'user_predictions']

    # Only items the user has not rated (stored as 0) are candidates
    temp = temp.loc[temp.user_ratings == 0]
    temp = temp.sort_values('user_predictions', ascending=False)
    print('\nBelow are the recommended items for user(user_id = {}):\n'.format(userID))
    print(temp.head(num_recommendations))

In [1]:
userID = 50
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)

In [1]:
userID = 5
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)

In [1]:
userID = 8
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)

##### Since this is a collaborative recommender model, each of the three users is given different recommendations based on that user's past behaviour.


### Evaluation of the Collaborative Recommendation Model

In [1]:
final_ratings_matrix.head()

#### Average actual rating for each item


In [1]:
final_ratings_matrix.mean().head()

#### Predicted ratings 


In [1]:
preds_df.head()

#### Average Predicted rating for each item


In [1]:

preds_df.mean().head()

In [1]:
# Per-item column means of the actual and the predicted rating tables, side by side.
rmse_df = pd.concat(
    [final_ratings_matrix.mean(), preds_df.mean()],
    axis=1,
    keys=['Avg_actual_ratings', 'Avg_predicted_ratings'],
)
print(rmse_df.shape)
# Running integer position of each item.
rmse_df['item_index'] = np.arange(len(rmse_df))
rmse_df.head()

#### RMSE of the Collaborative Model

In [1]:
# Root-mean-square difference between the two per-item average columns of rmse_df.
# NOTE(review): this compares item-level averages rather than individual
# (user, item) ratings - confirm this is the intended evaluation metric.
RMSE = round(
    (rmse_df['Avg_actual_ratings'] - rmse_df['Avg_predicted_ratings']).pow(2).mean() ** 0.5,
    5,
)
print('\nRMSE SVD Model = {} \n'.format(RMSE))

#### Enter 'userID' and 'num_recommendations' for the user #


In [1]:
userID = 9
num_recommendations = 5
recommend_items(userID, pivot_df, preds_df, num_recommendations)


The popularity-based recommender is a non-personalised system: its recommendations are based on frequency counts, which may not suit an individual user. The difference is visible above for the example user IDs: the popularity-based model recommended the same set of 5 products to every user, whereas the collaborative filtering model recommended an entirely different list to each user based on that user's past purchase history.

Model-based Collaborative Filtering is a personalised recommender system, the recommendations are based on the past behavior of the user and it is not dependent on any additional information.