In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the ratings CSV; the column names are supplied explicitly through `names`.
ratings_columns = ['userid', 'productid', 'rating', 'timestamp']
data_set = pd.read_csv(
    "/kaggle/input/amazon-product-reviews/ratings_Electronics (1).csv",
    names=ratings_columns,
)
# Column dtypes / memory usage, then (rows, columns) as the cell output.
data_set.info()
data_set.shape

# **EDA**

In [None]:
# Work on a 10% sample of the ratings, without the timestamp column.
# A fixed random_state keeps the sample -- and every figure quoted in the
# markdown below -- stable across "Restart & Run All".
data_set_buffer = data_set.drop('timestamp', axis=1).sample(frac=0.1, random_state=0)
# Release the full frame; it is not referenced again after this cell.
del data_set

In [None]:
# Preview the first five rows, transposed so that each column is shown as a row.
data_set_buffer.head().T

In [None]:
# Count missing values per column.
# isnull() is an alias of isna(), so a single call is enough; printing both
# only repeated the same table.
print(data_set_buffer.isna().sum())

In [None]:
# Summary statistics from describe(), transposed so each described column is one row.
data_set_buffer.describe().T

In [None]:
# Summary statistics of the rating column alone.
data_set_buffer['rating'].describe()

In [None]:
# Number of ratings given by each user, most active users first.
data_set_buffer.groupby('userid')['rating'].count().sort_values(ascending=False)

In [None]:
# Plot styling for the figures below: "white" style with the "tab10" palette.
sns.set(style="white", palette="tab10", color_codes=True)

In [None]:
# Distribution of ratings, with each bar annotated by its share of all ratings.
total_ratings = len(data_set_buffer)
ax = sns.countplot(data=data_set_buffer, x='rating')
# The y-axis spans the full row count so bar heights read as a share of the total.
ax.set_ylim(0, total_ratings)
# No manual x-limits: countplot draws the categories at positions 0..k-1, so
# the former set_xlim(0, 5) clipped the first bar and left an empty slot on
# the right.
ax.set(title='Distribution of ratings', xlabel='rating', ylabel='number of ratings')
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.,
            height + 3,
            # One decimal place; the bare '{:%}' spec printed six decimals.
            '{:.1%}'.format(height / float(total_ratings)),
            ha="center")
plt.show()

**Observation: about 56% of the ratings are 5, so 5 is by far the most common rating.**

In [None]:
# Keep only the products that received at least MIN_RATINGS_PER_PRODUCT ratings.
# transform('count') broadcasts each product's rating count back onto its rows,
# which selects the same rows as groupby(...).filter(lambda ...) without calling
# a Python function once per product.
MIN_RATINGS_PER_PRODUCT = 50
ratings_per_product = data_set_buffer.groupby("productid")['rating'].transform('count')
data_set_buffer_with_threshold50 = data_set_buffer[ratings_per_product >= MIN_RATINGS_PER_PRODUCT]
# Release the unfiltered sample; it is not referenced again after this cell.
del data_set_buffer

**Keep only the products that have received 50 or more ratings.**

In [None]:
# Number of ratings per remaining product, most-rated products first.
data_set_buffer_with_threshold50.groupby('productid')['rating'].count().sort_values(ascending=False)

# **Build Popularity Recommender model**
Our definition of popularity: among the products that meet the basic criterion of having been rated by at least 50 users, the most popular are those with the highest average rating.

In [None]:
# Imported under an alias: surprise.model_selection also provides a function
# named train_test_split (imported in the collaborative-filtering section), and
# sharing one name would make the binding depend on which cell ran last.
from sklearn.model_selection import train_test_split as sklearn_train_test_split

# 70/30 train/test split with a fixed seed for reproducibility.
train_data, test_data = sklearn_train_test_split(
    data_set_buffer_with_threshold50, test_size=0.3, random_state=0
)
train_data.head()

In [None]:
# Number of user ids recorded for each product in the training data,
# one row per product, stored in the 'noofusers' column.
train_data_grouped = (
    train_data.groupby('productid')['userid']
    .count()
    .reset_index(name='noofusers')
)
train_data_grouped.head()

In [None]:
# Sum of all ratings each product received in the training data,
# one row per product, stored in the 'ratingsum' column.
train_data_grouped_rating = (
    train_data.groupby('productid')['rating']
    .sum()
    .reset_index(name='ratingsum')
)
train_data_grouped_rating.head()

In [None]:
# Number of user ids recorded for each product in the training data,
# stored in the 'noofuser' column; this is the denominator used for the
# average rating computed further down.
train_data_grouped_users = (
    train_data.groupby('productid')['userid']
    .count()
    .reset_index(name='noofuser')
)
train_data_grouped_users.head()

In [None]:
# Join the per-product rating sums with the per-product user counts on productid.
train_data_merged_grouped = train_data_grouped_rating.merge(
    train_data_grouped_users,
    on='productid',
)
train_data_merged_grouped.head()

In [None]:
# Average rating per product = sum of its ratings / number of ratings.
train_data_merged_grouped['averagerating'] = train_data_merged_grouped['ratingsum'].div(
    train_data_merged_grouped['noofuser']
)
train_data_merged_grouped.head()

In [None]:
# All products ranked by average rating, highest first.
train_data_merged_grouped.sort_values('averagerating',ascending=False)

# TOP 5 popular products

In [None]:
# Top 5 popular products: the five rows with the highest average rating.
train_data_merged_grouped.sort_values('averagerating',ascending=False).head(5)

In [None]:
# Drop the popularity-model intermediates that are no longer needed.
del train_data_grouped, train_data_grouped_rating, train_data_merged_grouped

# Collaborative Filtering model.

In [None]:
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [None]:
# Load the dataframe into surprise.
# Observation: the full frame caused a memory error, so only 10% of the
# thresholded frame is kept here (that frame itself came from a 10% sample of
# the raw data).
# NOTE: this rebinds data_set_buffer_with_threshold50, so re-running the cell
# shrinks the data again; the SVD section later samples from this reduced frame.
# Fixed seeds make both the subsample and the train/test split reproducible.
data_set_buffer_with_threshold50 = data_set_buffer_with_threshold50.sample(frac=0.1, random_state=0)
data = Dataset.load_from_df(data_set_buffer_with_threshold50, Reader(rating_scale=(1, 5)))
trainset, testset = train_test_split(data, test_size=.30, random_state=0)

In [None]:
# User-based KNNWithMeans model (k=5, pearson_baseline similarity).
# 'user_based' switches between user-based (True) and item-based (False) filtering.
user_sim_options = {'name': 'pearson_baseline', 'user_based': True}
userusercollaborativefiltering = KNNWithMeans(k=5, sim_options=user_sim_options)
userusercollaborativefiltering.fit(trainset)

In [None]:
# Number of users in the surprise training set.
trainset.n_users

In [None]:
# Predict ratings for the held-out test set with the user-based model.
test_pred = userusercollaborativefiltering.test(testset)

In [None]:
# RMSE of the user-based model's predictions on the test set.
print("User-based Model : Test Set RMSE score")
accuracy.rmse(test_pred, verbose=True)

In [None]:
# Item-based KNNWithMeans model: same settings as the user-based one,
# with 'user_based' set to False.
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}
itembasedcollaborativefiltering = KNNWithMeans(k=5, sim_options=item_sim_options)
itembasedcollaborativefiltering.fit(trainset)

In [None]:
# Predict ratings for the held-out test set with the item-based model.
test_pred_I = itembasedcollaborativefiltering.test(testset)

In [None]:
# RMSE of the item-based model's predictions on the test set.
# (Label typo fixed: "Irem-based" -> "Item-based".)
print("Item-based Model : Test Set RMSE score")
accuracy.rmse(test_pred_I, verbose=True)

In [None]:
# Drop the collaborative-filtering objects that are no longer needed.
del test_pred_I, trainset, data

**Recommend top 5 products to every user**

# **SVD or Matrix Factorization**

In [None]:
from collections import defaultdict
from surprise import SVD

In [None]:
# Take a 1% subsample of the (already reduced) ratings frame for the SVD model
# and load it into surprise. The fixed seed makes the subsample reproducible.
# The DataFrame and the surprise Dataset get separate names so that
# dataset_svd always refers to the Dataset.
svd_sample = data_set_buffer_with_threshold50.sample(frac=0.01, random_state=0)
dataset_svd = Dataset.load_from_df(svd_sample, Reader(rating_scale=(1, 5)))

In [None]:
# Train on every rating in the subsample (no hold-out split for this model).
trainset = dataset_svd.build_full_trainset()
# A fixed random_state makes the fitted model, and hence the recommendations
# derived from it, reproducible across runs.
svd_algo = SVD(random_state=0)
svd_algo.fit(trainset)

In [None]:
# Build the list of all (user, item) pairs that are NOT in the training set;
# these are the pairs to predict ratings for.
testset = trainset.build_anti_testset()

In [None]:
# Estimate a rating for every pair in the anti-testset with the fitted SVD model.
predictions = svd_algo.test(testset)

In [None]:
def get_top_n_recommendations(reccomemndations, n=5):
    """Return the ``n`` highest-estimated items for every user.

    Args:
        reccomemndations: iterable of 5-tuples
            ``(user id, item id, true rating, estimated rating, extra)``.
            Only the user id, item id and estimated rating are used.
        n: number of items to keep per user (default 5).

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimated rating)`` pairs, ordered from the highest
        to the lowest estimate and cut to the first ``n`` entries.
    """
    # Bucket every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _extra in reccomemndations:
        per_user[user_id].append((item_id, estimate))

    # Rank each user's bucket by estimate (descending) and keep the first n.
    top_n = defaultdict(list)
    for user_id, scored_items in per_user.items():
        ranked = sorted(scored_items, key=lambda pair: pair[1], reverse=True)
        top_n[user_id] = ranked[:n]

    return top_n

In [None]:
# Keep the five highest-estimated products for every user.
top_5 = get_top_n_recommendations(predictions, n=5)

### Top 5 products for each user:

In [None]:
# Print each user id followed by the ids of its recommended products.
for user_id, recommended in top_5.items():
    product_ids = [product_id for product_id, _estimate in recommended]
    print(user_id, product_ids)

**Observation :**
1. We first carried out a basic EDA, which showed that a large share of the ratings are 5, and we kept only the products that were rated by at least 50 users.
2. We built a popularity-based recommender model that recommends the top 5 products; it suits the cold-start case, where no user information or insight is available.
3. We developed collaborative filtering models that recommend products based on similar users (user-user) and on similar products (item-item). For similarity we used Pearson correlation (the `pearson_baseline` option), but cosine similarity could also be used to measure the distance between our feature vectors.
4. We also used matrix factorization, i.e. SVD (Singular Value Decomposition), to build a model that recommends a set of 5 products to each individual user.

Note: I am not sure whether apriori or market basket analysis can be used to recommend products here; please let me know if it can. My understanding is that the input data does not contain the baskets from which support or lift would be computed. Please upvote if you found this helpful, and happy learning!
    