In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split 
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from IPython.display import display

In [None]:

# Read the ratings file.
# The CSV is decoded as latin-1. The original call had a stray backslash before
# the `encoding` keyword, which is a SyntaxError in Python.
ratings = pd.read_csv(
    'https://raw.githubusercontent.com/pphoebelu/input/product_ratings_final.csv',
    encoding='latin-1',
)
# Show a fixed random sample of five rows as a quick sanity check.
display(ratings.sample(n=5, random_state=42))

In [None]:
# prepare the data

def apply_pivot(df, fillby=None):
    """Pivot a long ratings frame into a userId x prod_name matrix of ratings.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'userId', 'prod_name' and 'rating'.
    fillby : scalar, optional
        When given (anything other than None), missing cells of the pivoted
        matrix are replaced with this value. When None, missing cells stay NaN.

    Returns
    -------
    DataFrame
        One row per userId and one column per prod_name. Cells are aggregated
        with pivot_table's default aggregation.
    """
    user_product_matrix = df.pivot_table(
        index='userId', columns='prod_name', values='rating'
    )
    if fillby is None:
        return user_product_matrix
    return user_product_matrix.fillna(fillby)

In [None]:

#Dividing the dataset into train and test



train, test = train_test_split(ratings, test_size=0.30, random_state=42)
# Keep only the test rows whose user also appears in the training split.
test = test.loc[test.userId.isin(train.userId)]

# User x product rating matrices; cells without a rating are filled with 0.
df_train_pivot = apply_pivot(train, fillby=0)
df_test_pivot = apply_pivot(test, fillby=0)

# Indicator matrices for train and test.

# Train: 0 where a rating >= 1 exists, 1 everywhere else (unrated cells are
# filled with 1).
train_indicator = np.where(train['rating'].ge(1), 0, 1)
dummy_train = apply_pivot(train.assign(rating=train_indicator), fillby=1)

# Test: the opposite convention, 1 where a rating >= 1 exists and 0 everywhere
# else (unrated cells are filled with 0).
test_indicator = np.where(test['rating'].ge(1), 1, 0)
dummy_test = apply_pivot(test.assign(rating=test_indicator), fillby=0)

In [None]:
# Mean-centre every user's ratings. The mean is taken only over the ratings a
# user actually gave (NaN-aware); using the zero-filled matrix would inflate
# the denominator of the mean.
train_pivot_raw = apply_pivot(df = train)   # pivot once, reuse below
mean = np.nanmean(train_pivot_raw, axis = 1)
df_train_subtracted = train_pivot_raw.sub(mean, axis = 0)

# Cells the user never rated become 0 after centring.
df_train_subtracted = df_train_subtracted.fillna(0)

# User-user similarity matrix: 1 - cosine distance between centred rating rows.
user_correlation = 1 - pairwise_distances(df_train_subtracted, metric='cosine')

# Replace any NaN similarity with 0 so it contributes nothing downstream.
user_correlation[np.isnan(user_correlation)] = 0

# Optional: discard negative similarities.
# user_correlation[user_correlation<0] = 0

# Label the similarity matrix with user ids on both axes.
user_correlation_df = pd.DataFrame(
    user_correlation,
    index=df_train_subtracted.index,
    columns=df_train_subtracted.index.tolist(),
)

# Predicted ratings: similarity-weighted sum of the training ratings.
# The later cells read `user_final_rating`, but the notebook never assigned it,
# so it is built here.
user_predicted_ratings = pd.DataFrame(
    np.dot(user_correlation, df_train_pivot),
    index=df_train_pivot.index,
    columns=df_train_pivot.columns,
)

# dummy_train is 0 where the user already rated a product and 1 elsewhere, so
# the product zeroes out already-rated items and keeps scores for unseen ones.
user_final_rating = user_predicted_ratings * dummy_train

In [None]:
#Find Top N recommendation for User (User-User) 




def find_top_recommendations(pred_rating_df, userid, topn):
    """Return the `topn` highest-scored entries of one user's row.

    Parameters
    ----------
    pred_rating_df : DataFrame
        Score matrix; the row labelled `userid` is looked up with `.loc`.
    userid : hashable
        Row label of the user.
    topn : int
        Number of leading entries to keep after sorting scores descending.

    Returns
    -------
    DataFrame
        One row per kept entry: the former column labels as the first column
        and the scores in a column named 'predicted_ratings'.
    """
    user_scores = pred_rating_df.loc[userid]
    best_first = user_scores.sort_values(ascending=False)
    top_scores = best_first[0:topn]
    return (
        top_scores.to_frame()
        .reset_index()
        .rename(columns={userid: 'predicted_ratings'})
    )

# Ask for a user id and build that user's top-5 recommendations.
# NOTE(review): `user_final_rating` is not assigned in this cell; it must
# already exist in the kernel before this cell runs.
user_input = str(input("Enter your user id"))
recommendation_user_user = find_top_recommendations(user_final_rating, user_input, 5)
recommendation_user_user['userId'] = user_input

# Show what the user rated in the training split, highest rating first.
print("Earlier rated products by user id:{} as below".format(user_input))
display(train[train['userId']==user_input].sort_values(['rating'],ascending=False))

# Show the recommendations themselves. The original cell repeated the
# "earlier rated" output here, so the recommendations were never displayed.
print("Top 5 recommendations for user id:{} as below".format(user_input))
display(recommendation_user_user)



In [None]:

# Restrict the similarity matrix to users present in the test split (test users
# are a subset of train users after the earlier isin filter).
# Rows AND columns are taken in the order of df_test_pivot.index. np.dot below
# is positional, so the similarity columns must line up with the rows of
# df_test_pivot. The previous column selection via list(set(...)) had an
# arbitrary order and paired similarities with the wrong users' ratings.
test_user_ids = df_test_pivot.index
user_correlation_test_df = user_correlation_df.loc[test_user_ids, test_user_ids]

# Optional: discard negative similarities.
# user_correlation_test_df[user_correlation_test_df<0]=0

# Predicted test ratings: similarity-weighted sum of the test ratings, kept
# only at the cells flagged in dummy_test.
test_user_predicted_ratings = np.dot(user_correlation_test_df, df_test_pivot)
test_user_predicted_ratings = np.multiply(test_user_predicted_ratings,dummy_test)

# Turn non-positive cells into NaN so they do not contribute to the RMSE.
test_user_predicted_ratings = test_user_predicted_ratings[test_user_predicted_ratings>0]

# Rescale the remaining predictions into the 1-5 range.
scaler = MinMaxScaler(feature_range=(1, 5))
scaler.fit(test_user_predicted_ratings)
test_user_predicted_ratings = scaler.transform(test_user_predicted_ratings)

# RMSE over the cells that have a prediction.
total_non_nan = np.count_nonzero(~np.isnan(test_user_predicted_ratings))
squared_error = (apply_pivot(df = test) - test_user_predicted_ratings)**2
rmse = (squared_error.sum().sum()/total_non_nan)**0.5


print(rmse)



In [None]:
# Save model
# Pickle the final rating matrix to disk. The context manager closes the file
# handle (and flushes the data) even if pickling raises. open() does not create
# directories, so ./model must already exist.
with open('./model/user_final_rating.pkl', 'wb') as model_file:
    pickle.dump(user_final_rating, model_file)