In [None]:
# !pip install scikit-surprise
# !conda install -c conda-forge scikit-surprise

In [2]:
import pandas as pd
import re
import numpy as np
import random


pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(palette="magma_r")

In [9]:
from surprise import Reader
from surprise import SVD, SVDpp
from surprise import KNNBasic
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV

In [3]:
my_seed = 55
random.seed(my_seed)
np.random.seed(my_seed)

In [149]:
df_users = pd.read_csv("../..//data/processed/users.csv")
df_items = pd.read_csv("../..//data/processed/items.csv")
df_items.item_name_lower = df_items.item_name_lower.map(lambda x : x[:25])
df_items.columns

Index(['user_id', 'brand_id', 'store_id', 'product_id', 'item_name_lower',
       'product_category_id', 'paid_price', 'size', 'on_sale', 'part_of_order',
       'top_brand', 'color', 'Category Name', 'price_bin'],
      dtype='object')

In [150]:
## Unique products
products =df_items.copy()
products.columns
products.drop(["user_id","store_id","paid_price","part_of_order","top_brand","color", "Category Name"],axis=1, inplace=True)
products.drop_duplicates(subset="product_id", inplace=True)
products.head()

Unnamed: 0,brand_id,product_id,item_name_lower,product_category_id,size,on_sale,price_bin
0,loft,62733a41,petite textured pencil pa,123 - Pants,,False,3
1,loft,7ca9f965,blurred fairisle sweater,114 - Knits,,False,3
2,loft,6273435d,lou grey eyelash sweater,114 - Knits,,False,3
3,loft,62732b46,petite plaid pencil pants,123 - Pants,,False,3
4,loft,627342fa,petite custom stretch tro,123 - Pants,,False,3


In [181]:
"""
This function creates a sparse matrix and a simple group by for user - product combinations
When store_cat is set to True, it uses product category and store as proxy for product 
Note : change this to create sparse matrices instead of returning pandas dataframes
"""
def get_user_prod_matrix(df, store_cat = True):
    df_items["store_cat"] = df_items.apply(lambda x : x["store_id"] + " - "+ str(x["product_category_id"]), axis=1)
    
    if store_cat:
        sparse = pd.pivot_table(df_items, index="user_id", columns="store_cat", values="product_id", aggfunc="count")
    else:
        sparse = pd.crosstab(index = df_items["user_id"], columns = df_items["product_id"], values="product_id", aggfunc="count")
    
    grp = sparse.stack().dropna().reset_index()       
    grp = grp.rename(columns={0:"rating"} )

    if store_cat:
        grp = grp.rename(columns = {"store_cat": "product_id"})


    return (grp, sparse)

In [206]:
x, y = get_user_prod_matrix(df_items)
print(x.head())
x2, y2 = get_user_prod_matrix(df_items, False)
print(x2.head())
unique_user_sample = x2.user_id.sample(5).values.tolist()
print(unique_user_sample)


         user_id                    product_id  rating
0  1485369350003   dsw - 210 - Boots & Booties     2.0
1  1485369350003              hm - 124 - Jeans     1.0
2  1485369350003  jcrewfactory - 111 - Blouses     1.0
3  1485369350003    jcrewfactory - 114 - Knits     1.0
4  1485369350003          loft - 111 - Blouses     5.0
         user_id product_id  rating
0  1485369350003   00799605     1.0
1  1485369350003   01c00a9f     1.0
2  1485369350003   1389d2dd     1.0
3  1485369350003   1ff87a10     1.0
4  1485369350003   23583555     1.0
[1532180138666, 1542061238350, 1495121490933, 1514587641548, 1497978280616]


In [176]:
# # x=x.fillna(0)
# reader = Reader()
# data = Dataset.load_from_df(x,reader)

# tset1 = data.build_full_trainset()
# anti1 = tset1.build_anti_testset()

# trainset, testset = train_test_split(data, test_size=.25, random_state=10)
# algo = SVD(n_factors=5,random_state=10)
# algo = algo.fit(trainset)
# predict = algo.test(testset)
# accuracy.rmse(predict)

In [175]:
# cv = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# algo.test(testset)

In [174]:
# # x2=x2.fillna(0)
# reader2 = Reader()
# data2 = Dataset.load_from_df(x2,reader)
# trainset2, testset2 = train_test_split(data2, test_size=.25, random_state=10)
# algo2 = SVD(n_factors=5, random_state=10)
# algo2 = algo.fit(trainset2)
# predict2 = algo.test(testset2)
# accuracy.rmse(predict2)

In [173]:
# cv2 = cross_validate(algo2, data2, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# algo.test(testset2)

In [None]:
cv

In [None]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, refit=True)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

gs.test(testset)

In [None]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, refit="mae")

gs.fit(data2)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])



# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])



gs.test(testset2)

In [None]:
gs.best_estimator["rmse"]

In [None]:
gs.best_estimator["mae"]

In [None]:
gs.best_score

In [None]:
gs.best_params

In [None]:
gs.best_estimator["rmse"].fit(trainset2).test(testset2)

In [None]:
gs.best_estimator["mae"].fit(trainset2).test(testset2)

In [None]:
gs.cv_results

In [172]:
# param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
#               'reg_all': [0.4, 0.6]}
# gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True, n_jobs=-1)

# gs.fit(data2)

# # best RMSE score
# print(gs.best_score['rmse'])
# print(gs.best_score['mae'])



# # combination of parameters that gave the best RMSE score
# print(gs.best_params['rmse'])
# print(gs.best_params['mae'])



# gs.test(testset2)

In [171]:
# param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
#               'reg_all': [0.4, 0.6]}
# gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, refit=True, n_jobs=-1)

# gs.fit(data2)

# # best RMSE score
# print(gs.best_score['rmse'])
# print(gs.best_score['mae'])



# # combination of parameters that gave the best RMSE score
# print(gs.best_params['rmse'])
# print(gs.best_params['mae'])



# gs.test(testset2)

In [170]:
# param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
#               'reg_all': [0.4, 0.6]}
# gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, refit=True, n_jobs=-1)

# gs.fit(data)

# # best RMSE score
# print(gs.best_score['rmse'])
# print(gs.best_score['mae'])



# # combination of parameters that gave the best RMSE score
# print(gs.best_params['rmse'])
# print(gs.best_params['mae'])



# gs.test(testset)

In [169]:
# param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
#             'reg_all': [0.4, 0.6]}
# gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True, n_jobs=-1)

# gs.fit(data)

# # best RMSE score
# print(gs.best_score['rmse'])
# print(gs.best_score['mae'])



# # combination of parameters that gave the best RMSE score
# print(gs.best_params['rmse'])
# print(gs.best_params['mae'])



# gs.test(anti1)

In [17]:
# xx = data.df

In [168]:
# xx[xx.user_id == 1485369350003]

In [167]:
# tset1.global_mean

In [20]:
# anti2 = tset1.build_anti_testset(fill=0)

In [166]:
# gs.test(anti2)

In [22]:
# test1 = tset1.build_testset()

In [165]:
# gs.test(test1)

In [154]:
def get_user_products(uid, df, n_count=10):
    cols_tokeep = ["product_id", "brand_id", "product_category_id", "rating","price_bin", "item_name_lower"]   
    col_names = ["Product ID", "Brand", "Category", "Times Bought","Price Bin", "Descr"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count).merge(products, on="product_id")[cols_tokeep] 
    user_top_products.columns = col_names
    return user_top_products

In [191]:
def get_user_storecats(uid, df, n_count=10):
    cols_tokeep = ["product_id", "rating"]   
    col_names = ["Store - Category", "Times Bought"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count)[cols_tokeep]
    user_top_products.columns = col_names
    return user_top_products

In [193]:
def get_n_recommendations(user_id, scores, n_recommendations=10):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "product_id", "brand_id", "product_category_id", "price_bin", "item_name_lower" ]
    col_names = ["Rating Estimate", "Product ID", "Brand", "Category", "Price Bin", "Descr"]
    temp_df = scores.sort_values("est", ascending=False).head(count).rename(columns={"iid":"product_id"}).merge(products, on="product_id")[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [198]:
def get_n_recommendations_cat(user_id, scores, n_recommendations=10):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "iid"]
    col_names = ["Rating Estimate", "Store - Category"]
    temp_df = scores.sort_values("est", ascending=False).head(count)[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [210]:
x2, y2 = get_user_prod_matrix(df_items, False)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6],
              'random_state':[35]
            }
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, refit=True, n_jobs=-1)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))






0.1446300446477782
0.035316332650193426
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6, 'random_state': 35}
{'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.4, 'random_state': 35}


In [211]:
for i in unique_user_sample:
    
    print("\n\n")
    print("*" * 60)
    
    print(f"\nUser {i} known purchases:")
    print("\n====================")
    print(get_user_products(i, x2,10))
    
    
    print(f"\nTop Recommendations for the user {i}:")
    print("\n================================:")
    print(get_n_recommendations(i,scores,7))
    




************************************************************

User 1532180138666 known purchases:

  Product ID Brand               Category  Times Bought  Price Bin  \
0   2a009ef7  None            123 - Pants           3.0          2   
1   2ab35afb  None  210 - Boots & Booties           3.0          3   
2   afe796e7  None            114 - Knits           2.0          3   
3   2af4b449  None           141 - Casual           2.0          2   
4   2a7d5ecc  None           141 - Casual           2.0          3   
5   7f393f00  None           141 - Casual           2.0          3   
6   2aed9634  None          111 - Blouses           2.0          2   
7   98823230  None            240 - Heels           2.0          2   
8   452a80e9  None          155 - Hoodies           2.0          3   
9   99417533  None         112 - T Shirts           2.0          2   

                       Descr  
0  wit wisdom absolution str  
1  vince camuto fileana spli  
2  halogen ruffle sleeve swe  
3  

In [212]:
x2, y2 = get_user_prod_matrix(df_items)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6],
              'random_state':[35]
             }
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))

3.256430717447799
1.6530122411887442
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4, 'random_state': 35}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6, 'random_state': 35}


In [213]:
# unique_user_sample = x2.user_id.sample(5).values.tolist()
for i in unique_user_sample:
    
    print("\n\n")
    print("*" * 60)
    
    print(f"\nUser {i} known purchases:")
    print("\n====================")
    print(get_user_storecats(i, x2,10))
    
    
    print(f"\nTop Recommendations for the user {i}:")
    print("\n================================:")
    print(get_n_recommendations_cat(i,scores,7))




************************************************************

User 1532180138666 known purchases:

                           Store - Category  Times Bought
1790              nordstrom - 111 - Blouses          24.0
1817                  oldnavy - 114 - Knits          15.0
1814                oldnavy - 111 - Blouses          12.0
1815               oldnavy - 112 - T Shirts          12.0
1747  americaneagleoutfitters - 114 - Knits          11.0
1830               poshmark - 111 - Blouses          11.0
1791             nordstrom - 112 - T Shirts          10.0
1820                 oldnavy - 141 - Casual          10.0
1799              nordstrom - 155 - Hoodies           9.0
1796               nordstrom - 141 - Casual           9.0

Top Recommendations for the user 1532180138666:

       Rating Estimate                   Store - Category
49563         5.000000       lululemon - 160 - Activewear
49539         5.000000       fabletics - 160 - Activewear
50034         4.917066           for

In [None]:
x2, y2 = get_user_prod_matrix(df_items)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6],
              'random_state':[35]
             }
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))