In [1]:
# !pip install scikit-surprise
# !conda install -c conda-forge scikit-surprise

In [27]:
import pandas as pd
import re
import numpy as np
import random
from scipy.stats import uniform as sp_rand
from scipy.stats import randint



pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(palette="magma_r")

In [3]:
from surprise import Reader
from surprise import SVD, SVDpp
from surprise import KNNBasic, KNNBaseline, KNNWithZScore, KNNWithMeans
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV

In [4]:
my_seed = 55
random.seed(my_seed)
np.random.seed(my_seed)

In [5]:
df_users = pd.read_csv("../..//data/processed/users.csv")
df_items = pd.read_csv("../..//data/processed/items.csv")
df_items.item_name_lower = df_items.item_name_lower.map(lambda x : x[:25])
df_items.columns

Index(['user_id', 'brand_id', 'store_id', 'product_id', 'item_name_lower',
       'product_category_id', 'paid_price', 'size', 'on_sale', 'part_of_order',
       'top_brand', 'color', 'Category Name', 'price_bin'],
      dtype='object')

In [6]:
## Unique products
products =df_items.copy()
products.columns
products.drop(["user_id","store_id","paid_price","part_of_order","top_brand","color", "Category Name"],axis=1, inplace=True)
products.drop_duplicates(subset="product_id", inplace=True)
products.head()

Unnamed: 0,brand_id,product_id,item_name_lower,product_category_id,size,on_sale,price_bin
0,loft,62733a41,petite textured pencil pa,123 - Pants,,False,3
1,loft,7ca9f965,blurred fairisle sweater,114 - Knits,,False,3
2,loft,6273435d,lou grey eyelash sweater,114 - Knits,,False,3
3,loft,62732b46,petite plaid pencil pants,123 - Pants,,False,3
4,loft,627342fa,petite custom stretch tro,123 - Pants,,False,3


In [7]:
"""
This function creates a sparse matrix and a simple group by for user - product combinations
When store_cat is set to True, it uses product category and store as proxy for product 
Note : change this to create sparse matrices instead of returning pandas dataframes
"""
def get_user_prod_matrix(df, store_cat = True):
    df_items["store_cat"] = df_items.apply(lambda x : x["store_id"] + " - "+ str(x["product_category_id"]), axis=1)
    
    if store_cat:
        sparse = pd.pivot_table(df_items, index="user_id", columns="store_cat", values="product_id", aggfunc="count")
    else:
        sparse = pd.crosstab(index = df_items["user_id"], columns = df_items["product_id"], values="product_id", aggfunc="count")
    
    grp = sparse.stack().dropna().reset_index()       
    grp = grp.rename(columns={0:"rating"} )

    if store_cat:
        grp = grp.rename(columns = {"store_cat": "product_id"})


    return (grp, sparse)

In [8]:
x, y = get_user_prod_matrix(df_items)
print(x.head())
x2, y2 = get_user_prod_matrix(df_items, False)
print(x2.head())
unique_user_sample = x2.user_id.sample(5).values.tolist()
print(unique_user_sample)


         user_id                    product_id  rating
0  1485369350003   dsw - 210 - Boots & Booties     2.0
1  1485369350003              hm - 124 - Jeans     1.0
2  1485369350003  jcrewfactory - 111 - Blouses     1.0
3  1485369350003    jcrewfactory - 114 - Knits     1.0
4  1485369350003          loft - 111 - Blouses     5.0
         user_id product_id  rating
0  1485369350003   00799605     1.0
1  1485369350003   01c00a9f     1.0
2  1485369350003   1389d2dd     1.0
3  1485369350003   1ff87a10     1.0
4  1485369350003   23583555     1.0
[1534200663687, 1540229647916, 1530899668174, 1495117820151, 1506291233259]


In [10]:
def get_user_products(uid, df, n_count=10):
    cols_tokeep = ["product_id", "brand_id", "product_category_id", "rating","price_bin", "item_name_lower"]   
    col_names = ["Product ID", "Brand", "Category", "Times Bought","Price Bin", "Descr"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count).merge(products, on="product_id")[cols_tokeep] 
    user_top_products.columns = col_names
    return user_top_products

In [11]:
def get_user_storecats(uid, df, n_count=10):
    cols_tokeep = ["product_id", "rating"]   
    col_names = ["Store - Category", "Times Bought"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count)[cols_tokeep]
    user_top_products.columns = col_names
    return user_top_products

In [12]:
def get_n_recommendations(user_id, scores, n_recommendations=10):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "product_id", "brand_id", "product_category_id", "price_bin", "item_name_lower" ]
    col_names = ["Rating Estimate", "Product ID", "Brand", "Category", "Price Bin", "Descr"]
    temp_df = scores.sort_values("est", ascending=False).head(count).rename(columns={"iid":"product_id"}).merge(products, on="product_id")[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [13]:
def get_n_recommendations_cat(user_id, scores, n_recommendations=10):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "iid"]
    col_names = ["Rating Estimate", "Store - Category"]
    temp_df = scores.sort_values("est", ascending=False).head(count)[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [14]:
reader = Reader()

In [39]:
x2, y2 = get_user_prod_matrix(df_items, False)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6],
              'random_state':[35]
            }
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, refit=True, n_jobs=-1)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))






0.14449769341444396
0.03539348597953063
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6, 'random_state': 35}
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4, 'random_state': 35}


In [40]:
for i in unique_user_sample:
    
    print("\n\n")
    print("*" * 60)
    
    print(f"\nUser {i} known purchases:")
    print("\n====================")
    print(get_user_products(i, x2,10))
    
    
    print(f"\nTop Recommendations for the user {i}:")
    print("\n================================:")
    print(get_n_recommendations(i,scores,7))
    




************************************************************

User 1534200663687 known purchases:

  Product ID        Brand          Category  Times Bought  Price Bin  \
0   00196351         None      122 - Shorts           1.0          1   
1   34866379  fashionnova       124 - Jeans           1.0          2   
2   341c1839    fabletics  160 - Activewear           1.0          2   
3   341f5189         None   171 - Swimsuits           1.0          3   
4   34310bc6   francescas        132 - Long           1.0          2   
5   3440c403    forever21     153 - Blazers           1.0          2   
6   3456eb6f  fashionnova     111 - Blouses           1.0          1   
7   3460c83d    fabletics  160 - Activewear           1.0          1   
8   34625c80      express       123 - Pants           1.0          2   
9   34f514ae    forever21       113 - Tanks           1.0          1   

                       Descr  
0  calvin klein high waisted  
1               canopy jeans  
2           

In [41]:
x2, y2 = get_user_prod_matrix(df_items)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


param_grid = {'n_epochs': [5, 10], 'lr_all': [1e-4, 0.001, 0.002, 0.005],
              'reg_all': [0.4, 0.6],
              'random_state':[35]
             }
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))

3.1956203709635425
1.6382187325434874
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4, 'random_state': 35}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6, 'random_state': 35}


In [42]:
# unique_user_sample = x2.user_id.sample(5).values.tolist()
for i in unique_user_sample:
    
    print("\n\n")
    print("*" * 60)
    
    print(f"\nUser {i} known purchases:")
    print("\n====================")
    print(get_user_storecats(i, x2,10))
    
    
    print(f"\nTop Recommendations for the user {i}:")
    print("\n================================:")
    print(get_n_recommendations_cat(i,scores,7))




************************************************************

User 1534200663687 known purchases:

                  Store - Category  Times Bought
2033       zaful - 171 - Swimsuits          14.0
2011       poshmark - 122 - Shorts           7.0
1976  fabletics - 160 - Activewear           7.0
2005     missguided - 141 - Casual           5.0
2017      poshmark - 410 - Jewelry           4.0
1999    missguided - 111 - Blouses           4.0
1984      forever21 - 141 - Casual           4.0
1960           asos - 141 - Casual           4.0
1964            asos - 460 - Belts           4.0
1970         express - 114 - Knits           3.0

Top Recommendations for the user 1534200663687:

       Rating Estimate              Store - Category
56290         5.000000  lululemon - 160 - Activewear
56292         3.598057     nordstrom - 111 - Blouses
56557         3.563033            hm - 111 - Blouses
56310         3.482141      oldnavy - 112 - T Shirts
56222         3.273327          loft - 111 -

In [49]:
x2, y2 = get_user_prod_matrix(df_items)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


param_grid = {'n_epochs': [5, 10, 15], 'lr_all': np.linspace(1e-4,2e-1,200),
              'reg_all': np.linspace(0,1,100),
              'random_state':[35]
             }
gs = RandomizedSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1, n_iter=50)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))

3.973290305609814
2.87181722681501
{'n_epochs': 15, 'lr_all': 0.031240201005025125, 'reg_all': 0.6767676767676768, 'random_state': 35}
{'n_epochs': 5, 'lr_all': 0.026217587939698493, 'reg_all': 0.08080808080808081, 'random_state': 35}


In [50]:
# unique_user_sample = x2.user_id.sample(5).values.tolist()
for i in unique_user_sample:
    
    print("\n\n")
    print("*" * 60)
    
    print(f"\nUser {i} known purchases:")
    print("\n====================")
    print(get_user_storecats(i, x2,10))
    
    
    print(f"\nTop Recommendations for the user {i}:")
    print("\n================================:")
    print(get_n_recommendations_cat(i,scores,7))




************************************************************

User 1534200663687 known purchases:

                  Store - Category  Times Bought
2033       zaful - 171 - Swimsuits          14.0
2011       poshmark - 122 - Shorts           7.0
1976  fabletics - 160 - Activewear           7.0
2005     missguided - 141 - Casual           5.0
2017      poshmark - 410 - Jewelry           4.0
1999    missguided - 111 - Blouses           4.0
1984      forever21 - 141 - Casual           4.0
1960           asos - 141 - Casual           4.0
1964            asos - 460 - Belts           4.0
1970         express - 114 - Knits           3.0

Top Recommendations for the user 1534200663687:

       Rating Estimate                 Store - Category
56218                5      dsw - 210 - Boots & Booties
57072                5  bananarepublic - 112 - T Shirts
57090                5            jcrew - 152 - Jackets
57089                5      fabletics - 340 - Backpacks
57088                5       

In [15]:
x2, y2 = get_user_prod_matrix(df_items)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)

def trainSVD(data):
    param_grid = {'n_epochs': [5, 10, 15], 'lr_all': np.linspace(1e-4,2e-1,200),
              'reg_all': np.linspace(0,1,100),
              'random_state':[35]
             }
    
    gs = RandomizedSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1, n_iter=50)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])
    print(gs.best_score['mae'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
    print(gs.best_params['mae'])

    train_set = data.build_full_trainset()
    all_set = train_set.build_testset()
    anti_set = train_set.build_anti_testset()
    all_scores = pd.DataFrame(gs.test(all_set))
    anti_scores = pd.DataFrame(gs.test(anti_set))
    return (all_scores, anti_scores)

all_s, anti_s = trainSVD(data)

3.2029747788343705
1.5996091107450336
{'n_epochs': 15, 'lr_all': 0.011149748743718593, 'reg_all': 0.25252525252525254, 'random_state': 35}
{'n_epochs': 10, 'lr_all': 0.014163316582914573, 'reg_all': 0.4444444444444445, 'random_state': 35}


In [54]:
# unique_user_sample = x2.user_id.sample(5).values.tolist()
for i in unique_user_sample:
    
    print("\n\n")
    print("*" * 60)
    
    print(f"\nUser {i} known purchases:")
    print("\n====================")
    print(get_user_storecats(i, x2,10))
    
    
    print(f"\nTop Recommendations for the user {i}:")
    print("\n================================:")
    print(get_n_recommendations_cat(i,anti_s,7))




************************************************************

User 1534200663687 known purchases:

                  Store - Category  Times Bought
2033       zaful - 171 - Swimsuits          14.0
2011       poshmark - 122 - Shorts           7.0
1976  fabletics - 160 - Activewear           7.0
2005     missguided - 141 - Casual           5.0
2017      poshmark - 410 - Jewelry           4.0
1999    missguided - 111 - Blouses           4.0
1984      forever21 - 141 - Casual           4.0
1960           asos - 141 - Casual           4.0
1964            asos - 460 - Belts           4.0
1970         express - 114 - Knits           3.0

Top Recommendations for the user 1534200663687:

       Rating Estimate                           Store - Category
56290         5.000000               lululemon - 160 - Activewear
56245         4.499094                 athleta - 160 - Activewear
56557         3.962084                         hm - 111 - Blouses
56238         3.932637               rentther

In [55]:
# unique_user_sample = x2.user_id.sample(5).values.tolist()
for i in unique_user_sample:
    
    print("\n\n")
    print("*" * 60)
    
    print(f"\nUser {i} known purchases:")
    print("\n====================")
    print(get_user_storecats(i, x2,10))
    
    
    print(f"\nTop Recommendations for the user {i}:")
    print("\n================================:")
    print(get_n_recommendations_cat(i,all_s,7))




************************************************************

User 1534200663687 known purchases:

                  Store - Category  Times Bought
2033       zaful - 171 - Swimsuits          14.0
2011       poshmark - 122 - Shorts           7.0
1976  fabletics - 160 - Activewear           7.0
2005     missguided - 141 - Casual           5.0
2017      poshmark - 410 - Jewelry           4.0
1999    missguided - 111 - Blouses           4.0
1984      forever21 - 141 - Casual           4.0
1960           asos - 141 - Casual           4.0
1964            asos - 460 - Belts           4.0
1970         express - 114 - Knits           3.0

Top Recommendations for the user 1534200663687:

      Rating Estimate              Store - Category
2033         5.000000       zaful - 171 - Swimsuits
1976         5.000000  fabletics - 160 - Activewear
1960         4.418134           asos - 141 - Casual
2011         3.677708       poshmark - 122 - Shorts
1984         3.114829      forever21 - 141 - Casu

In [None]:
x2, y2 = get_user_prod_matrix(df_items)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


param_grid = {'n_epochs': [5, 10, 15], 'lr_all': np.linspace(1e-4,2e-1,200),
              'reg_all': np.linspace(0,1,100),
              'random_state':[35]
             }
gs = RandomizedSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1, n_iter=50)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))

In [42]:
x2.head()

Unnamed: 0,user_id,product_id,rating
0,1485369350003,dsw - 210 - Boots & Booties,2.0
1,1485369350003,hm - 124 - Jeans,1.0
2,1485369350003,jcrewfactory - 111 - Blouses,1.0
3,1485369350003,jcrewfactory - 114 - Knits,1.0
4,1485369350003,loft - 111 - Blouses,5.0


In [41]:
x2, y2 = get_user_prod_matrix(df_items)
reader2 = Reader()
data = Dataset.load_from_df(x2,reader)


# param_grid = {'n_epochs': [5, 10, 15], 'lr_all': np.linspace(1e-4,2e-1,200),
#               'reg_all': np.linspace(0,1,100),
#               'random_state':[35]
#              }

param_grid = {'k': [1,3,5,10,15,20,25,30,35,40,45,50],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [1,3,5,10,15,20,25,30,35,40,45,50],
                              'user_based': [False, True]},
              }
gs = RandomizedSearchCV(KNNBasic, param_grid,  measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1, n_iter=50)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

scoring_set = data.build_full_trainset().build_anti_testset()
scores = pd.DataFrame(gs.test(scoring_set))



Computing the cosine similarity matrix...
Done computing similarity matrix.
3.2090959510215398
1.6580749748693095
{'k': 25, 'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}
{'k': 20, 'sim_options': {'name': 'cosine', 'min_support': 5, 'user_based': False}}


In [43]:
scores.head()

Unnamed: 0,uid,iid,r_ui,est,details
0,1485369350003,abercrombiefitch - 111 - Blouses,2.333227,2.333227,"{'was_impossible': True, 'reason': 'Not enough..."
1,1485369350003,anthropologie - 141 - Casual,2.333227,4.814498,"{'actual_k': 8, 'was_impossible': False}"
2,1485369350003,asos - 111 - Blouses,2.333227,2.047616,"{'actual_k': 2, 'was_impossible': False}"
3,1485369350003,asos - 430 - Scarves,2.333227,2.333227,"{'was_impossible': True, 'reason': 'Not enough..."
4,1485369350003,athleta - 160 - Activewear,2.333227,2.17913,"{'actual_k': 2, 'was_impossible': False}"
