In [1]:
# !pip install scikit-surprise
# !conda install -c conda-forge scikit-surprise

In [2]:
import pandas as pd
import re
import numpy as np
import random
from scipy.stats import uniform as sp_rand
from scipy.stats import randint
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)


In [3]:
from surprise import Reader
from surprise import SVD, SVDpp, NMF
from surprise import KNNBasic, KNNBaseline, KNNWithZScore, KNNWithMeans
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV

In [4]:
my_seed = 55
random.seed(my_seed)
np.random.seed(my_seed)

In [5]:
df_users = pd.read_csv("../..//data/processed/users.csv")
df_items = pd.read_csv("../..//data/processed/items.csv")
df_items.item_name_lower = df_items.item_name_lower.map(lambda x : x[:25])
df_items.columns

Index(['user_id', 'brand_id', 'store_id', 'product_id', 'item_name_lower',
       'product_category_id', 'paid_price', 'size', 'on_sale', 'part_of_order',
       'top_brand', 'color', 'Category Name', 'price_bin'],
      dtype='object')

In [6]:
## Unique products
products =df_items.copy()
products.columns
products.drop(["user_id","store_id","paid_price","part_of_order","top_brand","color", "Category Name"],axis=1, inplace=True)
products.drop_duplicates(subset="product_id", inplace=True)
products.head()

Unnamed: 0,brand_id,product_id,item_name_lower,product_category_id,size,on_sale,price_bin
0,loft,62733a41,petite textured pencil pa,123 - Pants,,False,3
1,loft,7ca9f965,blurred fairisle sweater,114 - Knits,,False,3
2,loft,6273435d,lou grey eyelash sweater,114 - Knits,,False,3
3,loft,62732b46,petite plaid pencil pants,123 - Pants,,False,3
4,loft,627342fa,petite custom stretch tro,123 - Pants,,False,3


In [7]:
"""
This function creates a sparse matrix and a simple group by for user - product combinations
When store_cat is set to True, it uses product category and store as proxy for product 
Note : change this to create sparse matrices instead of returning pandas dataframes
"""
def get_user_prod_matrix(df, store_cat = True):
    df_items["store_cat"] = df_items.apply(lambda x : x["store_id"] + " - "+ str(x["product_category_id"]), axis=1)
    
    if store_cat:
        sparse = pd.pivot_table(df_items, index="user_id", columns="store_cat", values="product_id", aggfunc="count")
    else:
        sparse = pd.crosstab(index = df_items["user_id"], columns = df_items["product_id"], values="product_id", aggfunc="count")
    
    grp = sparse.stack().dropna().reset_index()       
    grp = grp.rename(columns={0:"rating"} )

    if store_cat:
        grp = grp.rename(columns = {"store_cat": "product_id"})


    return (grp, sparse)

In [8]:
x1, y1 = get_user_prod_matrix(df_items)
print(x1.head())
x2, y2 = get_user_prod_matrix(df_items, False)
print(x2.head())
unique_user_sample = x2.user_id.sample(5).values.tolist()
print(unique_user_sample)


         user_id                    product_id  rating
0  1485369350003   dsw - 210 - Boots & Booties     2.0
1  1485369350003              hm - 124 - Jeans     1.0
2  1485369350003  jcrewfactory - 111 - Blouses     1.0
3  1485369350003    jcrewfactory - 114 - Knits     1.0
4  1485369350003          loft - 111 - Blouses     5.0
         user_id product_id  rating
0  1485369350003   00799605     1.0
1  1485369350003   01c00a9f     1.0
2  1485369350003   1389d2dd     1.0
3  1485369350003   1ff87a10     1.0
4  1485369350003   23583555     1.0
[1534200663687, 1540229647916, 1530899668174, 1495117820151, 1506291233259]


In [9]:
x1["base"] = "store_cat"
x2["base"] = "product"

full_ratings = x1.append(x2).reset_index(drop=True)
full_ratings.to_csv("../..//data/processed/all_ratings.csv",index=False)
x1.drop("base", axis=1, inplace=True)
x2.drop("base", axis=1, inplace=True)

In [10]:
print(full_ratings.head())
print(full_ratings.tail())

         user_id                    product_id  rating       base
0  1485369350003   dsw - 210 - Boots & Booties     2.0  store_cat
1  1485369350003              hm - 124 - Jeans     1.0  store_cat
2  1485369350003  jcrewfactory - 111 - Blouses     1.0  store_cat
3  1485369350003    jcrewfactory - 114 - Knits     1.0  store_cat
4  1485369350003          loft - 111 - Blouses     5.0  store_cat
             user_id product_id  rating     base
10336  1550516903728   94528138     1.0  product
10337  1550516903728   94944126     1.0  product
10338  1550516903728   98239982     1.0  product
10339  1550516903728   b8d0f825     1.0  product
10340  1550516903728   e4abcc38     1.0  product


In [11]:
def get_user_products(uid, df, n_count=10):
    cols_tokeep = ["product_id", "brand_id", "product_category_id", "rating","price_bin", "item_name_lower"]   
    col_names = ["Product ID", "Brand", "Category", "Times Bought","Price Bin", "Descr"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count).merge(products, on="product_id")[cols_tokeep] 
    user_top_products.columns = col_names
    return user_top_products

In [12]:
def get_user_storecats(uid, df, n_count=10):
    cols_tokeep = ["product_id", "rating"]   
    col_names = ["Store - Category", "Times Bought"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count)[cols_tokeep]
    user_top_products.columns = col_names
    return user_top_products

In [13]:
def get_n_recommendations(user_id, scores, n_recommendations=7):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "product_id", "brand_id", "product_category_id", "price_bin", "item_name_lower" ]
    col_names = ["Rating Estimate", "Product ID", "Brand", "Category", "Price Bin", "Descr"]
    temp_df = scores.sort_values("est", ascending=False).head(count).rename(columns={"iid":"product_id"}).merge(products, on="product_id")[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [14]:
def get_n_recommendations_cat(user_id, scores, n_recommendations=7):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "iid"]
    col_names = ["Rating Estimate", "Store - Category"]
    temp_df = scores.sort_values("est", ascending=False).head(count)[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [15]:
reader = Reader()
data1 = Dataset.load_from_df(x1,reader)
data2 = Dataset.load_from_df(x2,reader)

import re

def trainSurpriseModel(data, base, algo="SVD", search_iteration = 2):

    KNN_based = [KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline]
    SVD_based = [SVD, SVDpp]
    
    if (algo in SVD_based):
        param_grid = {
            'n_factors': [5,10,15,20],
            'n_epochs': [10, 20, 40, 80], 
            'lr_all': np.linspace(1e-4,2e-1,200),
            'reg_all': np.linspace(0,1,100),
            'random_state':[35]
        }
    
    if (algo == NMF):
        param_grid = {
            'n_factors': [5,10,15,20],
            'n_epochs': [10, 20, 40, 80], 
            'random_state':[35]
        }

    if (algo in KNN_based):
        param_grid = {
            'k': [1,3,5,10,15,20,25,30,35,40,45,50],
            'sim_options': {
                'name': ['msd', 'cosine'],
                'min_support': [1,3,5,10,15,20,25,30,35,40,45,50],
                'user_based': [False, True]},
            }
       
    print(param_grid)
    gs = RandomizedSearchCV(algo, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1, n_iter=search_iteration)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])
    print(gs.best_score['mae'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
    print(gs.best_params['mae'])

    train_set = data.build_full_trainset()
    all_set = train_set.build_testset()
    anti_set = train_set.build_anti_testset()
    all_scores = pd.DataFrame(gs.test(all_set))
    anti_scores = pd.DataFrame(gs.test(anti_set))
    
    all_scores["algorithm"] = algo.__name__
    anti_scores["algorithm"] = algo.__name__    
    
    all_scores["score_type"] = "known"
    anti_scores["score_type"] = "anti"        

    all_scores["base"] = base
    anti_scores["base"] = base        


    return (all_scores, anti_scores)

In [38]:
algorithms = [SVD, SVDpp, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline]
n_iterations = 20

for algorithm in algorithms:
    print(f"\n****** Algorithm : {algorithm.__name__} ******")
    
    print("With Store - Cat Data ")
    all_s1, anti_s1 = trainSurpriseModel(data1, base="store_cat", algo=algorithm, search_iteration= n_iterations)

    print("\nWith Product Data ")
    all_s2, anti_s2 = trainSurpriseModel(data2, base="product", algo=algorithm, search_iteration= n_iterations)
    all_scores = all_s1.append(anti_s1).append(all_s2).append(anti_s2)
    all_scores.drop("details", axis=1, inplace=True)
    file_name = "../../data/processed/algo/all_reco_scores_" + algorithm.__name__ + ".csv"
    all_scores.to_csv(file_name,index=False)



****** Algorithm : SVD ******
With Store - Cat Data 
{'n_factors': [5, 10, 15, 20], 'n_epochs': [10, 20, 40, 80], 'lr_all': array([1.00000000e-04, 1.10452261e-03, 2.10904523e-03, 3.11356784e-03,
       4.11809045e-03, 5.12261307e-03, 6.12713568e-03, 7.13165829e-03,
       8.13618090e-03, 9.14070352e-03, 1.01452261e-02, 1.11497487e-02,
       1.21542714e-02, 1.31587940e-02, 1.41633166e-02, 1.51678392e-02,
       1.61723618e-02, 1.71768844e-02, 1.81814070e-02, 1.91859296e-02,
       2.01904523e-02, 2.11949749e-02, 2.21994975e-02, 2.32040201e-02,
       2.42085427e-02, 2.52130653e-02, 2.62175879e-02, 2.72221106e-02,
       2.82266332e-02, 2.92311558e-02, 3.02356784e-02, 3.12402010e-02,
       3.22447236e-02, 3.32492462e-02, 3.42537688e-02, 3.52582915e-02,
       3.62628141e-02, 3.72673367e-02, 3.82718593e-02, 3.92763819e-02,
       4.02809045e-02, 4.12854271e-02, 4.22899497e-02, 4.32944724e-02,
       4.42989950e-02, 4.53035176e-02, 4.63080402e-02, 4.73125628e-02,
       4.83170854e-02, 

0.14427013065934385
0.03542557984910844
{'n_factors': 15, 'n_epochs': 40, 'lr_all': 0.010145226130653266, 'reg_all': 0.8686868686868687, 'random_state': 35}
{'n_factors': 20, 'n_epochs': 20, 'lr_all': 0.05534874371859297, 'reg_all': 0.22222222222222224, 'random_state': 35}

****** Algorithm : SVDpp ******
With Store - Cat Data 
{'n_factors': [5, 10, 15, 20], 'n_epochs': [10, 20, 40, 80], 'lr_all': array([1.00000000e-04, 1.10452261e-03, 2.10904523e-03, 3.11356784e-03,
       4.11809045e-03, 5.12261307e-03, 6.12713568e-03, 7.13165829e-03,
       8.13618090e-03, 9.14070352e-03, 1.01452261e-02, 1.11497487e-02,
       1.21542714e-02, 1.31587940e-02, 1.41633166e-02, 1.51678392e-02,
       1.61723618e-02, 1.71768844e-02, 1.81814070e-02, 1.91859296e-02,
       2.01904523e-02, 2.11949749e-02, 2.21994975e-02, 2.32040201e-02,
       2.42085427e-02, 2.52130653e-02, 2.62175879e-02, 2.72221106e-02,
       2.82266332e-02, 2.92311558e-02, 3.02356784e-02, 3.12402010e-02,
       3.22447236e-02, 3.324924

0.14434104211976903
0.033828570064820825
{'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.00612713567839196, 'reg_all': 0.27272727272727276, 'random_state': 35}
{'n_factors': 10, 'n_epochs': 10, 'lr_all': 0.06338492462311558, 'reg_all': 0.4444444444444445, 'random_state': 35}

****** Algorithm : NMF ******
With Store - Cat Data 
{'n_factors': [5, 10, 15, 20], 'n_epochs': [10, 20, 40, 80], 'random_state': [35]}


ValueError: Cannot take a larger sample than population when 'replace=False'

In [39]:
mypath = "../../data/processed/algo/"
from os import walk

all_scores = pd.DataFrame()
for (dirpath, dirnames, filenames) in walk(mypath):
    for filename in filenames:
        print(f"reading... {filename} ")
        csv_file_name  = mypath + filename
        df = pd.read_csv(csv_file_name)
        print(f"rows read : {df.shape[0]}")
        all_scores = all_scores.append(df)
        print(f"Combined rows so far : {all_data.shape[0]}")

file_name = "../../data/processed/all_reco_scores.csv"
all_scores.to_csv(file_name,index=False)
all_scores = None


reading... all_reco_scores_SVD.csv 
rows read : 793941
Combined rows so far : 5557587
reading... all_reco_scores_KNNWithMeans.csv 
rows read : 793941
Combined rows so far : 5557587
reading... all_reco_scores_KNNBasic.csv 
rows read : 793941
Combined rows so far : 5557587
reading... all_reco_scores_KNNWithZScore.csv 
rows read : 793941
Combined rows so far : 5557587
reading... all_reco_scores_KNNBaseline.csv 
rows read : 793941
Combined rows so far : 5557587
reading... all_reco_scores_SVDpp.csv 
rows read : 793941
Combined rows so far : 5557587
reading... all_reco_scores_NMF.csv 
rows read : 793941
Combined rows so far : 5557587
