In [1]:
# !pip install scikit-surprise
# !conda install -c conda-forge scikit-surprise

In [2]:
import pandas as pd
# import re
import numpy as np
import random
from scipy.stats import uniform as sp_rand
from scipy.stats import randint
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)


import os.path
from os import path

In [3]:
from surprise import Reader
from surprise import SVD, SVDpp, NMF
from surprise import KNNBasic, KNNBaseline, KNNWithZScore, KNNWithMeans
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV


from lightfm import LightFM
from scipy.sparse import csr_matrix, coo_matrix
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split



In [4]:

def get_unique_products(df_items):
    print("\nGenerate unique products dataframe ...\n")
    products =df_items.copy()
    products.columns
    products.drop(["user_id","store_id","paid_price","part_of_order","top_brand","color", "Category Name"],axis=1, inplace=True)
    products.drop_duplicates(subset="product_id", inplace=True)
    print(products.head())
    return products

In [5]:
"""
This function creates a sparse matrix and a simple group by for user - product combinations
When store_cat is set to True, it uses product category and store as proxy for product 
Note : change this to create sparse matrices instead of returning pandas dataframes
"""
def get_user_prod_matrix(df, store_cat = True):
    df_items["store_cat"] = df_items.apply(lambda x : x["store_id"] + " - "+ str(x["product_category_id"]), axis=1)
    
    if store_cat:
        sparse = pd.pivot_table(df_items, index="user_id", columns="store_cat", values="product_id", aggfunc="count")
    else:
        sparse = pd.crosstab(index = df_items["user_id"], columns = df_items["product_id"], values="product_id", aggfunc="count")
    
    grp = sparse.stack().dropna().reset_index()       
    grp = grp.rename(columns={0:"rating"} )

    if store_cat:
        grp = grp.rename(columns = {"store_cat": "product_id"})


    return (grp, sparse)

In [6]:
def generate_interaction_files(items_data):

    print("\nGenerate interaction: user vs store-cat and user vs products...\n")
    x1, y1 = get_user_prod_matrix(items_data)
    x2, y2 = get_user_prod_matrix(items_data, False)
    # unique_user_sample = x2.user_id.sample(5).values.tolist()
    # print(unique_user_sample)

    x1["base"] = "store_cat"
    x2["base"] = "product"

    full_ratings = x1.append(x2).reset_index(drop=True)
    full_ratings.to_csv("../..//data/processed/all_ratings.csv",index=False)
    x1.drop("base", axis=1, inplace=True)
    x2.drop("base", axis=1, inplace=True) 
    return (x1,y1,x2,y2)

In [7]:
def get_user_products(uid, df, n_count=10):
    cols_tokeep = ["product_id", "brand_id", "product_category_id", "rating","price_bin", "item_name_lower"]   
    col_names = ["Product ID", "Brand", "Category", "Times Bought","Price Bin", "Descr"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count).merge(products, on="product_id")[cols_tokeep] 
    user_top_products.columns = col_names
    return user_top_products

In [8]:
def get_user_storecats(uid, df, n_count=10):
    cols_tokeep = ["product_id", "rating"]   
    col_names = ["Store - Category", "Times Bought"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count)[cols_tokeep]
    user_top_products.columns = col_names
    return user_top_products

In [9]:
def get_n_recommendations(user_id, scores, n_recommendations=7):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "product_id", "brand_id", "product_category_id", "price_bin", "item_name_lower" ]
    col_names = ["Rating Estimate", "Product ID", "Brand", "Category", "Price Bin", "Descr"]
    temp_df = scores.sort_values("est", ascending=False).head(count).rename(columns={"iid":"product_id"}).merge(products, on="product_id")[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [10]:
def get_n_recommendations_cat(user_id, scores, n_recommendations=7):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "iid"]
    col_names = ["Rating Estimate", "Store - Category"]
    temp_df = scores.sort_values("est", ascending=False).head(count)[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [11]:
def trainSurpriseModel(data, base, algo=SVD, search_iteration = 2):
    
    print(f"\n****** Training for Surprise model: {algo.__name__} with {base} data ...\n")
    
    KNN_based = [KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline]
    SVD_based = [SVD, SVDpp]
    
    if (algo in SVD_based):
        param_grid = {
            'n_factors': [5,10,15,20],
            'n_epochs': [10, 20, 40, 80], 
            'lr_all': np.linspace(1e-4,2e-1,200),
            'reg_all': np.linspace(0,1,100),
            'random_state':[35]
        }
    
    if (algo == NMF):
        param_grid = {
            'n_factors': [5,10,15,20],
            'n_epochs': [10, 20, 40, 80], 
            'random_state':[35]
        }

    if (algo in KNN_based):
        param_grid = {
            'k': [1,3,5,10,15,20,25,30,35,40,45,50],
            'sim_options': {
                'name': ['msd', 'cosine'],
                'min_support': [1,3,5,10,15,20,25,30,35,40,45,50],
                'user_based': [False, True]},
            }
       
    #print(param_grid)
    gs = RandomizedSearchCV(algo, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1, n_iter=search_iteration)
    gs.fit(data)

    # best RMSE score
    print(f"best RMSE score : {gs.best_score['rmse']}")
    print(f"best MAE score : {gs.best_score['mae']}")

    # combination of parameters that gave the best RMSE score
    print(f"best parms for RMSE score : {gs.best_params['rmse']}")
    print(f"best parms for MAE score : {gs.best_params['mae']}")

    train_set = data.build_full_trainset()
    all_set = train_set.build_testset()
    anti_set = train_set.build_anti_testset()
    all_scores = pd.DataFrame(gs.test(all_set))
    anti_scores = pd.DataFrame(gs.test(anti_set))
    
    all_scores["algorithm"] = algo.__name__
    anti_scores["algorithm"] = algo.__name__    
    
    all_scores["score_type"] = "known"
    anti_scores["score_type"] = "anti"        

    all_scores["base"] = base
    anti_scores["base"] = base        
    
    return (all_scores, anti_scores)

In [12]:
def trainLightFMModel(data, base, ratings, algo="LightFM"):
    
    print(f"\n****** Training for LightFM model: {algo} with {base} data ...\n")

    y1_sparse = coo_matrix(data.fillna(0).values, dtype = "float32")

    model = LightFM(no_components=10, loss= "warp")

    lt_train, lt_test = random_train_test_split(y1_sparse, 0.2)
    model.fit(lt_train, epochs=20)
    k = 7
    precision = precision_at_k(model, lt_train, k=5).mean()
    recall = recall_at_k(model, lt_train, k=5).mean()

    print(f"Precision@{k} :{precision}")
    print(f"Recall@{k} :{recall}")
    
    ## Refit with full data
    model.fit(y1_sparse, epochs=20)
    
    predictions = np.zeros(shape=y1_sparse.shape)
    cols = np.arange(y1_sparse.shape[1])
    for i in range(y1_sparse.shape[0]):
        predictions[i] = model.predict(i,cols)

    predictions_df = pd.DataFrame(predictions,columns=data.columns, index=data.index )
    predictions_df = predictions_df.stack().reset_index()
    predictions_df.columns = ["user_id","product_id","est"]
    predictions_df = pd.merge(predictions_df,ratings, on=["user_id","product_id"], how="left")

    predictions_df["score_type"] = "anti"
    predictions_df.loc[predictions_df.rating.notnull(), "score_type"] = "known"
    predictions_df["algorithm"] = algo
    predictions_df["base"] = base

    predictions_df.columns = ['uid', 'iid'] + predictions_df.columns.tolist()[2:]
    predictions_df = predictions_df[['uid', 'iid', 'rating', 'est', 'algorithm','score_type','base']]

    return predictions_df

In [13]:
def combine_scoring_files():
    mypath = "../../data/output/algo/"
    from os import walk

    all_scores = pd.DataFrame()
    for (dirpath, dirnames, filenames) in walk(mypath):
        for filename in filenames:
            print(f"reading... {filename} ")
            csv_file_name  = mypath + filename
            df = pd.read_csv(csv_file_name)
            print(f"rows read : {df.shape[0]}")
            all_scores = all_scores.append(df)
            print(f"Combined rows so far : {all_scores.shape[0]}")

    file_name = "../../data/output/all_reco_scores.csv"
    all_scores.to_csv(file_name,index=False)
    all_scores = None

In [14]:
def check_files_exist(data_path):
    path_to_check = data_path
    if not path.exists(path_to_check):
        raise ValueError(f"Path/file {path_to_check} does not exits")
    else:
        print(f"Valid file/path : {path_to_check}")

    path_to_check = data_path + "processed/users.csv"
    if not path.exists(path_to_check):
        raise ValueError(f"Path/file {path_to_check} does not exits")
    else:
        print(f"Valid file/path : {path_to_check}")

    path_to_check = data_path + "processed/items.csv"
    if not path.exists(path_to_check):
        raise ValueError(f"Path/file {path_to_check} does not exits")
    else:
        print(f"Valid file/path : {path_to_check}")
        
    path_to_check = data_path + "output/"
    if not path.exists(path_to_check):
        print(f"Creating folder {path_to_check}")
        os.makedirs(path_to_check)
    else:
        print(f"Valid file/path : {path_to_check}")

    path_to_check = data_path + "output/algo/"
    if not path.exists(path_to_check):
        print(f"Creating folder {path_to_check}")
        os.makedirs(path_to_check)
    else:
        print(f"Valid file/path : {path_to_check}")

In [15]:
if __name__ == '__main__':

    """ Set Seeds, but seems we need to set the seed anyway for each algorithm"""
    my_seed = 55
    random.seed(my_seed)
    np.random.seed(my_seed)

    files_path = "../../data/"

    check_files_exist(files_path)

    """Read item and user processed data"""
    df_users = pd.read_csv(files_path + "processed/users.csv")
    df_items = pd.read_csv(files_path + "processed/items.csv")
    df_items.item_name_lower = df_items.item_name_lower.map(lambda x : x[:25])
    products = get_unique_products(df_items)
    x1,y1,x2,y2 = generate_interaction_files(df_items)

    """ Run Surprise models """
    reader = Reader()
    data1 = Dataset.load_from_df(x1,reader)
    data2 = Dataset.load_from_df(x2,reader)

    algorithms = [SVD, SVDpp, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline]
    # algorithms = [SVD]

    n_iterations = 10

    for algorithm in algorithms:
        all_s1, anti_s1 = trainSurpriseModel(data1, base="store_cat", algo=algorithm, search_iteration= n_iterations)
        all_s2, anti_s2 = trainSurpriseModel(data2, base="product", algo=algorithm, search_iteration= n_iterations)
        all_scores = all_s1.append(anti_s1).append(all_s2).append(anti_s2)
        all_scores.drop("details", axis=1, inplace=True)
        file_name = files_path + "output/algo/all_reco_scores_" + algorithm.__name__ + ".csv"
        all_scores.to_csv(file_name,index=False)

    """Run LightFM Models"""
    base = "store-cat"
    algo_name = "LightFM_Basic"
    all_scores = trainLightFMModel(y1,base,x1,algo=algo_name)
    base = "product"
    all_scores2 = trainLightFMModel(y2,base,x2,algo=algo_name)

    all_scores = all_scores.append(all_scores2)
    file_name = files_path + "output/algo/all_reco_scores_" + algo_name + ".csv"
    all_scores.to_csv(file_name,index=False)


    """ Generate combined scoring file"""
    combine_scoring_files()

Valid file/path : ../../data/
Valid file/path : ../../data/processed/users.csv
Valid file/path : ../../data/processed/items.csv
Valid file/path : ../../data/output/
Valid file/path : ../../data/output/algo/

Generate unique products dataframe ...

  brand_id product_id            item_name_lower product_category_id size  \
0     loft   62733a41  petite textured pencil pa         123 - Pants  NaN   
1     loft   7ca9f965   blurred fairisle sweater         114 - Knits  NaN   
2     loft   6273435d   lou grey eyelash sweater         114 - Knits  NaN   
3     loft   62732b46  petite plaid pencil pants         123 - Pants  NaN   
4     loft   627342fa  petite custom stretch tro         123 - Pants  NaN   

   on_sale  price_bin  
0    False          3  
1    False          3  
2    False          3  
3    False          3  
4    False          3  

Generate interaction: user vs store-cat and user vs products...


****** Training for Surprise model: SVD with store_cat data ...

best RMSE sco



Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
best RMSE score : 0.1446834891325475
best MAE score : 0.035666965768671906
best parms for RMSE score : {'k': 5, 'sim_options': {'name': 'msd', 'min_support': 40, 'user_based': False}}
best parms for MAE score : {'k': 5, 'sim_options': {'name': 'msd', 'min_support': 40, 'user_based': False}}

****** Training for LightFM model: LightFM_Basic with store-cat data ...

Precision@7 :0.5586956143379211
Recall@7 :0.23119685245485805

****** Training for LightFM model: LightFM_Basic with product data ...

Precision@7 :0.758695662021637
Recall@7 :0.22223338689514666
reading... all_reco_scores_SVD.csv 
rows read : 793941
Combined rows so far : 793941
reading... all_reco_scores_KNNWithMeans.csv 
rows read : 793941
Combined rows so far : 1587882
reading... all_reco_scores_KNNBasic.csv 
rows read : 793941
Combined rows so far : 2381823
reading... all_reco_scores_LightFM_Basic.csv 
rows read : 7939

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Combined rows so far : 3175764
reading... all_reco_scores_KNNWithZScore.csv 
rows read : 793941
Combined rows so far : 3969705
reading... all_reco_scores_KNNBaseline.csv 
rows read : 793941
Combined rows so far : 4763646
reading... all_reco_scores_SVDpp.csv 
rows read : 793941
Combined rows so far : 5557587
reading... all_reco_scores_NMF.csv 
rows read : 793941
Combined rows so far : 6351528
