In [1]:
# !pip install scikit-surprise
# !conda install -c conda-forge scikit-surprise

In [2]:
import pandas as pd
import re
import numpy as np
import random
from scipy.stats import uniform as sp_rand
from scipy.stats import randint
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)


In [3]:
from surprise import Reader
from surprise import SVD, SVDpp
from surprise import KNNBasic, KNNBaseline, KNNWithZScore, KNNWithMeans
from surprise import accuracy
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV

In [4]:
my_seed = 55
random.seed(my_seed)
np.random.seed(my_seed)

In [5]:
df_users = pd.read_csv("../..//data/processed/users.csv")
df_items = pd.read_csv("../..//data/processed/items.csv")
df_items.item_name_lower = df_items.item_name_lower.map(lambda x : x[:25])
df_items.columns

Index(['user_id', 'brand_id', 'store_id', 'product_id', 'item_name_lower',
       'product_category_id', 'paid_price', 'size', 'on_sale', 'part_of_order',
       'top_brand', 'color', 'Category Name', 'price_bin'],
      dtype='object')

In [6]:
## Unique products
products =df_items.copy()
products.columns
products.drop(["user_id","store_id","paid_price","part_of_order","top_brand","color", "Category Name"],axis=1, inplace=True)
products.drop_duplicates(subset="product_id", inplace=True)
products.head()

Unnamed: 0,brand_id,product_id,item_name_lower,product_category_id,size,on_sale,price_bin
0,loft,62733a41,petite textured pencil pa,123 - Pants,,False,3
1,loft,7ca9f965,blurred fairisle sweater,114 - Knits,,False,3
2,loft,6273435d,lou grey eyelash sweater,114 - Knits,,False,3
3,loft,62732b46,petite plaid pencil pants,123 - Pants,,False,3
4,loft,627342fa,petite custom stretch tro,123 - Pants,,False,3


In [7]:
"""
This function creates a sparse matrix and a simple group by for user - product combinations
When store_cat is set to True, it uses product category and store as proxy for product 
Note : change this to create sparse matrices instead of returning pandas dataframes
"""
def get_user_prod_matrix(df, store_cat = True):
    df_items["store_cat"] = df_items.apply(lambda x : x["store_id"] + " - "+ str(x["product_category_id"]), axis=1)
    
    if store_cat:
        sparse = pd.pivot_table(df_items, index="user_id", columns="store_cat", values="product_id", aggfunc="count")
    else:
        sparse = pd.crosstab(index = df_items["user_id"], columns = df_items["product_id"], values="product_id", aggfunc="count")
    
    grp = sparse.stack().dropna().reset_index()       
    grp = grp.rename(columns={0:"rating"} )

    if store_cat:
        grp = grp.rename(columns = {"store_cat": "product_id"})


    return (grp, sparse)

In [9]:
x1, y1 = get_user_prod_matrix(df_items)
print(x1.head())
x2, y2 = get_user_prod_matrix(df_items, False)
print(x2.head())
unique_user_sample = x2.user_id.sample(5).values.tolist()
print(unique_user_sample)


         user_id                    product_id  rating
0  1485369350003   dsw - 210 - Boots & Booties     2.0
1  1485369350003              hm - 124 - Jeans     1.0
2  1485369350003  jcrewfactory - 111 - Blouses     1.0
3  1485369350003    jcrewfactory - 114 - Knits     1.0
4  1485369350003          loft - 111 - Blouses     5.0
         user_id product_id  rating
0  1485369350003   00799605     1.0
1  1485369350003   01c00a9f     1.0
2  1485369350003   1389d2dd     1.0
3  1485369350003   1ff87a10     1.0
4  1485369350003   23583555     1.0
[1534200663687, 1540229647916, 1530899668174, 1495117820151, 1506291233259]


In [10]:
x1["base"] = "store_cat"
x2["base"] = "product"

full_ratings = x1.append(x2).reset_index(drop=True)
full_ratings.to_csv("../..//data/processed/all_ratings.csv",index=False)
x1.drop("base", axis=1, inplace=True)
x2.drop("base", axis=1, inplace=True)

In [11]:
print(full_ratings.head())
print(full_ratings.tail())

         user_id                    product_id  rating       base
0  1485369350003   dsw - 210 - Boots & Booties     2.0  store_cat
1  1485369350003              hm - 124 - Jeans     1.0  store_cat
2  1485369350003  jcrewfactory - 111 - Blouses     1.0  store_cat
3  1485369350003    jcrewfactory - 114 - Knits     1.0  store_cat
4  1485369350003          loft - 111 - Blouses     5.0  store_cat
             user_id product_id  rating     base
10336  1550516903728   94528138     1.0  product
10337  1550516903728   94944126     1.0  product
10338  1550516903728   98239982     1.0  product
10339  1550516903728   b8d0f825     1.0  product
10340  1550516903728   e4abcc38     1.0  product


In [12]:
def get_user_products(uid, df, n_count=10):
    cols_tokeep = ["product_id", "brand_id", "product_category_id", "rating","price_bin", "item_name_lower"]   
    col_names = ["Product ID", "Brand", "Category", "Times Bought","Price Bin", "Descr"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count).merge(products, on="product_id")[cols_tokeep] 
    user_top_products.columns = col_names
    return user_top_products

In [13]:
def get_user_storecats(uid, df, n_count=10):
    cols_tokeep = ["product_id", "rating"]   
    col_names = ["Store - Category", "Times Bought"]   


    user_products = df[df.user_id==uid]
    count = min(user_products.shape[0], n_count)
    user_top_products = user_products.sort_values("rating",ascending=False).head(count)[cols_tokeep]
    user_top_products.columns = col_names
    return user_top_products

In [14]:
def get_n_recommendations(user_id, scores, n_recommendations=7):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "product_id", "brand_id", "product_category_id", "price_bin", "item_name_lower" ]
    col_names = ["Rating Estimate", "Product ID", "Brand", "Category", "Price Bin", "Descr"]
    temp_df = scores.sort_values("est", ascending=False).head(count).rename(columns={"iid":"product_id"}).merge(products, on="product_id")[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [15]:
def get_n_recommendations_cat(user_id, scores, n_recommendations=7):
    scores = scores[scores.uid == user_id]
    count = min(scores.shape[0], n_recommendations)
    cols_tokeep = ["est", "iid"]
    col_names = ["Rating Estimate", "Store - Category"]
    temp_df = scores.sort_values("est", ascending=False).head(count)[cols_tokeep]
    temp_df.columns = col_names
    return temp_df

In [17]:
reader = Reader()

In [19]:
data1 = Dataset.load_from_df(x1,reader)
data2 = Dataset.load_from_df(x2,reader)

def trainSVD(data, algo="SVD"):
    param_grid = {'n_epochs': [5, 10, 15], 'lr_all': np.linspace(1e-4,2e-1,200),
              'reg_all': np.linspace(0,1,100),
              'random_state':[35]
             }
    
    gs = RandomizedSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, refit=True,n_jobs=-1, n_iter=5)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])
    print(gs.best_score['mae'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
    print(gs.best_params['mae'])

    train_set = data.build_full_trainset()
    all_set = train_set.build_testset()
    anti_set = train_set.build_anti_testset()
    all_scores = pd.DataFrame(gs.test(all_set))
    anti_scores = pd.DataFrame(gs.test(anti_set))
    
    all_scores["algorithm"] = algo
    anti_scores["algorithm"] = algo    
    
    all_scores["score_type"] = "known"
    anti_scores["score_type"] = "anti"        
    
    return (all_scores, anti_scores)

all_s1, anti_s1 = trainSVD(data1)
all_s2, anti_s2 = trainSVD(data2)






3.2708586847752663
1.6841220538437354
{'n_epochs': 5, 'lr_all': 0.00311356783919598, 'reg_all': 0.4141414141414142, 'random_state': 35}
{'n_epochs': 5, 'lr_all': 0.00311356783919598, 'reg_all': 0.4141414141414142, 'random_state': 35}
0.14383701408289076
0.03662271750792514
{'n_epochs': 5, 'lr_all': 0.008136180904522613, 'reg_all': 0.7777777777777778, 'random_state': 35}
{'n_epochs': 5, 'lr_all': 0.008136180904522613, 'reg_all': 0.7777777777777778, 'random_state': 35}


In [23]:
all_s1["base"] = "store_cat"
all_s2["base"] = "product"
anti_s1["base"] = "store_cat"
anti_s2["base"] = "product"

In [24]:
print(all_s1.head())
print(all_s2.head())
print(anti_s1.head())
print(anti_s2.head())

             uid                           iid  r_ui       est  \
0  1485369350003   dsw - 210 - Boots & Booties   2.0  2.551585   
1  1485369350003              hm - 124 - Jeans   1.0  2.678500   
2  1485369350003  jcrewfactory - 111 - Blouses   1.0  2.606214   
3  1485369350003    jcrewfactory - 114 - Knits   1.0  2.605062   
4  1485369350003          loft - 111 - Blouses   5.0  3.260074   

                     details algorithm score_type       base  
0  {'was_impossible': False}       SVD      known  store_cat  
1  {'was_impossible': False}       SVD      known  store_cat  
2  {'was_impossible': False}       SVD      known  store_cat  
3  {'was_impossible': False}       SVD      known  store_cat  
4  {'was_impossible': False}       SVD      known  store_cat  
             uid       iid  r_ui       est                    details  \
0  1485369350003  00799605   1.0  1.035169  {'was_impossible': False}   
1  1485369350003  01c00a9f   1.0  1.040048  {'was_impossible': False}   
2  148

In [30]:
all_scores = all_s1.append(anti_s1).append(all_s2).append(anti_s2)
all_scores.drop("details", axis=1, inplace=True)

In [31]:
all_scores.sample(10)

Unnamed: 0,uid,iid,r_ui,est,algorithm,score_type,base
507074,1539389830936,2a5226c2,1.018336,1.0,SVD,anti,product
59733,1504480921172,3409260e,1.018336,1.0,SVD,anti,product
94868,1514587641548,14669350,1.018336,1.021382,SVD,anti,product
238466,1531873923973,62553220,1.018336,1.038686,SVD,anti,product
315068,1534274912488,2a6bce45,1.018336,1.024356,SVD,anti,product
123546,1517962134688,26143242,1.018336,1.011149,SVD,anti,product
56918,1504480921172,4f652627,1.018336,1.0,SVD,anti,product
545985,1541215258673,d2771016,1.018336,1.033557,SVD,anti,product
566301,1542061238350,5b002741,1.018336,1.014132,SVD,anti,product
569880,1542061238350,35328195,1.018336,1.00885,SVD,anti,product


In [32]:
all_scores.to_csv("../..//data/processed/all_reco_scores.csv",index=False)