In [1]:
import os
import datetime
import pandas as pd
import numpy as np
from functions.split_data import split_data
#
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import minmax_scale

from recommenders.utils.python_utils import binarize
from recommenders.utils.timer import Timer
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    rmse,
    mae,
    logloss,
    rsquared,
    exp_var
)
from recommenders.models.sar import SAR
import sys



In [2]:
df = (
    pd.read_excel("/Users/Mac/Downloads/sales_demo_new.xlsx", nrows = 1000)
)
# create industries
industries = [f"industry_{x}" for x in range(1, 21)]
df["industry"] = np.random.choice(industries, size = df.shape[0])

In [3]:
print("Total customers: ", df["customer_id"].nunique())
print("-" * 40)
print("Total products: ", df["product_id"].nunique())
print("-" * 40)

print("First date: ", df["date"].min())
print("-" * 40)

print("Last date: ", df["date"].max())
print("-" * 40)

print("total transactions: ", df.shape[0])
print("-" * 40)

print("total columns beginning: ", df.shape[1])



Total customers:  195
----------------------------------------
Total products:  200
----------------------------------------
First date:  2023-01-01 00:00:00
----------------------------------------
Last date:  2023-01-07 00:00:00
----------------------------------------
total transactions:  1000
----------------------------------------
total columns beginning:  19


# Recommendations

In [5]:
sales = df.copy()
frame_products = sales.copy()
date_col = 'date'
item_col = 'product_id'
user_col = 'customer_id'


In [6]:
frame_products['timestamp'] = frame_products[date_col].astype('str').\
                                            apply(lambda x:
                                            int((datetime.datetime(int(x[:4]),\
                                            int(x[5:7]),\
                                            int(x[-2:]))-\
                                            datetime.datetime(1970, 1, 1)).total_seconds()))


In [None]:
from recommendations_score.score_feature_engineer import SalesFeatureEngineer
from recommendations_score.rating_functions import *

processor = SalesFeatureEngineer(frame_products)

processor.add_time_features()\
                    .add_customer_features()\
                    .add_behavioral_features()\
                    .add_preference_score()
                    
sales_df_class = processor.get_dataframe()
#
sales_df_class = compute_rating(df=sales_df_class)


In [8]:

frame_products_class = (sales_df_class[[user_col, item_col, "timestamp", "rating"]]
                        .rename(columns={
                        user_col:'userID',
                        item_col:'itemID',
                                }
                        ).drop_duplicates()
)


In [9]:
from functions.split_data import split_data
train, test = split_data(table=frame_products_class.copy())
test = test[test["userID"].isin(train["userID"].values)]

# Training

In [10]:
model = SAR(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
    similarity_type="jaccard", 
    time_decay_coefficient=30, 
    timedecay_formula=True,
    normalize=True
)

In [11]:
with Timer() as train_time:
    model.fit(train.drop_duplicates())

print("Took {} seconds for training.".format(train_time.interval))

Took 0.024310541000005514 seconds for training.


In [12]:
with Timer() as test_time:
    top_k = model.recommend_k_items(test.drop_duplicates(), top_k=10, remove_seen=True) # suggest only unknown products for the customers

print("Took {} seconds for prediction.".format(test_time.interval))
#
test['itemID'] = test['itemID'].astype('int')
#
TOP_K = 10
eval_map = (
    map_at_k(
        test,
        top_k,
        col_user='userID',
        col_item='itemID',
        col_prediction='prediction',
        k=TOP_K
    )
)
eval_ndcg = (
    ndcg_at_k(
        test,
        top_k,
        col_user='userID',
        col_item='itemID',
        col_rating = 'rating',
        col_prediction='prediction', k=TOP_K)
)
eval_precision = precision_at_k(test, top_k, col_user='userID', col_item='itemID', col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, top_k, col_user='userID', col_item='itemID', col_prediction='prediction', k=TOP_K)
eval_rmse = rmse(test, top_k, col_user='userID', col_item='itemID', col_rating='rating', col_prediction='prediction')
eval_mae = mae(test, top_k, col_user='userID', col_item='itemID', col_rating='rating', col_prediction='prediction')

Took 0.009513584000018227 seconds for prediction.


# Number of cases

In [13]:
test['userID'] = test['userID'].astype('str')
top_k['userID'] = top_k['userID'].astype('str')


In [None]:
test_pr = dict()
topk_pr = dict()
correct_pr = dict()
for c in test.userID.unique():
    test_pr[c] = dict()
    topk_pr[c] = dict()
    correct_pr[c] = dict()
    test_pr[c] = [test[test['userID'] == str(c)].shape[0]]
    topk_pr[c] = [top_k[top_k['userID'] == str(c)].shape[0]]
    correct_pr[c] = [test[(test['userID'] == str(c))&(test['itemID'].isin(top_k[top_k['userID'] == str(c)].itemID.tolist()))].shape[0]]


In [15]:
print(len([v for k,v in correct_pr.items() if v != 0]))
print('number_of_correct_prediction for sar: ', np.sum([v for k,v in correct_pr.items() if v != 0]))


141
number_of_correct_prediction for sar:  58


In [16]:
test_df = pd.DataFrame.from_dict(test_pr).T.reset_index(names='userID_test').rename(columns={0:'Total_transactions'})
topk_df = pd.DataFrame.from_dict(topk_pr).T.reset_index(names='userID_topk').rename(columns={0:'Total_recommendations'})
correct_df = pd.DataFrame.from_dict(correct_pr).T.reset_index(names='userID_correct').rename(columns={0:'Total_correct'})


In [18]:
results_df = (
    pd.concat([test_df, topk_df, correct_df], axis=1).drop(columns=['userID_topk', 'userID_correct'])
)
results_df.query("Total_correct > 0").Total_correct.sum()

58

# previous

In [123]:
print("Model:\t",
      "Top K:\t%d" % TOP_K,
      "MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall,
    #  "RMSE:\t%f" % eval_rmse,
     # "MAE:\t%f" % eval_mae,
    
      sep='\n')

Model:	
Top K:	10
MAP:	0.181370
NDCG:	0.523258
Precision@K:	0.192539
Recall@K:	0.043948
