Generate a prediction dataset to evaluate and compare retrieval models

### Strategy
- Take a sample of customers (0.3) from original validation set (for all 3 strategies)
- Use only positive examples i.e. purchase == 1
- Use this sample to compute evaluation metrics such as Hit Rate and NDCG

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Notebook display limits: show up to 1000 columns but only 10 rows per frame.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 10

In [None]:
# Root directory holding the per-strategy model-input folders; the evaluation
# sets are written directly under it.
# NOTE(review): this is an empty (relative) path here - presumably meant to be
# set to the real data location before running; confirm.
path_model_data = Path("")

In [None]:
# Sub-folders of `path_model_data` used by the cells below.
# Strategy 1: validation set is read from, and its sample written to, this folder.
folder_s1 = 'CF_model_input_neucf_S1'
# Strategy 3: source parquet files read by `get_eval_set`.
folder_s3 = 'CF_model_input_S3'
# Strategy 3: destination folder for the train/validation parquet files.
folder_neucf_s3 = 'CF_model_input_neucf_S3'
# Strategy 2: validation set is read from, and its sample written to, this folder.
folder_s2 = 'CF_model_input_neucf_S2'
# Fraction of validation customers held out for the ranking-evaluation set.
frac_cust_pred_ds = 0.3

In [None]:
def get_filtered_dfs(input_df, frac_, random_state=None):
    """Split a validation frame by customer into an evaluation set and a remainder.

    A random fraction of the distinct customers is held out. Their positive
    interactions form the ranking-evaluation set; all rows of the remaining
    customers form the reduced validation set, so the two outputs never share
    a customer.

    Args:
        input_df: DataFrame with at least the columns ``customer_id``,
            ``article_id`` and ``purchase``.
        frac_: Fraction of distinct customers to hold out (passed to
            ``Series.sample(frac=...)``).
        random_state: Optional seed forwarded to ``Series.sample`` so the
            held-out customers can be reproduced. ``None`` (the default)
            keeps the previous unseeded behaviour.

    Returns:
        A tuple ``(pd_oos_ranking_eval, pd_val_set)``:
        ``pd_oos_ranking_eval`` holds the de-duplicated
        ``(customer_id, article_id)`` pairs with ``purchase == 1`` for the
        held-out customers; ``pd_val_set`` holds every row of ``input_df``
        belonging to the customers that were not held out.
    """
    unique_custs = (input_df.customer_id
                    .drop_duplicates()
                    .sample(frac=frac_, random_state=random_state))

    # Compute the membership mask once; it is needed for both outputs.
    in_sample = input_df.customer_id.isin(unique_custs)

    ## only positive interactions
    pd_oos_ranking_eval = (input_df[in_sample & (input_df.purchase == 1)]
                           .filter(['customer_id', 'article_id'])
                           .drop_duplicates())

    pd_val_set = input_df[~in_sample]

    return pd_oos_ranking_eval, pd_val_set

## Strategy 3 : Data For CF (with negative examples)

Each customer will have `n` negative examples per week. This dataset is created to train the model on a sample week.

For aggregated dataset over a period, each customer will have `n * frequency of purchase` negative samples. Rebalance the dataset for use in aggregate mode.

- for each year, month and week
- aggregate all products and select n random articles/products for each customer
- set `purchase` feature as 0

Difference between Strategy 3 and Strategy 1:
- Strategy 3 draws its negative examples at random for each week of customer activity. It does not follow the `leave-one-out` policy of NeuMF.

In [None]:
%%time

def get_ds_type(year, week):
    """Return the split name for a (year, week) partition.

    Training data covers 2018 week 38 through 2020 week 28; anything from
    2020 week 29 onwards is validation data.

    Args:
        year: Calendar year of the partition (int).
        week: Week number of the partition (int).

    Returns:
        ``'train'`` or ``'val'``.
    """
    # Use the boolean `and`, not bitwise `&`: `&` binds tighter than the
    # comparisons, so `year == 2020 & week < 29` is evaluated as
    # `year == (2020 & week) < 29` and mislabels 2020 training weeks as 'val'.
    if year < 2020 or (year == 2020 and week < 29):
        return 'train'
    return 'val'

def get_eval_set(path_, folder, n_neg, folder_write, frac_):
    """
    Build the Strategy 3 train / validation / evaluation datasets and write them to parquet.

    Steps:
    1. Read every ``*.parquet`` file under ``path_/folder`` and route it to the
       training or validation split with ``get_ds_type``. File names are
       expected to start with ``<year>_<week>_``.
    2. Hold out a fraction ``frac_`` of validation customers with
       ``get_filtered_dfs``; their positive interactions form the
       ranking-evaluation set.
    3. For the remaining validation customers, keep all positives and resample
       the negatives down to exactly ``n_neg`` per customer.

    Files written:
    - ``path_/folder_write/val_ds_stg_3.parquet``: full validation split
    - ``path_/folder_write/val_ds_sample_stg_3.parquet``: positives plus resampled negatives
    - ``path_/folder_write/train_ds_stg_3.parquet``: full training split
    - ``path_/evaluation_set_stg_3.parquet``: positive-only ranking-evaluation set

    Args:
        path_: Root ``Path`` of the model data.
        folder: Sub-folder of ``path_`` holding the weekly input parquet files.
        n_neg: Number of negative examples to keep per customer.
        folder_write: Sub-folder of ``path_`` that receives the train/validation files.
        frac_: Fraction of validation customers held out for ranking evaluation.

    Raises:
        ValueError: Raised by pandas when either split has no files, or when a
            customer has fewer than ``n_neg`` negatives to sample from.
        AssertionError: If the resampled negatives do not cover every
            remaining validation customer with exactly ``n_neg`` rows.
    """
    train_frames = []
    val_frames = []

    for file in (path_ / folder).rglob('*.parquet'):
        year, week = file.name.split('_')[0:2]
        if get_ds_type(int(year), int(week)) == 'val':
            val_frames.append(pd.read_parquet(file))
        else:
            train_frames.append(pd.read_parquet(file))

    pd_train_stg_three = pd.concat(train_frames)
    pd_val_stg_three = pd.concat(val_frames)

    # Drop the per-file frames now that they have been concatenated.
    train_frames.clear()
    val_frames.clear()

    pd_custs_oos_ranking_eval, pd_validation_set = get_filtered_dfs(pd_val_stg_three, frac_)

    ## sanity check: customer counts per split and shape of the evaluation set
    print(pd_train_stg_three.customer_id.nunique(),
          pd_val_stg_three.customer_id.nunique(),
          pd_custs_oos_ranking_eval.shape,
          pd_validation_set.customer_id.nunique())

    ## Strategy three draws `n` negative samples per week, so a customer who is
    ## active in more than one week has more than `n` negatives in total.
    ## Therefore resample the negatives down to `n` per customer.
    positives = pd_validation_set[pd_validation_set.purchase == 1]
    negatives = pd_validation_set[pd_validation_set.purchase == 0]

    # Group by the column name itself (not a one-element list): with a list,
    # pandas >= 2.0 yields 1-tuple keys, which would end up in `customer_id`
    # and break the int32 cast below.
    sampled_negatives = [
        [cust_id, cust_negs.article_id.sample(n=n_neg)]
        for cust_id, cust_negs in tqdm(negatives.groupby('customer_id'))
    ]

    test_ = pd.DataFrame(sampled_negatives, columns=['customer_id', 'article_id'])
    sampled_negatives.clear()

    # One row per sampled (customer, article) pair.
    test_ = test_.explode(['article_id'])
    test_['purchase'] = 0

    # Every remaining validation customer must have exactly `n_neg` negatives.
    assert int(test_.shape[0]/n_neg) ==  pd_validation_set.customer_id.nunique()

    validation = pd.concat([positives, test_]).astype('int32')

    pd_val_stg_three.to_parquet(path_/folder_write/'val_ds_stg_3.parquet')
    validation.to_parquet(path_/folder_write/'val_ds_sample_stg_3.parquet')
    pd_train_stg_three.to_parquet(path_/folder_write/'train_ds_stg_3.parquet')
    pd_custs_oos_ranking_eval.to_parquet(path_/'evaluation_set_stg_3.parquet')
    

# Build the Strategy 3 datasets: read the parquet files under `folder_s3`,
# keep 30 negatives per validation customer, and write the train/validation
# files to `folder_neucf_s3`.
get_eval_set(path_model_data, folder_s3, 30, folder_neucf_s3, frac_cust_pred_ds)

## Strategy ONE

In [None]:
%%time

# Strategy 1: load the validation set and hold out a fraction of its customers
# for ranking evaluation.
val_stg_one = pd.read_parquet(path_model_data.joinpath(folder_s1, 'val_ds_stg_1.parquet'))
pd_ranking_eval, pd_val = get_filtered_dfs(input_df=val_stg_one, frac_=frac_cust_pred_ds)

# Customers in the full validation set, shape of the evaluation set, and
# customers left in the reduced validation set.
print(
    val_stg_one.customer_id.nunique(),
    pd_ranking_eval.shape,
    pd_val.customer_id.nunique(),
)

# Persist the evaluation set at the data root and the reduced validation set
# next to the Strategy 1 inputs.
pd_ranking_eval.to_parquet(path_model_data.joinpath('evaluation_set_stg_1.parquet'))
pd_val.to_parquet(path_model_data.joinpath(folder_s1, 'val_ds_sample_stg_1.parquet'))

## Strategy Two

In [None]:
%%time

# Strategy 2: load the validation set and hold out a fraction of its customers
# for ranking evaluation.
val_stg_two = pd.read_parquet(path_model_data.joinpath(folder_s2, 'val_ds_stg_2.parquet'))
pd_ranking_eval, pd_val = get_filtered_dfs(input_df=val_stg_two, frac_=frac_cust_pred_ds)

# Customers in the full validation set, shape of the evaluation set, and
# customers left in the reduced validation set.
print(
    val_stg_two.customer_id.nunique(),
    pd_ranking_eval.shape,
    pd_val.customer_id.nunique(),
)

# Persist the evaluation set at the data root and the reduced validation set
# next to the Strategy 2 inputs.
pd_ranking_eval.to_parquet(path_model_data.joinpath('evaluation_set_stg_2.parquet'))
pd_val.to_parquet(path_model_data.joinpath(folder_s2, 'val_ds_sample_stg_2.parquet'))

## Verify that the negative examples in Strategy One differ from those in Strategy Two

In [None]:
# Negative (purchase == 0) articles recorded for customer 10 under each strategy.
sample_prod_one = val_stg_one.loc[
    (val_stg_one.customer_id == 10) & (val_stg_one.purchase == 0), 'article_id'
]
sample_prod_two = val_stg_two.loc[
    (val_stg_two.customer_id == 10) & (val_stg_two.purchase == 0), 'article_id'
]

In [None]:
# Flag the Strategy 1 negatives that also occur among the Strategy 2 negatives,
# then display how many Strategy 1 negatives are NOT shared with Strategy 2.
mask = np.isin(sample_prod_one, sample_prod_two)
sample_prod_one[~mask].shape