# Repurchase Model

The purpose of this notebook is to **build a predictive model that identifies which of a customer's previously purchased articles they are most likely to repurchase**.

## Import statements

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score,f1_score,precision_score,recall_score
from sklearn.model_selection import GridSearchCV,GroupKFold,RandomizedSearchCV

import lightgbm as lgbm

from tqdm import tqdm
tqdm.pandas()

## Read in data + fix data types

In [None]:
# Optional filename suffix inserted before '.csv'; the empty string loads the un-suffixed files
sample = ''

# Read in the article, customer and transaction tables
df_art = pd.read_csv('../Data/articles/articles'+sample+'.csv')
df_cust = pd.read_csv('../Data/customers/customers'+sample+'.csv')
df_trans = pd.read_csv('../Data/transactions_train/transactions_train'+sample+'.csv')

In [None]:
# Fix format of article IDs: cast to string and left-pad with zeros to 10 characters
df_art['article_id'] = df_art['article_id'].astype(str).str.zfill(10)
df_art['detail_desc'] = df_art['detail_desc'].astype(str)
df_trans['article_id'] = df_trans['article_id'].astype(str).str.zfill(10)

# Fix datetime type so t_dat can be compared against date strings below
df_trans['t_dat'] = pd.to_datetime(df_trans['t_dat'])

# Build df_cust age brackets, labelled 1-6 (bin edges 1/19/29/39/49/59/200).
# Ages that are missing or fall outside the bins get NaN; the final scoring cell fills those with bracket 2.
df_cust['Age_Bracket'] = pd.cut(df_cust['age'],[1,19,29,39,49,59,200],labels=[1,2,3,4,5,6])

In [None]:
# Hold out one 7-day window of transactions as the evaluation set used by calculate_precision

test_start_date = '2020-09-09'
test_end_date = '2020-09-15'

# The complementary training slice (t_dat < test_start_date) is deliberately not materialised:
# it was previously copied in full and deleted again a few lines later without being used,
# which only cost time and peak memory. The models below train from the pre-built weekly
# feather files instead.
df_trans_test = df_trans.loc[(df_trans['t_dat'] >= test_start_date)&(df_trans['t_dat'] <= test_end_date),:].copy()

# Free the large raw tables; only df_trans_test and df_cust are needed from here on
del df_trans
del df_art

# Evaluation function

In [None]:
# Takes in dataframe in submission format, returns MAP@12
def calculate_precision(df_temp, df_actual=None, k=12):
    """Return MAP@k (k=12 by default) for a frame in submission format.

    Parameters
    ----------
    df_temp : DataFrame with a 'customer_id' column and a 'prediction' column holding
        space-separated article ids, best guess first. Lists shorter or longer than k
        are accepted; only the first k ids of each list are scored.
    df_actual : optional DataFrame with 'customer_id' and 'article_id' columns holding the
        ground-truth purchases. Defaults to the notebook-level ``df_trans_test``.
    k : cut-off rank.

    Returns
    -------
    float : mean over all customers with at least one actual purchase of
        (sum of precision@rank at every hit) / min(number of distinct actual articles, k).

    Note: the previous implementation skipped the per-customer 1/min(m, k) normalisation,
    so customers who bought several articles could contribute more than 1 and the result
    was not a true MAP@k. Scores recorded with the old function are not comparable.
    """
    if df_actual is None:
        df_actual = df_trans_test
    actual = df_actual[['customer_id','article_id']].drop_duplicates()
    n_customers = actual['customer_id'].nunique()
    if n_customers == 0:
        return 0.0

    # Long format: one row per (submission row, rank, article)
    preds = df_temp[['customer_id','prediction']].reset_index(drop=True)
    preds['article_id'] = preds['prediction'].str.split()
    preds = preds.drop(columns='prediction').explode('article_id').dropna(subset=['article_id'])
    # After explode the index still identifies the submission row, so this is the rank in the list
    preds['ranking'] = preds.groupby(level=0).cumcount() + 1
    # Only the top k count, and a repeated article may only be credited once
    preds = preds.loc[preds['ranking'] <= k].drop_duplicates(['customer_id','article_id'])

    hits = pd.merge(preds,actual,how='inner',on=['customer_id','article_id'])
    if hits.empty:
        return 0.0

    # Precision at each hit = (number of hits so far) / (rank of this hit)
    hits = hits.sort_values(['customer_id','ranking'])
    hits['CumCount'] = hits.groupby('customer_id').cumcount() + 1
    hits['Precision'] = hits['CumCount'] / hits['ranking']

    # Normalise each customer's sum by min(m, k), m = number of distinct articles actually bought
    denom = actual.groupby('customer_id').size().clip(upper=k)
    avg_precision = hits['Precision'] / hits['customer_id'].map(denom)
    return float(avg_precision.sum() / n_customers)

# Build General Pred

In [None]:
# Pre-computed general predictions; merged onto customers by 'Age_Bracket' in the final scoring cell
gen_pred_dict = pd.read_feather('../Datasets/gen_pred_dict.feather')

# Full Training Pipeline

This pipeline does the following - for each week we'd like to test:

- Create the training set from all repurchase datasets prior to a given 7-day window, and use the 7-day window in question as the evaluation set
- Train 50 gradient-boosted models (the loop below uses `xgb.XGBClassifier`) with previously established hyperparameters, each on a different random sample of the training rows
- Score each potential repurchase in the given week against each model, and average the results to find the overall score for an article
- Rank every article/customer pairing by its averaged score within each customer, and output the scored pairings as candidates for the full ranker model (a >0.5 cut-off on the averaged score can be applied downstream)

In [None]:
# Optional suffix of the '../Datasets/Repeat<suffix>/' folder; empty string = full dataset
pct_sample = ''

# One repurchase dataset per 7-day window (MMDD_MMDD), in chronological order,
# followed by the 'FULL' dataset as the last entry
week_labels = [
    '0513_0519', '0520_0526', '0527_0602', '0603_0609', '0610_0616',
    '0617_0623', '0624_0630', '0701_0707', '0708_0714', '0715_0721',
    '0722_0728', '0729_0804', '0805_0811', '0812_0818', '0819_0825',
    '0826_0901', '0902_0908', '0909_0915', '0916_0922', 'FULL',
]

files = ['../Datasets/Repeat'+pct_sample+'/Repurchase_'+label+'.feather' for label in week_labels]

In [None]:
# Model parameters
sample = ''          # suffix of the '../Datasets/Outputs<sample>/' output folder
n_iterations = 50    # number of models in each week's ensemble
no_multiplier = 3    # negatives drawn per model = no_multiplier * number of positives

# Identifier columns that are carried through to the output but never used as features
id_cols = ['customer_id','article_id','t_dat']

# NOTE(review): `xgb` is not imported in the import cell of this notebook, so on a fresh
# kernel the XGBClassifier call below raises NameError. Add `import xgboost as xgb` to the
# import cell (or switch to the already-imported lightgbm) before running.

strt = dt.datetime.now()

for test_set_ind in range(4,20):
    print('Begin',files[test_set_ind],dt.datetime.now()-strt)
    print('Create Training Set',dt.datetime.now()-strt)

    # Create the training set from all files prior to the test week: every positive row,
    # plus a 1/(test_set_ind-1) random fraction of each week's negatives.
    # Pieces are collected in a list and concatenated once, instead of re-copying the
    # growing frame on every iteration.
    train_parts = []
    for i,f in enumerate(files[:test_set_ind]):
        df_temp = pd.read_feather(f)
        train_parts.append(df_temp.loc[df_temp['Response'] == 1])
        train_parts.append(df_temp.loc[df_temp['Response']==0].sample(frac=1/(test_set_ind-1),random_state=i))
        del df_temp
    df_train_set = pd.concat(train_parts).reset_index(drop=True)
    del train_parts


    # Format the training set: drop identifiers, shuffle, and split into positives / negatives
    print('Build Datasets',dt.datetime.now()-strt)
    df_prep = df_train_set.loc[:,[c for c in df_train_set.columns if c not in id_cols]].copy()\
                          .sample(frac=1,random_state=11)
    if 'LastMonthPopularity' in df_prep.columns:
        del df_prep['LastMonthPopularity']
    yeses = df_prep.loc[df_prep['Response']==1].copy()
    # The negative pool is the same for every model, so filter it once rather than 50 times
    nos = df_prep.loc[df_prep['Response']==0].copy()
    del df_train_set
    del df_prep


    print('Train Models',dt.datetime.now()-strt,len(yeses))

    models = []
    model_n = len(yeses)*no_multiplier
    # Never ask for more negatives than exist (sampling without replacement would raise)
    n_neg = min(model_n,len(nos))

    for rs in range(n_iterations):
        if rs % 10 == 0:
            print(rs)
        # Create modeling dataset: a fresh random draw of negatives, and the positives
        # resampled with replacement up to the same size so the classes are balanced
        others = nos.sample(n=n_neg,random_state=rs)
        ys = yeses.sample(n=len(others),replace=True,random_state=rs)

        train = pd.concat([ys,others]).sample(frac=1,random_state=rs)
        trainX = train.loc[:,[c for c in train.columns if c != 'Response']]
        trainy = train['Response']

        # 5% sample dataset
#         xgb_clf = xgb.XGBClassifier(random_state=123,use_label_encoder=False,\
#                                    min_child_weight=5,colsample_bytree=.8,max_depth=4,verbosity=0,\
#                                    reg_alpha=.1,reg_lambda=.1,learning_rate=.1)

        # Full dataset
        xgb_clf = xgb.XGBClassifier(random_state=123,use_label_encoder=False,n_estimators=250,\
                                   min_child_weight=5,colsample_bytree=.8,max_depth=4,verbosity=0,\
                                   reg_alpha=.1,reg_lambda=.1,learning_rate=.14)

        xgb_clf.fit(trainX,trainy)

        models.append(xgb_clf)
        del others
        del ys
        del train
        del trainX
        del trainy

    del nos
    del yeses


    print('Score test set',dt.datetime.now()-strt)
    #READ IN TEST SET
    df_test_set = pd.read_feather(files[test_set_ind])
    if 'LastMonthPopularity' in df_test_set.columns:
        del df_test_set['LastMonthPopularity']

    # Create test data: features for the models, identifiers for the output
    df_testX = df_test_set[[c for c in df_test_set.columns if c not in ['Response']+id_cols]].copy()

    df_ensemble = df_test_set[id_cols].copy()
    df_ensemble['sum_prob'] = 0.0

    del df_test_set

    # Sum the positive-class probability of all models generated, and then average them
    for m in models:
        df_ensemble['sum_prob'] += m.predict_proba(df_testX)[:,1]
    del df_testX

    df_ensemble['predict_prob'] = df_ensemble['sum_prob'] / len(models)
    # Rank 1 = most likely repurchase for that customer; ties broken by row order
    df_ensemble['rank'] = df_ensemble.groupby('customer_id')['predict_prob'].rank('first',ascending=False)

    # Write the output! The file is tagged with the window part of the input file name
    if 'FULL' in files[test_set_ind]:
        date_part = 'FULL'
    else:
        date_part = files[test_set_ind][-17:-8]

    df_ensemble[['customer_id','article_id','predict_prob','rank']].to_feather('../Datasets/Outputs'+sample+\
                                                                               '/RepeatFULL_'+date_part+'.feather')
    del df_ensemble

# Archive - Individual component tests

# Load in the dataset

In [None]:
# Optional suffix of the '../Datasets/Repeat<suffix>/' folder; empty string = full dataset
pct_sample = ''

# Position in `files` of the week used as the test set for the component tests below
test_set_ind = 17

# One repurchase dataset per 7-day window (MMDD_MMDD), in chronological order,
# followed by the 'FULL' dataset as the last entry
week_labels = [
    '0513_0519', '0520_0526', '0527_0602', '0603_0609', '0610_0616',
    '0617_0623', '0624_0630', '0701_0707', '0708_0714', '0715_0721',
    '0722_0728', '0729_0804', '0805_0811', '0812_0818', '0819_0825',
    '0826_0901', '0902_0908', '0909_0915', '0916_0922', 'FULL',
]

files = ['../Datasets/Repeat'+pct_sample+'/Repurchase_'+label+'.feather' for label in week_labels]

In [None]:
# COMBINE ALL TRAINING SETS
# Every file before the test week contributes all of its positive rows plus a
# 1/(test_set_ind-1) random fraction of its negatives.

# Collect the pieces and concatenate once (re-concatenating the growing frame on every
# iteration copies all earlier rows again each time).
train_parts = []

for i,f in enumerate(files[:test_set_ind]):
    df_temp = pd.read_feather(f)
    print(f,df_temp['t_dat'].max())
    train_parts.append(df_temp.loc[df_temp['Response'] == 1])
    train_parts.append(df_temp.loc[df_temp['Response']==0].sample(frac=1/(test_set_ind-1),random_state=i))
    del df_temp

df_train_set = pd.concat(train_parts).reset_index(drop=True)
del train_parts
print(len(df_train_set))

# Drop identifier columns and shuffle the rows
df_prep = df_train_set.loc[:,[c for c in df_train_set.columns if c not in \
                            ['customer_id','article_id','t_dat']]].copy().sample(frac=1,random_state=11)
# Guarded like in the full pipeline, so files without this column do not raise KeyError
if 'LastMonthPopularity' in df_prep.columns:
    del df_prep['LastMonthPopularity']

yeses = df_prep.loc[df_prep['Response']==1].copy()

del df_train_set

## ENSEMBLING

## OPTUNA

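I used Optuna to find the best possible hyperparameters for my LightGBM model. The steps I used were:

- Generate 6 random samplings of the training set to be averaged
- For each one, fit a LightGBM classification model using the hyperparameters suggested by the trial, and **score the model on a sample of the test week**
    - The original aim was recall, e.g. what percent of actual repurchases scored >50% likelihood to repurchase in the model?
    - Note that `objective_LGBM` as written below returns the share of rows predicted as repurchases (score >= 0.5) that were actual repurchases, which is precision; recall would divide by `num_repurchases` instead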

In [None]:
# Create the training sets for our Optuna models

trains = []
num_train_sets = 6

# The pool of negatives is identical for every sampling, so filter it once up front
# instead of re-filtering and copying df_prep on each iteration
nos = df_prep.loc[df_prep['Response']==0]

for it in range(num_train_sets):
    print(it)
    # 300,000 random negatives, and the positives resampled with replacement to the same size
    others = nos.sample(n=300000,random_state=it)
    ys = yeses.sample(n=len(others),replace=True,random_state=it)

    train = pd.concat([ys,others]).sample(frac=1,random_state=it).reset_index(drop=True)
    trains.append(train)

del nos

In [None]:
# Build the held-out frames that every Optuna trial is scored against

# Load the test week and drop the unused popularity feature when it is present
df_test_set = pd.read_feather(files[test_set_ind])
print(files[test_set_ind],len(df_test_set))
df_test_set = df_test_set.drop(columns='LastMonthPopularity',errors='ignore')

# Fixed-seed draw of 600,000 customers; only their rows are scored
np.random.seed(0)
rand_cust = np.random.choice(df_cust['customer_id'],size=600000,replace=False)

# Evaluate the customer filter once and reuse it for both frames
in_sample = df_test_set['customer_id'].isin(rand_cust)
non_feature_cols = ['Response','customer_id','article_id','t_dat']

# Identifiers + label, used to score the predictions
testDF = df_test_set.loc[in_sample,['customer_id','article_id','Response']].copy()

# Model inputs: every remaining column
df_testX = df_test_set.loc[in_sample,[c for c in df_test_set.columns if c not in non_feature_cols]].copy()
del df_test_set
del in_sample

print(len(testDF))

# Number of actual repurchases among the sampled customers
num_repurchases = testDF['Response'].sum()
print(num_repurchases)

In [None]:
def objective_LGBM(trial):
    """Optuna objective: fit one LightGBM classifier per frame in `trains` and score it on the test sample.

    Parameters
    ----------
    trial : optuna trial object used to draw the hyperparameters.

    Returns
    -------
    float : (true positives) / (rows predicted positive), accumulated over all training
        samplings, i.e. the precision of the predicted repurchases. Returns 0.0 when no row
        is predicted positive (previously this produced NaN from a 0/0 division).

    Reads the notebook-level `trains`, `df_testX` and `testDF`; as a side effect the 'Pred'
    column of `testDF` is overwritten with the predictions of the last fitted model.

    NOTE(review): the surrounding markdown describes this score as recall; recall would
    divide by the number of actual repurchases (`num_repurchases`) instead. Confirm intent.
    """
    global testDF
    # suggest_float(..., log=True) is the non-deprecated spelling of suggest_loguniform
    param = {
        'boosting_type': trial.suggest_categorical('boosting_type',['gbdt','goss']),
        'reg_lambda': trial.suggest_float('reg_lambda', .0001, 100, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', .0001, 100, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain',1e-4,15, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.8,0.9,1.0]),
        'learning_rate': trial.suggest_float('learning_rate', .01,.4, log=True),
        'n_estimators': trial.suggest_int('n_estimators',100,400,step=10),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 500),
        'num_leaves': trial.suggest_int('num_leaves', 8, 30)
    }

    true_positives = 0       # predicted-positive rows that were actual repurchases
    predicted_positives = 0  # all rows predicted positive
    for t in trains:
        yt = t['Response']
        Xt = t.drop('Response',axis=1)
        clf = lgbm.LGBMClassifier(random_state=123,**param)
        clf.fit(Xt,yt,callbacks=[lgbm.log_evaluation(period=0)])

        testDF['Pred'] = clf.predict(df_testX)
        flagged = testDF['Pred'] >= 0.5

        true_positives += testDF.loc[flagged,'Response'].sum()
        predicted_positives += int(flagged.sum())

    print(true_positives/len(trains),predicted_positives/len(trains))
    if predicted_positives == 0:
        # Nothing was predicted positive in any sampling: score the trial as worst-case
        return 0.0
    return (true_positives / len(trains)) / (predicted_positives / len(trains))
    

In [None]:
# Imported here rather than in the top import cell so the main pipeline does not require optuna
import optuna

# Seed the (default) TPE sampler so the hyperparameter search is reproducible run to run
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective_LGBM, n_trials=1000)

In [None]:
# Keep a handle on the list of trials run so far
trial1 = study.trials
# Slice plot of the study, one panel per hyperparameter
optuna.visualization.plot_slice(study)

### Filter down to top results per person, and format

In [None]:
def format_final_results(df, df_customers=None, N=12):
    """Collapse scored (customer, article, rank) rows into one top-N list per customer.

    Parameters
    ----------
    df : DataFrame with 'customer_id', 'article_id' and 'rank' columns (rank 1 = best).
        Rows ranked below N are ignored, so the full ensemble output can be passed directly;
        previously any input without exactly N distinct ranks raised a length-mismatch error.
    df_customers : optional DataFrame with a 'customer_id' column listing every customer
        that must appear in the result. Defaults to the notebook-level ``df_cust``.
    N : maximum number of articles kept per customer.

    Returns
    -------
    DataFrame with one row per customer in `df_customers` (same order) and columns
    'customer_id' and 'pred_model_list' (list of article ids, best first). Customers with
    no scored articles get [''], which downstream cells filter out.
    """
    if df_customers is None:
        df_customers = df_cust

    # Filter down to the top N per customer; ranks arrive as floats, so cast for clean labels
    top = df.loc[df['rank'] <= N,['customer_id','rank','article_id']].copy()
    top['rank'] = top['rank'].astype(int)

    # Pivot to one column per rank (always 1..N, even if some rank never occurs)
    df_pivoted = top.pivot(index='customer_id',columns='rank',values='article_id')\
                    .reindex(columns=list(range(1,N+1))).fillna('')
    joined = [' '.join(row).strip(' ') for row in df_pivoted.astype(str).values]
    df_pivoted = pd.DataFrame({'customer_id':list(df_pivoted.index),'pred_model_list':joined})

    # Join to the full list of customers so that everyone gets a row
    df_final = df_customers[['customer_id']].copy()
    df_final = pd.merge(df_final,df_pivoted,how='left',on='customer_id')
    df_final['pred_model_list'] = df_final['pred_model_list'].fillna('').str.split(' ')
    return df_final

# NOTE(review): df_valid_set is not defined in any cell visible in this notebook, so this line
# fails on a fresh kernel. Presumably it is one of the scored outputs written by the training
# pipeline (customer_id, article_id, predict_prob, rank) — confirm and load it explicitly.
df_final = format_final_results(df_valid_set)

## Association analysis - filter down to articles sold in the 2 weeks prior

In [None]:
# For each customer, take the top 3 associations (3 from #1, 2 from #1 and 1 from #2, or 1 from #1,#2,#3)

def _pick_associations(pred_model_list, art_dict):
    """Return up to 3 associated articles for one customer's predicted articles."""
    # Predicted articles that have an entry in the association dictionary, best first
    matches = [art for art in pred_model_list if art in art_dict]
    if not matches:
        return []

    # How many associations to take from each matched article:
    # 1 match -> 3 from it; 2 matches -> 2 + 1; 3 or more -> 1 from each of the first three
    quotas = {1: (3,), 2: (2, 1)}.get(len(matches), (1, 1, 1))

    already_predicted = set(pred_model_list)
    picked = []
    for art, quota in zip(matches, quotas):
        # Never suggest something already predicted or already picked for this customer
        fresh = [c for c in art_dict[art] if c not in already_predicted and c not in picked]
        picked.extend(fresh[:quota])
    return picked


def build_assn_list(df=None, art_dict=None):
    """Build the list of associated articles for every customer.

    Parameters
    ----------
    df : optional DataFrame with a 'pred_model_list' column (list of article ids per
        customer). Defaults to the notebook-level ``df_final``.
    art_dict : optional dict mapping an article id to its ordered list of associated
        article ids. Defaults to the contents of '../Datasets/association_v2.csv'
        (first column = article, remaining columns = associations, ids zero-padded to 10).

    Returns
    -------
    list of lists, one entry per row of `df` in row order, each holding at most 3 article ids.
    When fewer associations are available the shorter list is returned; the previous version
    raised (IndexError, or NameError from a leftover debug `print(hello)`) in that case.
    """
    if df is None:
        df = df_final

    if art_dict is None:
        df_artdict = pd.read_csv('../Datasets/association_v2.csv')
        art_dict = {}
        for x in df_artdict.itertuples():
            art_dict[str(x[1]).zfill(10)] = [str(j).zfill(10) for j in list(x[2:])]

    return [_pick_associations(preds, art_dict) for preds in tqdm(df['pred_model_list'], total=len(df))]


# Attach each customer's associated articles (build_assn_list returns one list per row of df_final)
df_final['Associations'] = build_assn_list()
# Display the resulting frame
df_final

In [None]:
# Association Analysis - find the number of customers who bought each article alongside target article

## READ IN ASSOCIATION DICTIONARY TO BYPASS THE LONG STEP

# df_artdict = pd.read_csv('../Datasets/association_v2.csv')
# art_dict2 = {}
# for x in df_artdict.itertuples():
#     art_dict2[str(x[1]).zfill(10)] = [str(j).zfill(10) for j in list(x[2:])]

# CREATE NEW DATASET (NOTE: THIS TAKES 1.1 HOURS TO RUN FOR 1500 ARTICLES)
# Truncate dataset to articles sold in the last 2 weeks, for scalability

# sold_last_week = df_trans.loc[df_trans['t_dat'] >= '2020-08-01','article_id'].unique()
# df_trans_train2 = df_trans.loc[df_trans['article_id'].isin(sold_last_week)].copy()
# top_articles = df_trans_train2['article_id'].value_counts()[:30000].index.tolist()
# ta = [i for i in top_articles if i not in art_dict2]

# for art_id in tqdm(ta):
#     if art_id in art_dict2:
#         continue
#     buyers = df_trans_train2.loc[df_trans_train2['article_id']==art_id,'customer_id'].unique()
#     others = df_trans_train2.loc[(df_trans_train2['customer_id'].isin(buyers))&(df_trans_train2['article_id'] != art_id),\
#                                'article_id'].value_counts()[:12]
#     art_dict2[art_id] = list(others.index)

## CONVERT ASSOCIATION DICTIONARY INTO A CSV SO WE DON'T NEED TO KEEP REPEATING THIS LONG PROCESS

# df_artdict = pd.DataFrame(art_dict2).T.reset_index().rename(columns={'index':'article_id'})
# df_artdict.to_csv('../Datasets/association_v2.csv',index=False)  # same path that build_assn_list reads from

## Fill remainder from general_pred, and output

### BEST SCORE TRACKER: 9/9 - 9/15

FULL SAMPLING
- No association analysis:
    - 1 XGB model: 0.06413008205486716
    - Ensemble of XGB: ~0.066
    - Best possible: 0.13619358293437947

5% SAMPLING

- JUST BASELINE: 0.014222965110588631
- No association analysis:
    - 1 XGB model, 40 trees: 0.06021429816020317
    - 30 Random Searches, 50-200 trees, early stopping, 2 wks training: 0.06279888629578777
    - 50 Random Searches, 50-200 trees, early stopping, 8 wks training: 0.06310117709380794
    
- Pre-changes: 1 model,  .0626 without association, .0657 with association
- Post-changes: 1 model, .0644 without association, .0674 with association

In [None]:
# Read in our cold start general prediction based on age bracket.
# fillna(2) is applied to the whole merged frame; its visible purpose is to send customers
# without an Age_Bracket to bracket 2.
df_final2 = pd.merge(df_final,df_cust[['customer_id','Age_Bracket']],how='left',on='customer_id').fillna(2)
df_final2 = pd.merge(df_final2,gen_pred_dict,on='Age_Bracket').sort_values(by='customer_id')

del df_final2['Age_Bracket']

# The inner merge on Age_Bracket must not drop (or duplicate) any customer
if len(df_final2) != len(df_final):
    raise ValueError('Merging general predictions changed the number of customers: '
                     +str(len(df_final))+' -> '+str(len(df_final2)))

# Build the final prediction:
# First - concatenate the list of repurchases
# Then - concatenate any purchase associations
# Finally - Append the cold start general predictions for anyone who still has fewer than 12 in their list

final_prediction = []
for x in tqdm(df_final2.itertuples(),total=len(df_final2)):
    # Positional fields: x[2] = repurchase predictions, x[3] = associations,
    # x[4] = first column contributed by gen_pred_dict (the general prediction list)
    repurchases, associations, general = list(x[2]), list(x[3]), list(x[4])
    taken = set(repurchases) | set(associations)
    ans = repurchases + associations + [a for a in general if a not in taken]
    ans = [a for a in ans if a != ''][:12]
    if len(ans) > len(set(ans)) or len(ans) != 12:
        # Fail loudly at the offending customer instead of printing and breaking, which
        # only surfaced later as a confusing length-mismatch error
        raise ValueError('Invalid prediction list for customer '+str(x[1])+': '+str(ans))
    final_prediction.append(' '.join(ans))

# Initialize our final score dataframe from the SAME frame (and row order) the predictions
# were built from. Previously it was taken from df_final while the predictions followed the
# customer_id-sorted df_final2, so the positional assignment was only correct if df_final
# already happened to be sorted by customer_id.
df_score = df_final2[['customer_id']].reset_index(drop=True)
df_score['prediction'] = final_prediction

# Calculate the MAP@12!
calculate_precision(df_score)