# Ranker Model

Read in the datasets created in the previous notebook, run an Optuna experiment to find the best possible hyperparameters, and train the final LightGB Ranker model to generate our final prediction for the Kaggle leaderboard!

## Import statements

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt
import re

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score,f1_score,precision_score,recall_score
from sklearn.model_selection import GridSearchCV,GroupKFold,RandomizedSearchCV

import lightgbm as lgbm

import optuna

from tqdm import tqdm
tqdm.pandas()

## Read in data + fix data types

In [2]:
sample = ''

# Read in articles data
df_art = pd.read_csv('../Data/articles/articles'+sample+'.csv')
df_cust = pd.read_csv('../Data/customers/customers'+sample+'.csv')
df_trans = pd.read_csv('../Data/transactions_train/transactions_train'+sample+'.csv')

In [3]:
# Fix format of article IDs
df_art['article_id'] = df_art['article_id'].astype(str).str.zfill(10)
df_art['detail_desc'] = df_art['detail_desc'].astype(str)
df_trans['article_id'] = df_trans['article_id'].astype(str).str.zfill(10)

# Fix datetime type
df_trans['t_dat'] = pd.to_datetime(df_trans['t_dat'])

# Build df_cust age brackets
df_cust['Age_Bracket'] = pd.cut(df_cust['age'],[1,19,29,39,49,59,200],labels=[1,2,3,4,5,6]).fillna(2)

# Update the color column for df_art
df_art['color'] = np.where(df_art['perceived_colour_master_name'].isin(['Blue','Turquoise','Bluish Green']),'Blue',\
                  np.where(df_art['perceived_colour_master_name'].isin(['Green','Yellowish Green','Khaki green']),'Green',\
                  np.where(df_art['perceived_colour_master_name'].isin(['Brown','Beige','Mole']),'Brown',\
                  np.where(df_art['perceived_colour_master_name'].isin(['Grey','Metal']),'Grey',\
                           df_art['perceived_colour_master_name']))))

In [4]:
# Build training dataset by removing the last 7 days of data

test_start_date = '2020-09-09'
test_end_date = '2020-09-15'

df_trans_train = df_trans.loc[df_trans['t_dat'] < test_start_date,:].copy()
df_trans_test = df_trans.loc[(df_trans['t_dat'] >= test_start_date)&(df_trans['t_dat'] <= test_end_date),:].copy()

# Evaluation function

In [5]:
# Takes in dataframe in submission format, returns MAP@12
def calculate_precision(df_temp):
    df = df_temp.copy()
    df[list(range(1,13))] = pd.DataFrame(df.prediction.str.split(' ').tolist(),index=df.index)
    del df['prediction']
    df2 = pd.melt(df,id_vars = ['customer_id'])
    df2.columns = ['customer_id','ranking','article_id']
    
    df_combined = pd.merge(df2,df_trans_test[['customer_id','article_id']].drop_duplicates(),how='inner')
    df_combined['CumCount'] = df_combined.sort_values(['customer_id','ranking']).groupby('customer_id').cumcount() + 1
    df_combined['Precision'] = df_combined['CumCount'] / df_combined['ranking']
    return df_combined['Precision'].sum()/(df_trans_test['customer_id'].nunique())

# Load in the dataset

In [6]:
pct_sample = ''
filename = 'Full'

# IDENTIFY TEST SET IN FILES INDEX
test_set_ind = 15

files = [\
         '../Datasets/Full'+pct_sample+'/Repurchase_0610_0616_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0617_0623_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0624_0630_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0701_0707_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0708_0714_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0715_0721_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0722_0728_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0729_0804_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0805_0811_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0812_0818_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0819_0825_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0826_0901_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0902_0908_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0909_0915_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_0916_0922_yes.feather',\
         '../Datasets/Full'+pct_sample+'/Repurchase_FULL_yes.feather'\
        ]

In [7]:
# COMBINE ALL TRAINING SETS

df_train_set = pd.DataFrame()

for i,f in enumerate(files):
    if i == test_set_ind:
        break
    print(f)
    df_temp = pd.read_feather(f)
    df_temp['Key'] = df_temp['customer_id'].str[:20]+'_'+str(i)
    df_yeses = df_temp.loc[df_temp['Response'] == 1,'customer_id'].unique()
    df_train_set = pd.concat([df_train_set,df_temp.loc[df_temp['customer_id'].isin(df_yeses)]])
    del df_temp
    del df_yeses

df_train_set = df_train_set.reset_index(drop=True)
print(len(df_train_set))

del df_train_set['article_id']
del df_train_set['customer_id']
if 'HasCustomerBoughtArticle' in df_train_set.columns:
    del df_train_set['HasCustomerBoughtArticle']
    
df_train_set['SaleChangeLastWeek'] = df_train_set['PriceLastWeekVsMean'] / df_train_set['Price2WeeksAgoVsMean']
df_train_set['SaleChangeLastMonth'] = df_train_set['PriceLastWeekVsMean'] / df_train_set['PriceLastMonthVsMean']

../Datasets/Full_Test/Repurchase_0819_0825_yes.feather
../Datasets/Full_Test/Repurchase_0826_0901_yes.feather
../Datasets/Full_Test/Repurchase_0902_0908_yes.feather
1936439


# ENSEMBLING

## OPTUNA

Determine the best possible hyperparameters for the LightGB model

Score the experiments against the **MAP@12 of the result**!

In [None]:
# Create modeling dataset, pull yes instances + 10K no instances
df = df_train_set.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x)).sort_values(by=['Key'])
del df_train_set

qids_train = df.groupby('Key')['Key'].count().to_numpy()
X_train = df.drop(['Response','Key'], axis=1)
y_train = df["Response"]

In [None]:
#READ IN TEST SET
df_test_set = pd.read_feather(files[test_set_ind])
print(files[test_set_ind],len(df_test_set))

if 'HasCustomerBoughtArticle' in df_test_set.columns:
        del df_test_set['HasCustomerBoughtArticle']

purchase_cust = df_test_set.loc[df_test_set['Response']==1,'customer_id']

# Create test output
testDF = df_test_set.loc[df_test_set['customer_id'].isin(purchase_cust),['customer_id','article_id','Response']].copy()

# Create test data (only need response variable if create_test flag is set to true)
df_testX = df_test_set.loc[df_test_set['customer_id'].isin(purchase_cust),\
        [i for i in df_test_set.columns if i not in ['Response','customer_id','article_id','t_dat']]].copy()
del df_test_set
del purchase_cust

print(len(testDF))

n_custs_w_purchase = testDF.loc[testDF['Response']==1,'customer_id'].nunique()
print(n_custs_w_purchase)

In [None]:
def objective(trial):
    global testDF
    param = {
        'reg_lambda': trial.suggest_loguniform('reg_lambda', .0001, 100),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', .0001, 100),
        'min_split_gain': trial.suggest_loguniform('min_split_gain',1e-4,15),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.8,0.9,1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', .01,.4),
        'n_estimators': trial.suggest_int('n_estimators',300,800,step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 6),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 500),
        'num_leaves': trial.suggest_int('num_leaves', 8, 60)
    }
    
    lgbr = lgbm.LGBMRanker(boosting_type='goss',random_state=123,metric='map',**param)

    lgbr.fit(X_train,y_train,group=qids_train,callbacks=[lgbm.log_evaluation(period=0)])

    # Calculate the MAP@12 of the result
    testDF['Pred'] = lgbr.predict(df_testX)
    testDF['Rank'] = testDF.groupby('customer_id')['Pred'].rank('first',ascending=False)
    testScore = testDF.loc[(testDF['Rank'] <= 12)&(testDF['Response']==1)].sort_values(by=['customer_id','Rank'])

    testScore['CumCount'] = testScore.sort_values(['customer_id','Rank']).groupby('customer_id').cumcount() + 1
    testScore['Precision'] = testScore['CumCount'] / testScore['Rank']
    return testScore['Precision'].sum()/n_custs_w_purchase

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

In [None]:
trial1 = study.trials
optuna.visualization.plot_slice(study)

## LightGBM Ranker

Build a final LGBMRanker model against the output hyperparameters of the Optuna experiment!

- Train 10 models, and average the results

In [9]:
## SET UP THE DATASET

# Create modeling dataset, pull yes instances + 10K no instances
df = df_train_set.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x)).sort_values(by=['Key'])
del df_train_set

qids_train = df.groupby('Key')['Key'].count().to_numpy()
X_train = df.drop(['Response','Key'], axis=1)
y_train = df["Response"]

# 5% Data
# params = {'reg_lambda': 4.222088769929572, 'reg_alpha': 1.3345544183135867, 'min_split_gain': 0.005242797796112651,\
#           'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.039022659722173045, 'n_estimators': 420,\
#           'max_depth': 4, 'min_child_weight': 316, 'num_leaves': 8}

# Full Data
params = {'reg_lambda': 35.71755773124274, 'reg_alpha': 0.2518721758654253, 'min_split_gain': 0.00011623209704661628,\
          'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.11023533624106883, 'n_estimators': 700,\
          'max_depth': 5, 'min_child_weight': 450, 'num_leaves': 15}

# Full test
models = []
strt = dt.datetime.now()
for rs in range(10):
    print(rs,dt.datetime.now()-strt)
    lgbr = lgbm.LGBMRanker(boosting_type='goss',random_state=123,metric='map',**params)

    lgbr.fit(X_train,y_train,
            group=qids_train,
            callbacks=[lgbm.log_evaluation(period=0)])
    
    models.append(lgbr)

0 0:00:00
1 0:01:07.505210
2 0:02:15.632743
3 0:03:25.566043
4 0:04:33.642781
5 0:05:42.010696
6 0:06:59.000730
7 0:08:07.893584
8 0:09:16.573622
9 0:10:26.566826


In [10]:
# Calculate the feature importances of the model
{X_train.columns[i]:lgbr.feature_importances_[i] for i in range(len(X_train.columns))}

{'IsRepeat': 7,
 'IsLast30Days': 4,
 'IsAssociation': 72,
 'IsGenPred': 46,
 'IsNeighbor': 47,
 'NumListsAppeared': 69,
 'UniqueDaysCustBoughtArt': 45,
 'DaysSinceCustLastPurchasedArt': 370,
 'age': 121,
 'WeeksFromPeak': 128,
 'PriceLastWeekVsMean': 232,
 'CustomerPropensityToBuySales': 236,
 'ArtCustSalePropensity': 100,
 'Price2WeeksAgoVsMean': 174,
 'PriceLastMonthVsMean': 129,
 'GenderPropensity': 229,
 'AgePropensity': 241,
 'PctTimeFrame': 255,
 'garment_group_noPopularityLastWeek': 233,
 'product_type_noPopularityLastWeek': 239,
 'section_noPopularityLastWeek': 263,
 'RepurchaseFactor_cust': 276,
 'RepurchaseFactor_art': 378,
 'MedianAge': 180,
 'YearsFromMedianAge': 308,
 'DaysSinceFirstSold': 251,
 'DaysSinceLastSold': 32,
 'AverageSalePriceOverall': 222,
 'AverageSalePriceLastWk': 227,
 'PriceOverallStdFromCustomerMean': 179,
 'PriceLastWkStdFromCustomerMean': 247,
 'PctOfPeakWeeklySales': 0,
 'PurchaseRatePerWeek': 232,
 'PurchaseRateLastWeek': 227,
 'LastWeekPopularity': 2

In [11]:
def predict_on_test(file):
    #READ IN TEST SET
    df_test_set = pd.read_feather(file)
    print(file,len(df_test_set))
    if 'HasCustomerBoughtArticle' in df_test_set.columns:
        del df_test_set['HasCustomerBoughtArticle']
        
    df_test_set['SaleChangeLastWeek'] = df_test_set['PriceLastWeekVsMean'] / df_test_set['Price2WeeksAgoVsMean']
    df_test_set['SaleChangeLastMonth'] = df_test_set['PriceLastWeekVsMean'] / df_test_set['PriceLastMonthVsMean']

    # Create test output
    testDF = df_test_set[['customer_id','article_id']].copy()

    # Create test dataset for LGBMRanker model
    df_testX = df_test_set[[i for i in df_test_set.columns if i not in ['Response','customer_id','article_id','t_dat']]].copy()
    del df_test_set

    strt = dt.datetime.now()

    print('Start Predict')
    testDF['predict_score'] = lgbr.predict(df_testX)
    print('Done',dt.datetime.now()-strt)
    del df_testX

    # Rank the candidates for each customer
    testDF['rank'] = testDF.groupby('customer_id')['predict_score'].rank('first',ascending=False)
    
    # Filter model predictions down to top N for each person, and must be above threshold
    N = 12

    df_ans = testDF.loc[(testDF['rank'] <= N),['customer_id','article_id','predict_score','rank']].copy()
    del testDF
    return df_ans

In [12]:
df_valid_set = predict_on_test(files[test_set_ind])

../Datasets/Full_Test/Repurchase_0909_0915_yes.feather 627460
Start Predict
Done 0:00:04.911810


### Format the outputs to match final submission template

In [13]:
def format_outputs(df_vs):
    N = 12
    # Pivot the results, and concatenate together into the final format
    df_pivoted = pd.pivot(df_vs,index='customer_id',columns='rank',values='article_id').fillna('')
    df_pivoted.columns = list(range(1,N+1))
    df_pivoted['pred_model_list'] = df_pivoted[list(range(1,N+1))].apply(\
                                    lambda row: ' '.join(row.values.astype(str)), axis=1).str.strip(' ')
    df_pivoted = df_pivoted[['pred_model_list']].copy().reset_index()
    
    
    # Join to the full list of customers and identify how many predictions are remaining for each
    df_final = df_pivoted[['customer_id']].drop_duplicates().copy()
    df_final = pd.merge(df_final,df_pivoted,how='left',on='customer_id')
    df_final['pred_model_list'] = df_final['pred_model_list'].fillna('').str.split(' ')
    
    df_final['prediction'] = df_final['pred_model_list'].apply(lambda x:' '.join(x))
    del df_final['pred_model_list']
    return df_final

In [14]:
df_final = format_outputs(df_valid_set)

print(len(df_final.loc[df_final['prediction'].str.count(' ') != 11]))
print(len(df_final.loc[df_final['prediction'].str[0]==' ']))
print(len(df_final.loc[df_final['prediction'].str[-1]==' ']))

display(df_final)

0
0
0


Unnamed: 0,customer_id,prediction
0,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0706016001 0448509014 0715624001 0673677002 07...
1,000fb6e772c5d0023892065e659963da90b1866035558e...,0831684001 0740519002 0158340001 0915529003 09...
2,00125440be6cd148c3599b9c5a2d55f5838c1b0257d356...,0892279001 0837443002 0772902010 0572998001 04...
3,002156b708c7c6dd8afe31a743131d13b1e5dcbf2ce8c4...,0896152002 0897146002 0896152001 0896152003 08...
4,00281c683a8eb0942e22d88275ad756309895813e0648d...,0849591004 0870528003 0849591001 0870528002 08...
...,...,...
11734,fff2282977442e327b45d8c89afde25617d00124d0f999...,0919365008 0919786001 0919786002 0889496001 09...
11735,fff2ef796ece5299d08227c49353043a92d61a3cdf4880...,0706016002 0797710001 0669091003 0851606001 07...
11736,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,0614854005 0351484039 0448509018 0858147001 08...
11737,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0914441004 0652924004 0914441001 0706016001 07...


## OUTPUT

### BEST SCORE TRACKER: 9/9 - 9/15

FULL SAMPLING
- No association analysis:
    - 1 XGB model: 0.06413008205486716
    - Ensemble of XGB: ~0.066
    - Repeat_0909_0915 + association + gen_pred_dict age: 0.0675903687192387
    - Best possible: 0.13619358293437947
    - Ranking model, full dataset, 1 LGBR: 0.07444375549285465
    - Ranking model, only power users dataset + defaults for others, 1 LGBR: 0.07063352760425903 (WORSE)
    - Ranking model, updated dataset + new hyperparameters, 20 LGBR: 0.07354104507494727

5% SAMPLING

- JUST BASELINE: 0.014222965110588631
- No association analysis:
    - 1 XGB model, 40 trees: 0.06021429816020317
    - 30 Random Searches, 50-200 trees, early stopping, 2 wks training: 0.06279888629578777
    - 50 Random Searches, 50-200 trees, early stopping, 8 wks training: 0.06310117709380794
    
- With ranking:
    - 1 static model: 0.07120919702564296
    - 20 static models: 0.07292343488960441
    - 50 static models: 0.07342069763355963

In [15]:
calculate_precision(df_final)

0.07544701457349554

# Write results to csv

In [None]:
output_df = df_final
filename = 'rankerNewOptunaHP_RepeatNoThres_12assn_last30days_genpredage_neighbors12_7wktrain_v2.csv'

output_df.to_csv('../Submissions/' + filename,index=False)