In [1]:
import os
import pandas as pd
import numpy as np
import s3fs
from datetime import datetime
from datetime import timedelta
import calendar
import boto3
import io
import ast

## Load the data

In [2]:
sample_sub_df = pd.read_csv('data/sample_submission_2_adj.csv').drop(["Unnamed: 0"], axis = 1)
sample_sub_df.head(1)

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02


In [3]:
# Read labels
label_1_df = pd.read_csv("data/label_1.csv", header = None)\
               .rename(columns = {0:"user_id_hash", 
                                  1:"user_purchase_binary_7_days"})
label_2_df = pd.read_csv("data/label_2.csv", header = None)\
               .rename(columns = {0:"user_id_hash", 
                                  1:"user_purchase_binary_14_days"})
label_1_df.head(1)

Unnamed: 0,user_id_hash,user_purchase_binary_7_days
0,bf676611754201cc93ca1e2bcce8ca2c7ff0105186ebc0...,1.0


### Data Dictionary for Total
0 - user_id  
1 - total number of sessions  
2 - total number of events  
3 - median number of events  
4 - total sum of purchases  
5 - total amount of purchases  
6 - median session duration

In [4]:
# Read Total
total_train_df = pd.read_csv("data/train_total.csv", header = None)\
                   .rename(columns = {0:"user_id_hash", 
                                      1:"num_sessions",
                                      2:"num_events",
                                      3:"median_num_events",
                                      4:"num_purchases",
                                      5:"amt_purchases",
                                      6:"median_session_duration"})
total_test_df = pd.read_csv("data/test_total.csv", header = None)\
                  .rename(columns = {0:"user_id_hash", 
                                     1:"num_sessions",
                                     2:"num_events",
                                     3:"median_num_events",
                                     4:"num_purchases",
                                     5:"amt_purchases",
                                     6:"median_session_duration"})
total_train_df.head(1)

Unnamed: 0,user_id_hash,num_sessions,num_events,median_num_events,num_purchases,amt_purchases,median_session_duration
0,0d261313961125c71f330a8a8d59857bc48fefd0b4278c...,1,35,35.0,0,0.0,1626343.0


In [5]:
# Read Last 7 Days
train_7_df = pd.read_csv("feature_eng/train_last_7_days.csv", header = None)\
                   .rename(columns = {0:"user_id_hash", 
                                      1:"num_sessions_7",
                                      2:"num_events_7",
                                      3:"median_num_events_7",
                                      4:"num_purchases_7",
                                      5:"amt_purchases_7",
                                      6:"median_session_duration_7"})
test_7_df = pd.read_csv("feature_eng/test_last_7_days.csv", header = None)\
                  .rename(columns = {0:"user_id_hash", 
                                     1:"num_sessions_7",
                                     2:"num_events_7",
                                     3:"median_num_events_7",
                                     4:"num_purchases_7",
                                     5:"amt_purchases_7",
                                     6:"median_session_duration_7"})
train_7_df.head(1)

Unnamed: 0,user_id_hash,num_sessions_7,num_events_7,median_num_events_7,num_purchases_7,amt_purchases_7,median_session_duration_7
0,c7385007b6f158e913411d1563d1f5ef90ea003050b105...,1,1,1.0,0,0.0,0.0


In [6]:
# Read Last 14 Days
train_14_df = pd.read_csv("feature_eng/train_last_14_days.csv", header = None)\
                   .rename(columns = {0:"user_id_hash", 
                                      1:"num_sessions_14",
                                      2:"num_events_14",
                                      3:"median_num_events_14",
                                      4:"num_purchases_14",
                                      5:"amt_purchases_14",
                                      6:"median_session_duration_14"})
test_14_df = pd.read_csv("feature_eng/test_last_14_days.csv", header = None)\
                  .rename(columns = {0:"user_id_hash", 
                                     1:"num_sessions_14",
                                     2:"num_events_14",
                                     3:"median_num_events_14",
                                     4:"num_purchases_14",
                                     5:"amt_purchases_14",
                                     6:"median_session_duration_14"})
train_14_df.head(1)

Unnamed: 0,user_id_hash,num_sessions_14,num_events_14,median_num_events_14,num_purchases_14,amt_purchases_14,median_session_duration_14
0,a7d68ea7dcf1bc9fc2d455c19d1ba409951609735a543b...,1,1,1.0,0,0.0,0.0


### Data Dictionary for Sessions
0 - user_id  
1 - date_user_created   
2 - most_freq_country  
3 - most_freq_os  
4 - num_uniq_device_id

In [7]:
# Read Sessions
sessions_df = pd.read_csv("feature_eng/sessions.csv", header = None)\
                   .rename(columns = {0:"user_id_hash", 
                                      1:"date_user_created",
                                      2:"most_freq_country",
                                      3:"most_freq_os",
                                      4:"num_uniq_device_id"})\
                   .dropna()
sessions_df.head(1)

Unnamed: 0,user_id_hash,date_user_created,most_freq_country,most_freq_os,num_uniq_device_id
0,46d54b4fab292c461cdf8f3825f7106907b300b2d28cf3...,2018-10-19 22:04:58,US,iOS,1


## Feature Engineering

In [8]:
train_fe = pd.merge(total_train_df, sessions_df, how = "left").dropna()
train_fe = pd.merge(train_fe, train_7_df, how = "left").fillna(0)
train_fe = pd.merge(train_fe, train_14_df, how = "left").fillna(0)
test_fe = pd.merge(total_test_df, sessions_df, how = "left").dropna()
test_fe = pd.merge(test_fe, test_7_df, how = "left").fillna(0)
test_fe = pd.merge(test_fe, test_14_df, how = "left").fillna(0)

In [9]:
# Check null values
for column in train_fe.columns:
    if train_fe[column].isnull().any():
        print('{0} has {1} null values'.format(column, train_fe[column].isnull().sum()))
for column in test_fe.columns:
    if test_fe[column].isnull().any():
        print('{0} has {1} null values'.format(column, test_fe[column].isnull().sum()))

### Create column for Number of Days Existed

In [10]:
# Create column for Number of Days Existed
# For training, compute days between first date and Dec.2
# For testing, compute days between first date and Dec.16
train_fe["num_days_existed"] = [(datetime(2018, 12, 2) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                for date in train_fe.date_user_created.values]
test_fe["num_days_existed"] = [(datetime(2018, 12, 16) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                for date in test_fe.date_user_created.values]

### Create column for Days Since Most Recent Session

In [11]:
most_recent_train_df = pd.read_csv('feature_eng/most_recent_session_train.csv', header = None)\
                         .rename(columns = {0:"user_id_hash", 1:"most_recent_session"})
most_recent_test_df = pd.read_csv('feature_eng/most_recent_session_test.csv', header = None)\
                        .rename(columns = {0:"user_id_hash", 1:"most_recent_session"})

In [12]:
train_fe_adj = pd.merge(train_fe, most_recent_train_df, how = 'left')
test_fe_adj = pd.merge(test_fe, most_recent_test_df, how = 'left')
train_fe_adj2 = train_fe_adj
test_fe_adj2 = test_fe_adj
train_fe_adj2["most_recent_session"] = train_fe_adj["most_recent_session"].fillna(train_fe_adj["date_user_created"])
test_fe_adj2["most_recent_session"] = test_fe_adj["most_recent_session"].fillna(test_fe_adj["date_user_created"])

In [13]:
train_fe["days_since_last_session"] = [(datetime(2018, 12, 2) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                        for date in train_fe_adj2.most_recent_session.values]
test_fe["days_since_last_session"] = [(datetime(2018, 12, 16) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                        for date in test_fe_adj2.most_recent_session.values]

### Create column for frequent OS

In [14]:
train_fe["most_freq_os_android"] = [1 if x == 'Android OS' else 0 for x in train_fe.most_freq_os.values]
train_fe["most_freq_os_ios"] = [1 if x != 'Android OS' else 0 for x in train_fe.most_freq_os.values]
test_fe["most_freq_os_android"] = [1 if x == 'Android OS' else 0 for x in test_fe.most_freq_os.values]
test_fe["most_freq_os_ios"] = [1 if x != 'Android OS' else 0 for x in test_fe.most_freq_os.values]

### Create dummies regarding regions

In [15]:
country_code_df = pd.read_csv("country_code.csv", header = None)
country_code_df = country_code_df.loc[1:,[1,5,6]]
country_code_df = country_code_df[country_code_df[1] != 'AQ'].dropna()
country_code_dict = {}
country_code_dict['ZZ'] = "Unknown"
country_code_dict['XK'] = "Southern Europe"
country_code_dict['NAM'] = "Sub-Saharan Africa"
for country_code in country_code_df[1].values:
    country_code_dict[country_code] = country_code_df[country_code_df[1]==country_code][6].values[0]

In [16]:
train_fe["most_freq_region"] = [country_code_dict[x] for x in train_fe.most_freq_country.values]
test_fe["most_freq_region"] = [country_code_dict[x] for x in test_fe.most_freq_country.values]

In [17]:
region_list = []
for region in set(train_fe.most_freq_region.values):
    if list(train_fe.most_freq_region.values).count(region) > 20000:
        print(region, list(train_fe.most_freq_region.values).count(region))
        region_list.append(region)

Southern Asia 57538
Sub-Saharan Africa 23872
Western Europe 21095
Latin America and the Caribbean 24213
Australia and New Zealand 20900
Northern America 297832
South-eastern Asia 55423
Northern Europe 60393


In [18]:
for region in region_list:
    col_name = f"is_{region.replace(' ','_').lower()}"
    train_fe[col_name] = [1 if x == region else 0 for x in train_fe.most_freq_region.values]
    test_fe[col_name] = [1 if x == region else 0 for x in test_fe.most_freq_region.values]

In [19]:
train_fe.head(1)

Unnamed: 0,user_id_hash,num_sessions,num_events,median_num_events,num_purchases,amt_purchases,median_session_duration,date_user_created,most_freq_country,most_freq_os,...,most_freq_os_ios,most_freq_region,is_southern_asia,is_sub-saharan_africa,is_western_europe,is_latin_america_and_the_caribbean,is_australia_and_new_zealand,is_northern_america,is_south-eastern_asia,is_northern_europe
0,0d261313961125c71f330a8a8d59857bc48fefd0b4278c...,1,35,35.0,0,0.0,1626343.0,2018-10-13 07:33:07,ID,iOS,...,1,South-eastern Asia,0,0,0,0,0,0,1,0


### Create Labels

In [20]:
train_fe = pd.merge(train_fe, label_1_df, how = 'left').fillna(0)
train_fe = pd.merge(train_fe, label_2_df, how = 'left').fillna(0)

## Machine Learning

In [21]:
train_ml = train_fe.copy()
test_ml = test_fe.copy()

In [22]:
# Check if the labels are correct. The expected outcome is {0.0}.
set(train_ml[train_ml.user_purchase_binary_7_days != train_ml.user_purchase_binary_14_days]\
 .user_purchase_binary_7_days)

{0.0}

In [23]:
train_ml.columns

Index(['user_id_hash', 'num_sessions', 'num_events', 'median_num_events',
       'num_purchases', 'amt_purchases', 'median_session_duration',
       'date_user_created', 'most_freq_country', 'most_freq_os',
       'num_uniq_device_id', 'num_sessions_7', 'num_events_7',
       'median_num_events_7', 'num_purchases_7', 'amt_purchases_7',
       'median_session_duration_7', 'num_sessions_14', 'num_events_14',
       'median_num_events_14', 'num_purchases_14', 'amt_purchases_14',
       'median_session_duration_14', 'num_days_existed',
       'days_since_last_session', 'most_freq_os_android', 'most_freq_os_ios',
       'most_freq_region', 'is_southern_asia', 'is_sub-saharan_africa',
       'is_western_europe', 'is_latin_america_and_the_caribbean',
       'is_australia_and_new_zealand', 'is_northern_america',
       'is_south-eastern_asia', 'is_northern_europe',
       'user_purchase_binary_7_days', 'user_purchase_binary_14_days'],
      dtype='object')

In [24]:
# train_ml["is_purchase_7"] = [1.0 if y > 0 else 0.0 for y in train_ml.num_purchases_7]
# train_ml["is_purchase_ever"] = [1.0 if y > 0 else 0.0 for y in train_ml.num_purchases]
# train_ml["is_session_7"] = [1.0 if y > 0 else 0.0 for y in train_ml.num_sessions_7]
# train_ml["is_purchase_14"] = [1.0 if y > 0 else 0.0 for y in train_ml.num_purchases_14]
# train_ml["is_session_14"] = [1.0 if y > 0 else 0.0 for y in train_ml.num_sessions_14]
# train_ml['purchase_per_day'] = train_ml.num_purchases/train_ml.num_days_existed
# test_ml["is_purchase_7"] = [1.0 if y > 0 else 0.0 for y in test_ml.num_purchases_7]
# test_ml["is_purchase_ever"] = [1.0 if y > 0 else 0.0 for y in test_ml.num_purchases]
# test_ml["is_session_7"] = [1.0 if y > 0 else 0.0 for y in test_ml.num_sessions_7]
# test_ml["is_purchase_14"] = [1.0 if y > 0 else 0.0 for y in test_ml.num_purchases_14]
# test_ml["is_session_14"] = [1.0 if y > 0 else 0.0 for y in test_ml.num_sessions_14]
# test_ml['purchase_per_day'] = test_ml.num_purchases/test_ml.num_days_existed

In [25]:
# # per num_days_existed
# train_ml["num_uniq_session_db_num_days_existed"] = train_ml.num_sessions / train_ml.num_days_existed
# test_ml["num_uniq_session_db_num_days_existed"] = test_ml.num_sessions / test_ml.num_days_existed
# train_ml["total_purchase_amt_db_num_days_existed"] = train_ml.amt_purchases / train_ml.num_days_existed
# test_ml["total_purchase_amt_db_num_days_existed"] = test_ml.amt_purchases / test_ml.num_days_existed
# # per num_uniq_session_id
# train_ml["total_purchase_amt_db_num_uniq_session_id"] = train_ml.amt_purchases / train_ml.num_sessions
# test_ml["total_purchase_amt_db_num_uniq_session_id"] = test_ml.amt_purchases / test_ml.num_sessions
# train_ml["total_purchase_amt_db_total_num_purchase"] = train_ml.amt_purchases / train_ml.num_purchases
# test_ml["total_purchase_amt_db_total_num_purchase"] = test_ml.amt_purchases / test_ml.num_purchases
# train_ml = train_ml.fillna(0)
# test_ml = test_ml.fillna(0)

In [26]:
# train_ml["amt_purchase_past_wk_per_purchase"] = train_ml.amt_purchases_7 / train_ml.num_purchases_7
# test_ml["amt_purchase_past_wk_per_purchase"] = test_ml.amt_purchases_7 / test_ml.num_purchases_7
# train_ml["amt_purchase_past_wk_per_session"] = train_ml.amt_purchases_7 / train_ml.num_sessions_7
# test_ml["amt_purchase_past_wk_per_session"] = test_ml.amt_purchases_7 / test_ml.num_sessions_7
# train_ml = train_ml.fillna(0)
# test_ml = test_ml.fillna(0)

In [60]:
# Set labels
labels_1 = train_ml.user_purchase_binary_7_days
labels_2 = train_ml.user_purchase_binary_14_days
features_train = train_ml[['num_purchases','days_since_last_session']]
features_test = test_ml[['num_purchases','days_since_last_session']]

In [61]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(features_train.values, i) for i in range(features_train.shape[1])]
vif["features"] = features_train.columns
vif.sort_values(by = 'VIF_Factor', ascending=False)

Unnamed: 0,VIF_Factor,features
0,1.002986,num_purchases
1,1.002986,days_since_last_session


In [29]:
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

## Label 1

In [62]:
# Resampling
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
rus = RandomUnderSampler()
features_res_1, labels_res_1 = rus.fit_sample(features_train, labels_1)
print(f"Resampled dataset shape: {Counter(labels_res_1)}")

Resampled dataset shape: Counter({0.0: 4396, 1.0: 4396})


In [63]:
from sklearn.model_selection import train_test_split
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(features_res_1, labels_res_1, test_size=0.3)

In [64]:
from sklearn.linear_model import LogisticRegression
lr_1 = LogisticRegression(class_weight = "balanced")
lr_1.fit(X_train_1, y_train_1)
lr_1_pred = lr_1.predict_proba(X_val_1)[:,1]
print(f"ROC AUC for lr model 1: {roc_auc_score(y_val_1, lr_1_pred):.4f}")
print(f"Log Loss for lr model 1: {log_loss(y_val_1, lr_1_pred):.4f}")

ROC AUC for lr model 1: 0.9583
Log Loss for lr model 1: 0.2986




In [65]:
from sklearn.naive_bayes import BernoulliNB
nb_1 = BernoulliNB(fit_prior=True)
nb_1.fit(X_train_1, y_train_1)
nb_1_pred = nb_1.predict_proba(X_val_1)[:,1]
print(f"ROC AUC for nb model 1: {roc_auc_score(y_val_1, nb_1_pred):.4f}")
print(f"Log Loss for nb model 1: {log_loss(y_val_1, nb_1_pred):.4f}")

ROC AUC for nb model 1: 0.9276
Log Loss for nb model 1: 0.2874


In [72]:
from sklearn.ensemble import RandomForestClassifier
rf_1 = RandomForestClassifier()
rf_1.fit(X_train_1, y_train_1)
rf_1_pred = rf_1.predict_proba(X_val_1)[:,1]
print(f"ROC AUC for rf model 1: {roc_auc_score(y_val_1, rf_1_pred):.4f}")
print(f"Log Loss for rf model 1: {log_loss(y_val_1, rf_1_pred):.4f}")

ROC AUC for rf model 1: 0.9592
Log Loss for rf model 1: 0.4845




In [66]:
import xgboost as xgb
model_1 = xgb.XGBClassifier(
 learning_rate =0.001,
 n_estimators=9,
 max_depth=6,
 min_child_weight=1,
 subsample=0.3,
 colsample_bytree=0.5,
 objective= 'binary:logistic')

xgb_1 = model_1.fit(X_train_1, y_train_1)
xgb_1_pred = xgb_1.predict_proba(X_val_1)[:,1]
print(f"ROC AUC for xgb model 1: {roc_auc_score(y_val_1, xgb_1_pred):.4f}")
print(f"Log Loss for xgb model 1: {log_loss(y_val_1, xgb_1_pred):.4f}")

ROC AUC for xgb model 1: 0.9647
Log Loss for xgb model 1: 0.6882


## Label 2

In [73]:
# Resampling
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
rus = RandomUnderSampler()
features_res_2, labels_res_2 = rus.fit_sample(features_train, labels_2)
print(f"Resampled dataset shape: {Counter(labels_res_2)}")

Resampled dataset shape: Counter({0.0: 5553, 1.0: 5553})


In [74]:
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(features_res_2, labels_res_2, test_size=0.3)

In [75]:
lr_2 = LogisticRegression(class_weight = "balanced")
lr_2.fit(X_train_2, y_train_2)
lr_2_pred = lr_2.predict_proba(X_val_2)[:,1]
print(f"ROC AUC for lr model 2: {roc_auc_score(y_val_2, lr_2_pred):.4f}")
print(f"Log Loss for lr model 2: {log_loss(y_val_2, lr_2_pred):.4f}")

ROC AUC for lr model 2: 0.9459
Log Loss for lr model 2: 0.3436




In [76]:
nb_2 = BernoulliNB(fit_prior=True)
nb_2.fit(X_train_2, y_train_2)
nb_2_pred = nb_2.predict_proba(X_val_2)[:,1]
print(f"ROC AUC for nb model 2: {roc_auc_score(y_val_2, nb_2_pred):.4f}")
print(f"Log Loss for nb model 2: {log_loss(y_val_2, nb_2_pred):.4f}")

ROC AUC for nb model 2: 0.9090
Log Loss for nb model 2: 0.3239


In [77]:
rf_2 = RandomForestClassifier()
rf_2.fit(X_train_2, y_train_2)
rf_2_pred = rf_2.predict_proba(X_val_2)[:,1]
print(f"ROC AUC for rf model 2: {roc_auc_score(y_val_2, rf_2_pred):.4f}")
print(f"Log Loss for rf model 2: {log_loss(y_val_2, rf_2_pred):.4f}")

ROC AUC for rf model 2: 0.9449
Log Loss for rf model 2: 0.5674




In [86]:
import xgboost as xgb
model_2 = xgb.XGBClassifier(
 learning_rate =0.001,
 n_estimators=9,
 max_depth=6,
 min_child_weight=1,
 subsample=0.3,
 colsample_bytree=0.5,
 objective= 'binary:logistic')

xgb_2 = model_2.fit(X_train_2, y_train_2)
xgb_2_pred = xgb_1.predict_proba(X_val_2)[:,1]
print(f"ROC AUC for xgb model 2: {roc_auc_score(y_val_2, xgb_2_pred):.4f}")
print(f"Log Loss for xgb model 2: {log_loss(y_val_2, xgb_2_pred):.4f}")

ROC AUC for xgb model 2: 0.9554
Log Loss for xgb model 2: 0.6883
