In [3]:
import os
import pandas as pd
import numpy as np
import s3fs
from datetime import datetime
from datetime import timedelta
import calendar
import boto3
import io
import ast

## Setup s3

In [2]:
# aws
aws_filename = "my_aws.txt"
with open (aws_filename, 'r') as f:
    aws_id_key = f.read()
access_id, access_key = aws_id_key.replace("\n","").split(',')

In [3]:
s3 = boto3.client('s3',
                  aws_access_key_id=access_id,
                  aws_secret_access_key=access_key)

## Read train data

In [25]:
obj_train = s3.get_object(Bucket='msds630-kaggle-competition', Key='feature_engineered_train.csv/part-00000')
raw_train = pd.read_csv(obj_train['Body'], header = None)

In [68]:
raw_train.head() # Check the head

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,['eb37a43eb25770c442b54a4943c7f9eb17278328dd04...,9,3560204.0,'2018-11-15 09:53:20','IN','Android OS',1,84.0,0.0,0.0,-1.0,-1.0]
1,['e2c1b7686b247d8ce10d9afbde701ae2a9203731edf8...,1,0.0,'2018-11-25 17:50:00','US','Android OS',1,4.0,0.0,0.0,-1.0,-1.0]
2,['feea441cbaf2df60e772241695d5809cf4bc8f516c34...,1,0.0,'2018-11-14 23:01:40','PH','Android OS',1,15.0,0.0,0.0,-1.0,-1.0]
3,['bf23f836143cc7fb00ec1e6c570e211c601628ad5680...,1,0.0,'2018-11-18 16:33:20','US','Android OS',1,36.0,0.0,0.0,-1.0,-1.0]
4,['b18aa61b3cec4bd7168de7e5037561dc44ec3051c461...,1,0.0,'2018-11-08 17:26:40','US','Android OS',1,9.0,0.0,0.0,-1.0,-1.0]


In [157]:
# Clean the data
raw_train_adj = pd.DataFrame()
raw_train_adj["user_hash_id"] = [ast.literal_eval(elem + "]")[0] for elem in raw_train[0].values]
raw_train_adj["num_uniq_session_id"] = raw_train[1].values
raw_train_adj["mean_prev_session"] = raw_train[2].values
raw_train_adj["date_user_created"] = [elem.replace("\'", "").strip() for elem in raw_train[3].values]
raw_train_adj["most_freq_country"] = [elem.replace("\'", "").strip() for elem in raw_train[4].values]
raw_train_adj["most_freq_os"] = [elem.replace("\'", "").strip() for elem in raw_train[5].values]
raw_train_adj["num_uniq_device_id"] = raw_train[6].values
raw_train_adj["max_num_events_a_session"] = raw_train[7].values
raw_train_adj["total_num_purchase"] = raw_train[8].values
raw_train_adj["total_purchase_amt"] = raw_train[9].values
raw_train_adj["label_1"] = raw_train[10].values
raw_train_adj["label_2"] = [ast.literal_eval("[" + elem)[0] for elem in raw_train[11].values]

In [158]:
raw_train_adj.head() # Check the head

Unnamed: 0,user_hash_id,num_uniq_session_id,mean_prev_session,date_user_created,most_freq_country,most_freq_os,num_uniq_device_id,max_num_events_a_session,total_num_purchase,total_purchase_amt,label_1,label_2
0,eb37a43eb25770c442b54a4943c7f9eb17278328dd0499...,9,3560204.0,2018-11-15 09:53:20,IN,Android OS,1,84.0,0.0,0.0,-1.0,-1.0
1,e2c1b7686b247d8ce10d9afbde701ae2a9203731edf81a...,1,0.0,2018-11-25 17:50:00,US,Android OS,1,4.0,0.0,0.0,-1.0,-1.0
2,feea441cbaf2df60e772241695d5809cf4bc8f516c347c...,1,0.0,2018-11-14 23:01:40,PH,Android OS,1,15.0,0.0,0.0,-1.0,-1.0
3,bf23f836143cc7fb00ec1e6c570e211c601628ad5680eb...,1,0.0,2018-11-18 16:33:20,US,Android OS,1,36.0,0.0,0.0,-1.0,-1.0
4,b18aa61b3cec4bd7168de7e5037561dc44ec3051c461b2...,1,0.0,2018-11-08 17:26:40,US,Android OS,1,9.0,0.0,0.0,-1.0,-1.0


## Read test data

In [18]:
obj_test = s3.get_object(Bucket='msds630-kaggle-competition', Key='feature_engineered_test.csv/part-00000')
raw_test = pd.read_csv(obj_test['Body'], header = None)

In [19]:
raw_test.head() # Check the head

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,['d6cf11a892144d6d2b5d953a5dba12033b2041655aaf...,46,33233950.0,'2018-10-06 05:05:00','US','iOS',1,63.0,12.0,35.61600000000001]
1,['2b0dae5cbb692617108ec258eee1f996a61beec0dc87...,1,0.0,'2018-10-13 19:46:40','AT','iOS',1,2.0,0.0,0.0]
2,['91dc64176a5e7dde98de44444c53cadc491c0d576332...,1,0.0,'2018-10-29 16:05:00','CV','iOS',1,13.0,0.0,0.0]
3,['5c07a4b87ac4e07894cc50667c01ae688a9e30e5e214...,6,3968576.0,'2018-11-01 17:38:20','BS','Android OS',1,42.0,0.0,0.0]
4,['a0701427d30f102e4187a9abaacf381f5a46a1dd964f...,4,2468833.0,'2018-11-19 23:31:40','US','iOS',1,50.0,0.0,0.0]


In [159]:
# Clean the data
raw_test_adj = pd.DataFrame()
raw_test_adj["user_hash_id"] = [ast.literal_eval(elem + "]")[0] for elem in raw_test[0].values]
raw_test_adj["num_uniq_session_id"] = raw_test[1].values
raw_test_adj["mean_prev_session"] = raw_test[2].values
raw_test_adj["date_user_created"] = [elem.replace("\'", "").strip() for elem in raw_test[3].values]
raw_test_adj["most_freq_country"] = [elem.replace("\'", "").strip() for elem in raw_test[4].values]
raw_test_adj["most_freq_os"] = [elem.replace("\'", "").strip() for elem in raw_test[5].values]
raw_test_adj["num_uniq_device_id"] = raw_test[6].values
raw_test_adj["max_num_events_a_session"] = raw_test[7].values
raw_test_adj["total_num_purchase"] = raw_test[8].values
raw_test_adj["total_purchase_amt"] = [ast.literal_eval("[" + elem)[0] for elem in raw_test[9].values]

In [160]:
raw_test_adj.head()

Unnamed: 0,user_hash_id,num_uniq_session_id,mean_prev_session,date_user_created,most_freq_country,most_freq_os,num_uniq_device_id,max_num_events_a_session,total_num_purchase,total_purchase_amt
0,d6cf11a892144d6d2b5d953a5dba12033b2041655aaf26...,46,33233950.0,2018-10-06 05:05:00,US,iOS,1,63.0,12.0,35.616
1,2b0dae5cbb692617108ec258eee1f996a61beec0dc8711...,1,0.0,2018-10-13 19:46:40,AT,iOS,1,2.0,0.0,0.0
2,91dc64176a5e7dde98de44444c53cadc491c0d57633288...,1,0.0,2018-10-29 16:05:00,CV,iOS,1,13.0,0.0,0.0
3,5c07a4b87ac4e07894cc50667c01ae688a9e30e5e2147a...,6,3968576.0,2018-11-01 17:38:20,BS,Android OS,1,42.0,0.0,0.0
4,a0701427d30f102e4187a9abaacf381f5a46a1dd964ffe...,4,2468833.0,2018-11-19 23:31:40,US,iOS,1,50.0,0.0,0.0


In [161]:
# read test_sample
# Make sure you have sample_submission_2.csv in your local path
sample_sub_df = pd.read_csv("sample_submission_2.csv")
sample_sub_df.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01,0.02
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.01,0.02


In [163]:
# Check how many users in sample submission
print(f"Number of users in sample submission: {len(sample_sub_df)}.")
print(f"Number of users in test dataset: {len(raw_test_adj)}.")

Number of users in sample submission: 312568.
Number of users in test dataset: 619520.


In [164]:
users_in_test_n_sample_sub = list(set(raw_test_adj.user_hash_id.values) - 
                                  (set(raw_test_adj.user_hash_id.values) - set(sample_sub_df.user_id_hash.values)))
print(f"Number of users in intersection: {len(users_in_test_n_sample_sub)}.")

Number of users in intersection: 311565.


In [165]:
users_in_sample_sub_not_in_test = list(set(sample_sub_df.user_id_hash.values) - set(raw_test_adj.user_hash_id.values))
print(f"Number of users in sample_sub not in test: {len(users_in_sample_sub_not_in_test)}.")

Number of users in sample_sub not in test: 1003.


In [166]:
# Impute zero for users that don't show up in the sample_sub
for user in users_in_sample_sub_not_in_test:
    sample_sub_df.loc[sample_sub_df.user_id_hash == user, 'user_purchase_binary_7_days'] = 0
    sample_sub_df.loc[sample_sub_df.user_id_hash == user, 'user_purchase_binary_14_days'] = 0

In [187]:
# Generate the test dataset only using id's from sample_sub
desired_idx_list = []
id_list_in_test_dict = {}
for i, user in enumerate(list(raw_test_adj.user_hash_id.values)):
    id_list_in_test_dict[user] = i
for user in users_in_test_n_sample_sub:
    desired_idx_list.append(id_list_in_test_dict[user])
len(desired_idx_list)

311565

In [190]:
raw_test_adj2 = raw_test_adj.loc[desired_idx_list,]
len(raw_test_adj2)

311565

In [196]:
# Test if the intersection between raw_test_adj2 and sample_sub_df is 311565
len(set(raw_test_adj2.user_hash_id.values) - 
    (set(raw_test_adj2.user_hash_id.values) - set(sample_sub_df.user_id_hash.values)))

311565

In [199]:
# Save df's locally
raw_train_adj.to_csv('data/train.csv')
raw_test_adj2.to_csv('data/test.csv')
sample_sub_df.to_csv('data/sample_submission_2_adj.csv')

## Feature Engineering

In [146]:
sample_sub_df = pd.read_csv('data/sample_submission_2_adj.csv').drop(["Unnamed: 0"], axis = 1)
train_fe = pd.read_csv('data/train.csv').drop(["Unnamed: 0"], axis = 1)
test_fe = pd.read_csv('data/test.csv').drop(["Unnamed: 0"], axis = 1)

### Create labels

In [147]:
# Convert -1.0 to 0.0 in labels
train_fe["label_1"] = [0.0 if x == -1.0 else x for x in train_fe.label_1.values]
train_fe["label_2"] = [0.0 if x == -1.0 else x for x in train_fe.label_2.values]

### Create column for Number of Days Existed

In [148]:
# Create column for Number of Days Existed
# For training, compute days between first date and Dec.2
# For testing, compute days between first date and Dec.16
train_fe["num_days_existed"] = [(datetime(2018, 12, 2) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                for date in train_fe.date_user_created.values]
test_fe["num_days_existed"] = [(datetime(2018, 12, 16) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                for date in test_fe.date_user_created.values]

### Create column for Days Since Most Recent Session

In [149]:
most_recent_train_df = pd.read_csv('feature_eng/most_recent_session_train.csv', header = None)\
                         .rename(columns = {0:"user_hash_id", 1:"most_recent_session"})
most_recent_test_df = pd.read_csv('feature_eng/most_recent_session_test.csv', header = None)\
                        .rename(columns = {0:"user_hash_id", 1:"most_recent_session"})

In [150]:
train_fe_adj = pd.merge(train_fe, most_recent_train_df, how = 'left')
test_fe_adj = pd.merge(test_fe, most_recent_test_df, how = 'left')
train_fe_adj2 = train_fe_adj
test_fe_adj2 = test_fe_adj
train_fe_adj2["most_recent_session"] = train_fe_adj["most_recent_session"].fillna(train_fe_adj["date_user_created"])
test_fe_adj2["most_recent_session"] = test_fe_adj["most_recent_session"].fillna(test_fe_adj["date_user_created"])

In [151]:
train_fe["days_since_last_session"] = [(datetime(2018, 12, 2) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                        for date in train_fe_adj2.most_recent_session.values]
test_fe["days_since_last_session"] = [(datetime(2018, 12, 16) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                        for date in test_fe_adj2.most_recent_session.values]

### Create column for frequent OS

In [152]:
train_fe["most_freq_os_android"] = [1 if x == 'Android OS' else 0 for x in train_fe.most_freq_os.values]
train_fe["most_freq_os_ios"] = [1 if x != 'Android OS' else 0 for x in train_fe.most_freq_os.values]
test_fe["most_freq_os_android"] = [1 if x == 'Android OS' else 0 for x in test_fe.most_freq_os.values]
test_fe["most_freq_os_ios"] = [1 if x != 'Android OS' else 0 for x in test_fe.most_freq_os.values]

### Create dummy regarding US

In [153]:
train_fe["is_us"] = [1 if x == 'US' else 0 for x in train_fe.most_freq_country.values]
test_fe["is_us"] = [1 if x == 'US' else 0 for x in test_fe.most_freq_country.values]

### Create dummies regarding regions

In [154]:
country_code_df = pd.read_csv("country_code.csv", header = None)
country_code_df = country_code_df.loc[1:,[1,5,6]]
country_code_df = country_code_df[country_code_df[1] != 'AQ'].dropna()
country_code_dict = {}
country_code_dict['ZZ'] = "Unknown"
country_code_dict['XK'] = "Southern Europe"
country_code_dict['NAM'] = "Sub-Saharan Africa"
for country_code in country_code_df[1].values:
    country_code_dict[country_code] = country_code_df[country_code_df[1]==country_code][6].values[0]

In [155]:
train_fe["most_freq_region"] = [country_code_dict[x] for x in train_fe.most_freq_country.values]
test_fe["most_freq_region"] = [country_code_dict[x] for x in test_fe.most_freq_country.values]

In [156]:
region_list = []
for region in set(train_fe.most_freq_region.values):
    if list(train_fe.most_freq_region.values).count(region) > 20000:
        print(region, list(train_fe.most_freq_region.values).count(region))
        region_list.append(region)

South-eastern Asia 55431
Northern Europe 60404
Latin America and the Caribbean 24189
Australia and New Zealand 20886
Southern Asia 57517
Northern America 298015
Western Europe 21043
Sub-Saharan Africa 23890


In [157]:
for region in region_list:
    col_name = f"is_{region.replace(' ','_').lower()}"
    train_fe[col_name] = [1 if x == region else 0 for x in train_fe.most_freq_region.values]
    test_fe[col_name] = [1 if x == region else 0 for x in test_fe.most_freq_region.values]

In [158]:
train_fe.head()

Unnamed: 0,user_hash_id,num_uniq_session_id,mean_prev_session,date_user_created,most_freq_country,most_freq_os,num_uniq_device_id,max_num_events_a_session,total_num_purchase,total_purchase_amt,...,is_us,most_freq_region,is_south-eastern_asia,is_northern_europe,is_latin_america_and_the_caribbean,is_australia_and_new_zealand,is_southern_asia,is_northern_america,is_western_europe,is_sub-saharan_africa
0,eb37a43eb25770c442b54a4943c7f9eb17278328dd0499...,9,3560204.0,2018-11-15 09:53:20,IN,Android OS,1,84.0,0.0,0.0,...,0,Southern Asia,0,0,0,0,1,0,0,0
1,e2c1b7686b247d8ce10d9afbde701ae2a9203731edf81a...,1,0.0,2018-11-25 17:50:00,US,Android OS,1,4.0,0.0,0.0,...,1,Northern America,0,0,0,0,0,1,0,0
2,feea441cbaf2df60e772241695d5809cf4bc8f516c347c...,1,0.0,2018-11-14 23:01:40,PH,Android OS,1,15.0,0.0,0.0,...,0,South-eastern Asia,1,0,0,0,0,0,0,0
3,bf23f836143cc7fb00ec1e6c570e211c601628ad5680eb...,1,0.0,2018-11-18 16:33:20,US,Android OS,1,36.0,0.0,0.0,...,1,Northern America,0,0,0,0,0,1,0,0
4,b18aa61b3cec4bd7168de7e5037561dc44ec3051c461b2...,1,0.0,2018-11-08 17:26:40,US,Android OS,1,9.0,0.0,0.0,...,1,Northern America,0,0,0,0,0,1,0,0


## Modeling

In [159]:
train_ml = train_fe.copy()
test_ml = test_fe.copy()

In [160]:
train_ml.columns

Index(['user_hash_id', 'num_uniq_session_id', 'mean_prev_session',
       'date_user_created', 'most_freq_country', 'most_freq_os',
       'num_uniq_device_id', 'max_num_events_a_session', 'total_num_purchase',
       'total_purchase_amt', 'label_1', 'label_2', 'num_days_existed',
       'days_since_last_session', 'most_freq_os_android', 'most_freq_os_ios',
       'is_us', 'most_freq_region', 'is_south-eastern_asia',
       'is_northern_europe', 'is_latin_america_and_the_caribbean',
       'is_australia_and_new_zealand', 'is_southern_asia',
       'is_northern_america', 'is_western_europe', 'is_sub-saharan_africa'],
      dtype='object')

In [161]:
# Set labels
labels_1 = train_ml.label_1
labels_2 = train_ml.label_2

In [162]:
# Set features
features_train = train_ml.drop(["user_hash_id", "mean_prev_session", 
                                "most_freq_region", 
                                "date_user_created", "most_freq_country", "most_freq_os", 
                                "label_1", "label_2"], axis = 1)
features_test = test_ml.drop(["user_hash_id", "mean_prev_session", 
                              "most_freq_region", 
                              "date_user_created", "most_freq_country", "most_freq_os"], axis = 1)

### Label 1 Model

In [163]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc

In [164]:
# Resampling
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
rus = RandomUnderSampler()
features_res_1, labels_res_1 = rus.fit_sample(features_train, labels_1)
print(f"Resampled dataset shape: {Counter(labels_res_1)}")

Resampled dataset shape: Counter({0.0: 4393, 1.0: 4393})


In [165]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
def build_pipe(model):
    return Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])

In [166]:
from sklearn.model_selection import train_test_split
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(features_res_1, labels_res_1, test_size=0.3)

In [167]:
from sklearn.linear_model import LogisticRegression
lr_1 = LogisticRegression(penalty='l2', class_weight = "balanced")
plr_1 = build_pipe(lr_1)
plr_1.fit(X_train_1, y_train_1)
plr_1_acc = plr_1.score(X_val_1, y_val_1)
print(f"Accuracy for lr model 1: {plr_1_acc:.4f}")
print(f"Precision for lr model 1: {precision_score(plr_1.predict(X_val_1), y_val_1):.4f}")
print(f"Recall for lr model 1: {recall_score(plr_1.predict(X_val_1), y_val_1):.4f}")

Accuracy for lr model 1: 0.9040
Precision for lr model 1: 0.9122
Recall for lr model 1: 0.8949




In [168]:
probs_y_1 = lr_1.predict_proba(X_val_1)
precision_1, recall_1, thresholds_1 = precision_recall_curve(y_val_1, probs_y_1[:, 1]) 
pr_auc_1 = auc(recall_1, precision_1)
new_pred_1 = [1 if x >= pr_auc_1 else 0 for x in probs_y_1[:,1]]
print(f"Accuracy for lr model 1: {accuracy_score(new_pred_1, y_val_1):.4f}")
print(f"Precision for lr model 1: {precision_score(new_pred_1, y_val_1):.4f}")
print(f"Recall for lr model 1: {recall_score(new_pred_1, y_val_1):.4f}")

Accuracy for lr model 1: 0.7986
Precision for lr model 1: 0.9715
Recall for lr model 1: 0.7185


In [169]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.utils.class_weight import compute_sample_weight
nb_1 = BernoulliNB(fit_prior=True)
nb_1.fit(X_train_1, y_train_1, sample_weight=compute_sample_weight(class_weight='balanced', y=y_train_1))
nb_acc_1 = nb_1.score(X_val_1, y_val_1)
print(f"Accuracy for rf model 1: {nb_acc_1:.4f}")
print(f"Precision for rf model 1: {precision_score(nb_1.predict(X_val_1), y_val_1):.4f}")
print(f"Recall for rf model 1: {recall_score(nb_1.predict(X_val_1), y_val_1):.4f}")

Accuracy for rf model 1: 0.8892
Precision for rf model 1: 0.8498
Recall for rf model 1: 0.9192


In [170]:
import xgboost as xgb

model_1 = xgb.XGBClassifier(
 learning_rate =0.001,
 n_estimators=10,
 max_depth=7,
 min_child_weight=1,
 gamma=0,
 subsample=0.5,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [171]:
xgb_1 = model_1.fit(X_train_1, y_train_1)
pred_1 = xgb_1.predict(X_val_1)
print("Accuracy for xgb model 1: %.2f" % (accuracy_score(y_val_1, pred_1)))
print("Precision for xgb model 1: %.2f" % (precision_score(y_val_1, pred_1)))
print("Recall for xgb model 1: %.2f" % (recall_score(y_val_1, pred_1)))

Accuracy for xgb model 1: 0.91
Precision for xgb model 1: 0.91
Recall for xgb model 1: 0.91


### Label 2 Model

In [172]:
features_res_2, labels_res_2 = rus.fit_sample(features_train, labels_2)
print(f"Resampled dataset shape: {Counter(labels_res_2)}")

Resampled dataset shape: Counter({0.0: 5551, 1.0: 5551})


In [173]:
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(features_res_2, labels_res_2, test_size=0.3)

In [174]:
lr_2 = LogisticRegression(penalty='l2', class_weight = "balanced")
lr_2.fit(X_train_2, y_train_2)
lr_2_acc = lr_2.score(X_val_2, y_val_2)
print(f"Accuracy for lr model 2: {lr_2_acc:.4f}")
print(f"Precision for lr model 2: {precision_score(lr_2.predict(X_val_2), y_val_2):.4f}")
print(f"Recall for lr model 2: {recall_score(lr_2.predict(X_val_2), y_val_2):.4f}")

Accuracy for lr model 2: 0.9012
Precision for lr model 2: 0.8967
Recall for lr model 2: 0.9017




In [175]:
nb_2 = BernoulliNB(fit_prior=True)
nb_2.fit(X_train_2, y_train_2, sample_weight=compute_sample_weight(class_weight='balanced', y=y_train_2))
nb_acc_2 = nb_2.score(X_val_2, y_val_2)
print(f"Accuracy for rf model 2: {nb_acc_2:.4f}")
print(f"Precision for rf model 2: {precision_score(nb_2.predict(X_val_2), y_val_2):.4f}")
print(f"Recall for rf model 2: {recall_score(nb_2.predict(X_val_2), y_val_2):.4f}")

Accuracy for rf model 2: 0.8877
Precision for rf model 2: 0.8362
Recall for rf model 2: 0.9281


In [176]:
model_2 = xgb.XGBClassifier(
 learning_rate =0.001,
 n_estimators=10,
 max_depth=7,
 min_child_weight=1,
 gamma=0,
 subsample=0.5,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [177]:
xgb_2 = model_2.fit(X_train_2, y_train_2)
pred_2 = xgb_2.predict(X_val_2)
print("Accuracy for xgb model 2: %.2f" % (accuracy_score(y_val_2, pred_2)))
print("Precision for xgb model 2: %.2f" % (precision_score(y_val_2, pred_2)))
print("Recall for xgb model 2: %.2f" % (recall_score(y_val_2, pred_2)))

Accuracy for xgb model 2: 0.91
Precision for xgb model 2: 0.92
Recall for xgb model 2: 0.90


### Build result data frame

In [178]:
# For resampling
_, num_cols = features_test.shape
new_cols_name_test = [f'f{i}' for i in range(num_cols)]
old_cols_name_test = list(features_test.columns)
features_test_adj = features_test.copy()
for j in range(num_cols):
    features_test_adj = features_test_adj.rename(columns = {old_cols_name_test[j]:new_cols_name_test[j]})
features_test_adj.columns

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17'],
      dtype='object')

In [179]:
result_df = pd.DataFrame()
result_df["user_id_hash"] = test_ml.user_hash_id.values
result_df['user_purchase_binary_7_days'] = xgb_1.predict(features_test_adj)
result_df['user_purchase_binary_14_days'] = xgb_2.predict(features_test_adj)
result_df.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,bfc380aa034b08be2a4bbd79b95e8752831f60ba210181...,1.0,1.0
1,f4167c71f17aec508691349d5f47d15483912ffc35c129...,0.0,0.0
2,e7b07a39629362c297b52ae037a32b6441ecd412bc0d74...,0.0,0.0
3,b613547f6575992a16318bfef73a4a3f8358459309652a...,0.0,0.0
4,fb7619092ede061b4f7cda0f1158112e7fa8210856e77c...,0.0,0.0


In [180]:
len(result_df)

311565

In [181]:
sample_sub_df_adj = pd.DataFrame()
sample_sub_df_adj["user_id_hash"] = sample_sub_df.user_id_hash.values
len(sample_sub_df_adj)

312568

In [182]:
final_result = pd.merge(sample_sub_df_adj, result_df, how = "left").fillna(0.0)
final_result.isnull().sum() # Check if there is nan

user_id_hash                    0
user_purchase_binary_7_days     0
user_purchase_binary_14_days    0
dtype: int64

In [183]:
final_result.to_csv('submission/sample_submission_fatpapaya_xgb_2.csv', index = False)