In [12]:
import os
import pandas as pd
import numpy as np
import s3fs
from datetime import datetime
from datetime import timedelta
import calendar
import boto3
import io
import ast

## Setup s3

In [2]:
# aws
aws_filename = "my_aws.txt"
with open (aws_filename, 'r') as f:
    aws_id_key = f.read()
access_id, access_key = aws_id_key.replace("\n","").split(',')

In [3]:
s3 = boto3.client('s3',
                  aws_access_key_id=access_id,
                  aws_secret_access_key=access_key)

## Read train data

In [25]:
obj_train = s3.get_object(Bucket='msds630-kaggle-competition', Key='feature_engineered_train.csv/part-00000')
raw_train = pd.read_csv(obj_train['Body'], header = None)

In [68]:
raw_train.head() # Check the head

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,['eb37a43eb25770c442b54a4943c7f9eb17278328dd04...,9,3560204.0,'2018-11-15 09:53:20','IN','Android OS',1,84.0,0.0,0.0,-1.0,-1.0]
1,['e2c1b7686b247d8ce10d9afbde701ae2a9203731edf8...,1,0.0,'2018-11-25 17:50:00','US','Android OS',1,4.0,0.0,0.0,-1.0,-1.0]
2,['feea441cbaf2df60e772241695d5809cf4bc8f516c34...,1,0.0,'2018-11-14 23:01:40','PH','Android OS',1,15.0,0.0,0.0,-1.0,-1.0]
3,['bf23f836143cc7fb00ec1e6c570e211c601628ad5680...,1,0.0,'2018-11-18 16:33:20','US','Android OS',1,36.0,0.0,0.0,-1.0,-1.0]
4,['b18aa61b3cec4bd7168de7e5037561dc44ec3051c461...,1,0.0,'2018-11-08 17:26:40','US','Android OS',1,9.0,0.0,0.0,-1.0,-1.0]


In [157]:
# Clean the data
raw_train_adj = pd.DataFrame()
raw_train_adj["user_hash_id"] = [ast.literal_eval(elem + "]")[0] for elem in raw_train[0].values]
raw_train_adj["num_uniq_session_id"] = raw_train[1].values
raw_train_adj["mean_prev_session"] = raw_train[2].values
raw_train_adj["date_user_created"] = [elem.replace("\'", "").strip() for elem in raw_train[3].values]
raw_train_adj["most_freq_country"] = [elem.replace("\'", "").strip() for elem in raw_train[4].values]
raw_train_adj["most_freq_os"] = [elem.replace("\'", "").strip() for elem in raw_train[5].values]
raw_train_adj["num_uniq_device_id"] = raw_train[6].values
raw_train_adj["max_num_events_a_session"] = raw_train[7].values
raw_train_adj["total_num_purchase"] = raw_train[8].values
raw_train_adj["total_purchase_amt"] = raw_train[9].values
raw_train_adj["label_1"] = raw_train[10].values
raw_train_adj["label_2"] = [ast.literal_eval("[" + elem)[0] for elem in raw_train[11].values]

In [158]:
raw_train_adj.head() # Check the head

Unnamed: 0,user_hash_id,num_uniq_session_id,mean_prev_session,date_user_created,most_freq_country,most_freq_os,num_uniq_device_id,max_num_events_a_session,total_num_purchase,total_purchase_amt,label_1,label_2
0,eb37a43eb25770c442b54a4943c7f9eb17278328dd0499...,9,3560204.0,2018-11-15 09:53:20,IN,Android OS,1,84.0,0.0,0.0,-1.0,-1.0
1,e2c1b7686b247d8ce10d9afbde701ae2a9203731edf81a...,1,0.0,2018-11-25 17:50:00,US,Android OS,1,4.0,0.0,0.0,-1.0,-1.0
2,feea441cbaf2df60e772241695d5809cf4bc8f516c347c...,1,0.0,2018-11-14 23:01:40,PH,Android OS,1,15.0,0.0,0.0,-1.0,-1.0
3,bf23f836143cc7fb00ec1e6c570e211c601628ad5680eb...,1,0.0,2018-11-18 16:33:20,US,Android OS,1,36.0,0.0,0.0,-1.0,-1.0
4,b18aa61b3cec4bd7168de7e5037561dc44ec3051c461b2...,1,0.0,2018-11-08 17:26:40,US,Android OS,1,9.0,0.0,0.0,-1.0,-1.0


## Read test data

In [18]:
obj_test = s3.get_object(Bucket='msds630-kaggle-competition', Key='feature_engineered_test.csv/part-00000')
raw_test = pd.read_csv(obj_test['Body'], header = None)

In [19]:
raw_test.head() # Check the head

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,['d6cf11a892144d6d2b5d953a5dba12033b2041655aaf...,46,33233950.0,'2018-10-06 05:05:00','US','iOS',1,63.0,12.0,35.61600000000001]
1,['2b0dae5cbb692617108ec258eee1f996a61beec0dc87...,1,0.0,'2018-10-13 19:46:40','AT','iOS',1,2.0,0.0,0.0]
2,['91dc64176a5e7dde98de44444c53cadc491c0d576332...,1,0.0,'2018-10-29 16:05:00','CV','iOS',1,13.0,0.0,0.0]
3,['5c07a4b87ac4e07894cc50667c01ae688a9e30e5e214...,6,3968576.0,'2018-11-01 17:38:20','BS','Android OS',1,42.0,0.0,0.0]
4,['a0701427d30f102e4187a9abaacf381f5a46a1dd964f...,4,2468833.0,'2018-11-19 23:31:40','US','iOS',1,50.0,0.0,0.0]


In [159]:
# Clean the data
raw_test_adj = pd.DataFrame()
raw_test_adj["user_hash_id"] = [ast.literal_eval(elem + "]")[0] for elem in raw_test[0].values]
raw_test_adj["num_uniq_session_id"] = raw_test[1].values
raw_test_adj["mean_prev_session"] = raw_test[2].values
raw_test_adj["date_user_created"] = [elem.replace("\'", "").strip() for elem in raw_test[3].values]
raw_test_adj["most_freq_country"] = [elem.replace("\'", "").strip() for elem in raw_test[4].values]
raw_test_adj["most_freq_os"] = [elem.replace("\'", "").strip() for elem in raw_test[5].values]
raw_test_adj["num_uniq_device_id"] = raw_test[6].values
raw_test_adj["max_num_events_a_session"] = raw_test[7].values
raw_test_adj["total_num_purchase"] = raw_test[8].values
raw_test_adj["total_purchase_amt"] = [ast.literal_eval("[" + elem)[0] for elem in raw_test[9].values]

In [160]:
raw_test_adj.head()

Unnamed: 0,user_hash_id,num_uniq_session_id,mean_prev_session,date_user_created,most_freq_country,most_freq_os,num_uniq_device_id,max_num_events_a_session,total_num_purchase,total_purchase_amt
0,d6cf11a892144d6d2b5d953a5dba12033b2041655aaf26...,46,33233950.0,2018-10-06 05:05:00,US,iOS,1,63.0,12.0,35.616
1,2b0dae5cbb692617108ec258eee1f996a61beec0dc8711...,1,0.0,2018-10-13 19:46:40,AT,iOS,1,2.0,0.0,0.0
2,91dc64176a5e7dde98de44444c53cadc491c0d57633288...,1,0.0,2018-10-29 16:05:00,CV,iOS,1,13.0,0.0,0.0
3,5c07a4b87ac4e07894cc50667c01ae688a9e30e5e2147a...,6,3968576.0,2018-11-01 17:38:20,BS,Android OS,1,42.0,0.0,0.0
4,a0701427d30f102e4187a9abaacf381f5a46a1dd964ffe...,4,2468833.0,2018-11-19 23:31:40,US,iOS,1,50.0,0.0,0.0


In [161]:
# read test_sample
# Make sure you have sample_submission_2.csv in your local path
sample_sub_df = pd.read_csv("sample_submission_2.csv")
sample_sub_df.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,e469dfaed039ead9110165d9bc457acb11609ca34057dc...,0.01,0.02
1,afcc639a324b6c598ef83d360450afa011cb2dd1358bf9...,0.01,0.02
2,fd5a7cf211d08e3e00f7be6a9df6e6ea3d2e5c22a5d9c3...,0.01,0.02
3,00bfff98b9d0329f014c2eeac7ce47cd18b2bc6e10d608...,0.01,0.02
4,0d298f3638c43e915c119d4935e1ce8d168f81b5e3e8c1...,0.01,0.02


In [163]:
# Check how many users in sample submission
print(f"Number of users in sample submission: {len(sample_sub_df)}.")
print(f"Number of users in test dataset: {len(raw_test_adj)}.")

Number of users in sample submission: 312568.
Number of users in test dataset: 619520.


In [164]:
users_in_test_n_sample_sub = list(set(raw_test_adj.user_hash_id.values) - 
                                  (set(raw_test_adj.user_hash_id.values) - set(sample_sub_df.user_id_hash.values)))
print(f"Number of users in intersection: {len(users_in_test_n_sample_sub)}.")

Number of users in intersection: 311565.


In [165]:
users_in_sample_sub_not_in_test = list(set(sample_sub_df.user_id_hash.values) - set(raw_test_adj.user_hash_id.values))
print(f"Number of users in sample_sub not in test: {len(users_in_sample_sub_not_in_test)}.")

Number of users in sample_sub not in test: 1003.


In [166]:
# Impute zero for users that don't show up in the sample_sub
for user in users_in_sample_sub_not_in_test:
    sample_sub_df.loc[sample_sub_df.user_id_hash == user, 'user_purchase_binary_7_days'] = 0
    sample_sub_df.loc[sample_sub_df.user_id_hash == user, 'user_purchase_binary_14_days'] = 0

In [187]:
# Generate the test dataset only using id's from sample_sub
desired_idx_list = []
id_list_in_test_dict = {}
for i, user in enumerate(list(raw_test_adj.user_hash_id.values)):
    id_list_in_test_dict[user] = i
for user in users_in_test_n_sample_sub:
    desired_idx_list.append(id_list_in_test_dict[user])
len(desired_idx_list)

311565

In [190]:
raw_test_adj2 = raw_test_adj.loc[desired_idx_list,]
len(raw_test_adj2)

311565

In [196]:
# Test if the intersection between raw_test_adj2 and sample_sub_df is 311565
len(set(raw_test_adj2.user_hash_id.values) - 
    (set(raw_test_adj2.user_hash_id.values) - set(sample_sub_df.user_id_hash.values)))

311565

In [199]:
# Save df's locally
raw_train_adj.to_csv('data/train.csv')
raw_test_adj2.to_csv('data/test.csv')
sample_sub_df.to_csv('data/sample_submission_2_adj.csv')

## Feature Engineering

### Create column for Number of Days Existed

In [192]:
train_fe = raw_train_adj
test_fe = raw_test_adj2

In [193]:
# Create column for Number of Days Existed
# For training, compute days between first date and Dec.2
# For testing, compute days between first date and Dec.16
train_fe["num_days_existed"] = [(datetime(2018, 12, 2) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                for date in train_fe.date_user_created.values]
test_fe["num_days_existed"] = [(datetime(2018, 12, 16) - datetime.strptime(date, '%Y-%m-%d %H:%M:%S')).days 
                                for date in test_fe.date_user_created.values]

In [203]:
# Convert -1.0 to 0.0 in labels
train_fe["label_1"] = [0.0 if x == -1.0 else x for x in train_fe.label_1.values]
train_fe["label_2"] = [0.0 if x == -1.0 else x for x in train_fe.label_2.values]

In [204]:
train_fe.head()

Unnamed: 0,user_hash_id,num_uniq_session_id,mean_prev_session,date_user_created,most_freq_country,most_freq_os,num_uniq_device_id,max_num_events_a_session,total_num_purchase,total_purchase_amt,label_1,label_2,num_days_existed
0,eb37a43eb25770c442b54a4943c7f9eb17278328dd0499...,9,3560204.0,2018-11-15 09:53:20,IN,Android OS,1,84.0,0.0,0.0,0.0,0.0,16
1,e2c1b7686b247d8ce10d9afbde701ae2a9203731edf81a...,1,0.0,2018-11-25 17:50:00,US,Android OS,1,4.0,0.0,0.0,0.0,0.0,6
2,feea441cbaf2df60e772241695d5809cf4bc8f516c347c...,1,0.0,2018-11-14 23:01:40,PH,Android OS,1,15.0,0.0,0.0,0.0,0.0,17
3,bf23f836143cc7fb00ec1e6c570e211c601628ad5680eb...,1,0.0,2018-11-18 16:33:20,US,Android OS,1,36.0,0.0,0.0,0.0,0.0,13
4,b18aa61b3cec4bd7168de7e5037561dc44ec3051c461b2...,1,0.0,2018-11-08 17:26:40,US,Android OS,1,9.0,0.0,0.0,0.0,0.0,23


## Modeling

In [206]:
train_ml = train_fe
test_ml = test_fe

In [207]:
train_ml.columns

Index(['user_hash_id', 'num_uniq_session_id', 'mean_prev_session',
       'date_user_created', 'most_freq_country', 'most_freq_os',
       'num_uniq_device_id', 'max_num_events_a_session', 'total_num_purchase',
       'total_purchase_amt', 'label_1', 'label_2', 'num_days_existed'],
      dtype='object')

In [208]:
# Set labels
labels_1 = train_ml.label_1
labels_2 = train_ml.label_2

In [211]:
# Set features
features_train = train_ml.drop(["user_hash_id", 
                                "date_user_created", "most_freq_country", "most_freq_os", 
                                "label_1", "label_2"], axis = 1)
features_test = test_ml.drop(["user_hash_id", 
                              "date_user_created", "most_freq_country", "most_freq_os"], axis = 1)

### Label 1 Model

In [212]:
from sklearn.model_selection import train_test_split
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(features_train, labels_1, test_size=0.3)

In [230]:
from sklearn.linear_model import LogisticRegression
lr_1 = LogisticRegression(penalty='l2', class_weight = "balanced")
lr_1.fit(X_train_1, y_train_1)
lr_1_acc = lr_1.score(X_val_1, y_val_1)
print(f"{lr_1_acc:.4f}")



0.8446


### Label 2 Model

In [229]:
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(features_train, labels_2, test_size=0.3)

In [231]:
lr_2 = LogisticRegression(penalty='l2', class_weight = "balanced")
lr_2.fit(X_train_2, y_train_2)
lr_2_acc = lr_1.score(X_val_2, y_val_2)
print(f"{lr_2_acc:.4f}")



0.8463


### Build result data frame

In [240]:
result_df = pd.DataFrame()
result_df["user_id_hash"] = test_ml.user_hash_id.values
result_df['user_purchase_binary_7_days'] = lr_1.predict(features_test)
result_df['user_purchase_binary_14_days'] = lr_2.predict(features_test)
result_df.head()

Unnamed: 0,user_id_hash,user_purchase_binary_7_days,user_purchase_binary_14_days
0,bfc380aa034b08be2a4bbd79b95e8752831f60ba210181...,0.0,0.0
1,f4167c71f17aec508691349d5f47d15483912ffc35c129...,0.0,0.0
2,e7b07a39629362c297b52ae037a32b6441ecd412bc0d74...,1.0,1.0
3,b613547f6575992a16318bfef73a4a3f8358459309652a...,0.0,0.0
4,fb7619092ede061b4f7cda0f1158112e7fa8210856e77c...,0.0,0.0


In [254]:
len(result_df)

311565

In [253]:
sample_sub_df_adj = pd.DataFrame()
sample_sub_df_adj["user_id_hash"] = sample_sub_df.user_id_hash.values
len(sample_sub_df_adj)

312568

In [270]:
final_result = pd.merge(sample_sub_df_adj, result_df, how = "left").fillna(0.0)
final_result.isnull().sum() # Check if there is nan

user_id_hash                    0
user_purchase_binary_7_days     0
user_purchase_binary_14_days    0
dtype: int64

In [277]:
final_result.to_csv('submission/sample_submission_fatpapaya_1.csv', index = False)