In [1]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import sys
import random
import gc
import subprocess
from pprint import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
plt.style.use('./stylelib/custom.mplstyle')

%reload_ext autoreload
%autoreload 2
import preprocessing
import plotter

---

## 1 Data inspection and loading
### 1.1 Downsample training data
__Data size considerations__: The raw training data from kaggle has close to 200 million lines and takes 7 GB of memory. To keep EDA and evaluation steps of machine learning algorithms lightweight, a randomly sampled subset (0.5%) is used for EDA and model evaluation.

In [3]:
%%time

dir_data = './data'
from_scratch = False

# When from_scratch = True, generate randomly sampled subset
csv_train_raw = os.path.join(dir_data, 'train.csv')
csv_train = os.path.join(dir_data, 'train_sample.csv')
if from_scratch:
    nlines_raw, nlines_reduced = preprocessing.csv_randomized_downsamp(
        csv_in=csv_train_raw, csv_out=csv_train, fraction=0.005
    )

CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 14.8 µs


### 1.2 Load data into dataframe
#### (1) Field inspections
For efficiency concerns, we use shell commands instead of pandas operations.

In [4]:
# Quick check of training data fields by calling system shell command
!head -2 ./data/train_sample.csv

ip,app,device,os,channel,click_time,attributed_time,is_attributed
106284,15,1,41,277,2017-11-06 22:57:46,,0


In [5]:
# Quick check of test data fields by calling system shell command
!head -2 ./data/test.csv

click_id,ip,app,device,os,channel,click_time
0,5744,9,1,3,107,2017-11-10 04:00:00


**[Notes] Field selections** <br>
Field inspection tells us that training and testing data don't share the same fields. 
To prepare data for subsequent processing, we only preserve fields that are shared by both training and testing data. One exception is the `is_attributed` field in training data; It is the prediction target and thus needs to be preserved.

In [6]:
# Extract field names from training and testing data
fields_train = preprocessing.csv_list_fields(csv_in=csv_train)
csv_test = os.path.join(dir_data, 'test.csv')
fields_test = preprocessing.csv_list_fields(csv_in=csv_test)

# Extract fields shared by both training and testing data
fields_keep = list(set(fields_train) & (set(fields_test)))
print('Data fields shared by both training and testing data:\n', fields_keep)

Data fields shared by both training and testing data:
 ['click_time', 'app', 'ip', 'channel', 'device', 'os']


#### (2) Load into dataframe

In [7]:
# Load randomly sampled data subset into pandas dataframe,
# and sort by click time (and reset index)
df = pd.read_csv(
    csv_train, usecols=fields_keep + ['is_attributed'],
    parse_dates=['click_time']
).sort_values(by='click_time').reset_index(drop=True)

# Convert click time from UTC to local time
df['click_time'] = (
    pd.DatetimeIndex(df['click_time']).tz_localize('utc')
    .tz_convert('Asia/Shanghai')
)

In [8]:
# Inspect first few lines
df.head(3)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,86946,3,1,19,379,2017-11-06 23:46:14+08:00,0
1,119349,3,1,17,379,2017-11-06 23:57:47+08:00,0
2,73516,18,1,22,107,2017-11-07 00:00:00+08:00,0


---
## 2. Exploratory data analysis
### 2.1 Examine class proportion

In [9]:
# Inspect data size
nclick_total =len(df)
percentage_pos = (df['is_attributed'].sum()) / nclick_total * 100
print('Percentage of positive target = {:.3f}%'.format(percentage_pos))

Percentage of positive target = 0.251%


**[CAUTION] Class imbalance**<br>
Given that only __<span class="mark">~0.25%</span>__ of the records has positive target values (`df['is_attributed'] == 1`), we have **an extreme case of class imbalance** at hand.

### 2.2 Convert datetime variable to usable form
Among the datetime fields such as month, day, and hour-of-the-day, we only keep hour-of-day and name it as `click_hour` for subsequent processing.

In [10]:
# Extract hour of the day
df['click_hour'] = df['click_time'].dt.hour

# Drop raw `click_time`
df.drop(columns=['click_time'], inplace=True)

In [11]:
df.head(3)

Unnamed: 0,ip,app,device,os,channel,is_attributed,click_hour
0,86946,3,1,19,379,0,23
1,119349,3,1,17,379,0,23
2,73516,18,1,22,107,0,0


### 2.3 Train-test split, and set aside testing data

In [12]:
# Split the original dataframe into in-sample training and testing sets
# Because stratified sampling is the default option of sklearn, it is not
# explicitly set
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

# Double check class ratios after train-test split
print(
    'Training data (pos%):',
    100 * df_train['is_attributed'].sum() / len(df_train)
)
print(
    'Testing data (pos%):',
    100 * df_test['is_attributed'].sum() / len(df_test)
)

Training data (pos%): 0.24939621084641736
Testing data (pos%): 0.25346486104501076


### 2.4 Inspect variable distributions of training data

In [13]:
# Count the amount of unique values
df_counts = pd.DataFrame()
df_counts['n_unique'] = df_train.nunique()
df_counts['n_unique (%)'] = 100 * df_counts['n_unique'] / len(df_train)
df_counts.T

Unnamed: 0,ip,app,device,os,channel,is_attributed,click_hour
n_unique,71528.0,264.0,338.0,182.0,168.0,2.0,24.0
n_unique (%),11.052548,0.040793,0.052228,0.028123,0.025959,0.000309,0.003708


**[TAKE AWAY]**<br>
* Without exception, the total number of unique values for each of these variables is markedly smaller than the total number of clicks. This indicates that many-to-one mapping is typical between clicks and attributes such as `ip`, `app`, and `device`. This is reasonable given that a single user can generate multiple clicks.
* The categorical features at hand are of very high cardinality. Feature engineering is going to be critical in preparing the data for machine learning.

### 2.5 Apply target-guided encoding to categorical features

In [14]:
feature_list = [x for x in df_train.columns if x != 'is_attributed']
impute_rare = True
if impute_rare:
    df_train, df_test = preprocessing.df_rarelabel_imputer(
        df_train, df_test, cols=feature_list, thresh_percentage=0.02
    )
df_train, df_test = preprocessing.df_label2num_encoding(
    df_train, df_test, cols=feature_list
)

### 2.6 Check dataframes after encoding

Note that mapping used for encoding is generated with training data and then propagated to testing data. 
Because there are non-overlap variables between training and testing data, and encoded testing data are going to have missing values. Let's have a look:

In [15]:
# Check presence of missing value after encoding
df_nulls = pd.DataFrame()
df_nulls['nan_train(%)'] = 100 * (df_train.isnull().sum() / len(df_train))
df_nulls['nan_test(%)'] = 100 * (df_test.isnull().sum() / len(df_test))
df_nulls.T

Unnamed: 0,is_attributed,risk_ip,count_ip,risk_app,count_app,risk_device,count_device,risk_os,count_os,risk_channel,count_channel,risk_click_hour,count_click_hour
nan_train(%),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nan_test(%),0.0,3.364989,3.364989,0.006129,0.006129,0.029565,0.029565,0.004327,0.004327,0.000361,0.000361,0.0,0.0


---
## 3. Machine learning
### 3.1 Data preparation

In [16]:
target_col = 'is_attributed'
feature_cols = ([
    x for x in df_train.columns if x != target_col and 'count' not in x
])
#feature_cols = [x for x in df_train.columns if x != target_col]
X_train, y_train = preprocessing.df_to_Xy(
    df_train, target_col=target_col, feature_cols=feature_cols
)

# from imblearn.over_sampling import RandomOverSampler 
# X_train, y_train = RandomOverSampler(random_state=42).fit_sample(X_train, y_train)
X_test, y_test = preprocessing.df_to_Xy(
    df_test, target_col=target_col, feature_cols=feature_cols
)

### 3.2 Model evaluation

In [17]:
import modeling
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

In [18]:
estimator = LogisticRegression(C=100, class_weight='balanced')
model = modeling.Classifier(gridsearch=False)
model.fit(estimator, X_train, y_train, cv=None)
print(model.train_score)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(auc)

fit runtime = 2.60 s
0.9047864685717119
0.900197292818336


In [19]:
estimator = RandomForestClassifier(
    class_weight='balanced', max_depth=2, n_estimators=100,
    random_state=42
)
model = modeling.Classifier(gridsearch=False)
model.fit(estimator, X_train, y_train, cv=None)
print(model.train_score)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(auc)

fit runtime = 38.58 s
0.9067677279338981
0.9084174387150357


In [20]:
# xgb scikit-learn API
estimator = xgb.XGBClassifier(
    scale_pos_weight=400.0, max_depth=2, n_estimators=100,
    random_state=42
)
model = modeling.Classifier(gridsearch=False)
model.fit(estimator, X_train, y_train, cv=None)
print(model.train_score)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(auc)

  if diff:


fit runtime = 33.71 s
0.6470670700442569
0.4452606664179132


  if diff:


In [21]:
# lightgbm scikit-learn API
estimator = lgb.LGBMClassifier(
    scale_pos_weight=400.0, max_depth=2, n_estimators=100,
    random_state=42
)
model = modeling.Classifier(gridsearch=False)
model.fit(estimator, X_train, y_train, cv=None)
print(model.train_score)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(auc)

  if diff:


fit runtime = 8.70 s
0.6458223930329069
0.5884464066064159


  if diff:
