In [2]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import sys
import os
import random
import gc
import subprocess
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
plt.style.use('./stylelib/custom.mplstyle')

%reload_ext autoreload
%autoreload 2
import preprocessing
import plotter

---

## 1. Data inspection and loading
### 1.1 Downsample training data
Only 0.5% of the all training records is used for exploratory data analysis

In [4]:
%%time

dir_data = './data'
from_scratch = False

csv_train_raw = os.path.join(dir_data, 'train.csv')
csv_train = os.path.join(dir_data, 'train_sample.csv')
if from_scratch:
    nlines_raw, nlines_reduced = preprocessing.csv_randomized_downsamp(
        csv_in=csv_train_raw, csv_out=csv_train, fraction=0.005
    )

CPU times: user 15 µs, sys: 0 ns, total: 15 µs
Wall time: 20.3 µs


### 1.2 Load data into dataframe
#### (1) Field inspections

In [5]:
# Quick check of training data fields by calling system shell command
!head -2 ./data/train_sample.csv

ip,app,device,os,channel,click_time,attributed_time,is_attributed
106284,15,1,41,277,2017-11-06 22:57:46,,0


In [6]:
# Quick check of test data fields by calling system shell command
!head -2 ./data/test.csv

click_id,ip,app,device,os,channel,click_time
0,5744,9,1,3,107,2017-11-10 04:00:00


**[Notes] Field selections** <br>
Note that from above that training and testing data don't share the same fields. Because `attributed_time` is not present in the testing data, it should not be included during training either. 

In [7]:
# Extract field names from training and testing data
fields_train = preprocessing.csv_list_fields(csv_in=csv_train)
csv_test = os.path.join(dir_data, 'test.csv')
fields_test = preprocessing.csv_list_fields(csv_in=csv_test)

# List of fields shared by both training and testing data
fields_use = list(set(fields_train) & (set(fields_test)))
print(fields_use)

['ip', 'click_time', 'device', 'os', 'app', 'channel']


#### (2) Load into dataframe

In [8]:
# Load randomly sampled data subset into pandas dataframe,
# and sort by click time (and reset index)
df = pd.read_csv(
    csv_train, usecols=fields_use + ['is_attributed'], parse_dates=['click_time'],
).sort_values(by='click_time').reset_index(drop=True)

# Convert click time from UTC to local time
df['click_time'] = (
    pd.DatetimeIndex(df['click_time']).tz_localize('utc').tz_convert('Asia/Shanghai')
)

In [9]:
# Inspect first few lines
display(df.head(3))

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,86946,3,1,19,379,2017-11-06 23:46:14+08:00,0
1,119349,3,1,17,379,2017-11-06 23:57:47+08:00,0
2,73516,18,1,22,107,2017-11-07 00:00:00+08:00,0


In [10]:
# Inspect data size
nclick_total =len(df)
percentage_pos = (df['is_attributed'].sum()) / nclick_total * 100
print('Percentage of positive target = {:.3f}%'.format(percentage_pos))

Percentage of positive target = 0.251%


**[Notes] Class imbalance**<br>
Only 0.25% of the records has positive target values (`df['is_attributed'] == 1`). We therefore have an extreme case of class imbalance at hand.

---
## 2. Exploratory data analysis

In [11]:
# Extract hour of the day
df['click_hour'] = df['click_time'].dt.hour
df['click_day'] = df['click_time'].dt.day
df.head(3)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,click_hour,click_day
0,86946,3,1,19,379,2017-11-06 23:46:14+08:00,0,23,6
1,119349,3,1,17,379,2017-11-06 23:57:47+08:00,0,23,6
2,73516,18,1,22,107,2017-11-07 00:00:00+08:00,0,0,7


In [None]:
pd.DataFrame(df.nunique() / df.shape[0] * 100).T

In [20]:
len(set(df[df.is_attributed == 1]['ip']) & set(df[df.is_attributed == 0]['ip']))

1199

### 1.3 Time feature construction
* Convert UTC time to Asia/Shanghai time
* Extract hour of the day from `click_time` as a separate feature `click_hour` 

In [None]:
# Extract hour of the day
df_train['click_hour'] = df_train['click_time'].dt.hour
df_train.drop(columns=['click_time'], inplace=True)

In [None]:
df_train.head()

In [None]:
cols_corr = [x for x in df_train.columns if x != 'is_attributed']
corr_matrix = df_train[cols_corr].corr()
mask = np.zeros_like(corr_matrix, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(
    corr_matrix, mask=mask, cmap='RdBu', vmin=-1, vmax=1,
    square=True, linewidths=.5,
    cbar_kws={'label':'Pearson\'s r'}
);

In [None]:
df_train.drop(columns=['os'], inplace=True)
df_train.head()

### 1.4 Feature engineering
Construct feature combinations, and use the counts of records belonging to each of the combinations as the new features.

In [None]:
# Generate feature combinations
feature_primary = 'ip'
feature_other = ['device', 'channel', 'app']
feature_combinations = preprocessing.list_feature_combinations(
    feature_primary=feature_primary, feature_other=feature_other
)
print(feature_combinations)

In [None]:
csv_engineered = os.path.join(dir_data, 'train_sample_engineered.csv')
if not os.path.exists(csv_engineered):
    df_engineered = preprocessing.df_engineered(
        df_in=df_train, feature_combinations=feature_combinations
    )
else:
    df_engineered = pd.read_csv(csv_engineered)
display(df_engineered.head(3))

---
## 2. Exploratory data analysis

(1) Train-test split

In [None]:
df_train, df_test = train_test_split(df_engineered, test_size=0.2, random_state=42)

In [None]:
df_train.head()

(2) Visualize click count as function of click time

In [None]:
hist_params = dict(bins=24, edgecolor='w', alpha=0.5)
fig, ax = plt.subplots()
ax = plotter.compare_hist(df_train, by='click_hour', hist_params=hist_params, ax=ax)
ax.set(xlabel='Click time in hour-of-the day (Shanghai time)')
plt.show();

---
## 3. Machine learning

In [None]:
#cols = [x for x in df_train.columns if 'count' in x or 'mean' in x]
cols = [x for x in df_train.columns if 'is_attributed' not in x]
X_train, y_train = (df_train[cols], df_train['is_attributed'])
X_test, y_test = (df_test[cols], df_test['is_attributed'])

In [None]:
cols

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import (StandardScaler, MinMaxScaler)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (cross_val_score, RandomizedSearchCV)

In [None]:
model = DecisionTreeClassifier(class_weight='balanced')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(auc)

In [None]:
model = RandomForestClassifier(class_weight='balanced', n_estimators=10, max_depth=8)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(auc)

In [None]:
import xgboost as xgb
train_matrix = xgb.DMatrix(data=X_train, label=y_train)
test_matrix = xgb.DMatrix(data=X_test, label=y_test)
params = dict(objective='binary:logistic', 'max_depth'=8)



model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=, seed=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
auc = roc_auc_score(y_test, y_pred)
print(auc)

In [None]:
from sklearn.feature_selection import RFE
estimator = DecisionTreeClassifier()
selector = RFE(estimator, step=1)
selector = selector.fit(X_train, y_train)

In [None]:
print(selector.support_)
print(selector.ranking_)

In [None]:
col_array = np.array(cols)
col_array

In [None]:
cols = col_array[selector.support_]

In [None]:
X_train, y_train = (df_train[cols], df_train['is_attributed'])
X_test, y_test = (df_test[cols], df_test['is_attributed'])

In [None]:
model = DecisionTreeClassifier()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
auc = roc_auc_score(y_test, y_pred)
print(auc)

X_train.shape