In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

## Read data

In [None]:
# Project helpers: raw-data loader, feature builder and column-role lookup.
from src.features import append_features, imply_columns, load_raw

# Container holding (at least) the 'train' and 'submit' frames.
df = load_raw()

# Apply the same feature step to both frames; both results are computed
# before either entry of `df` is overwritten.
df['train'], df['submit'] = (
    append_features(df[part]) for part in ('train', 'submit')
)

# Column roles; later cells index this with 'features' and 'target'.
cols = imply_columns(df)

In [None]:
# Display the shape and the column index of the training frame.
df['train'].shape, df['train'].columns

In [None]:
# Preview the first rows of the training frame (after append_features).
df['train'].head()

In [None]:
# Display the shape and the column index of the submission frame.
df['submit'].shape, df['submit'].columns

In [None]:
# Preview the first rows of the submission frame (after append_features).
df['submit'].head()

In [None]:
# Display the column mapping returned by imply_columns(df).
cols

## train/validation split

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for validation. It is currently 0, so every row
# is used for training; the intended value is 0.3 (FIXME).
test_size = 0.  # 0.3 FIXME

x_train, x_valid, y_train, y_valid = train_test_split(
    df['train'][cols['features']],
    df['train'][cols['target']],
    test_size=test_size,
)

# Display the shapes of the four pieces produced by the split.
tuple(part.shape for part in (x_train, x_valid, y_train, y_valid))

In [None]:
# Submission features, taken from the same feature columns as the training set.
x_submit = df['submit'][cols['features']].values

# Training features as their underlying values; target squeezed to drop
# any length-1 axis.
x_train, y_train = x_train.values, y_train.squeeze()

In [None]:
# Preprocess with Random Trees Embedding: fit the embedding on the training
# features, then encode both the training and the submission features with it.
from sklearn.ensemble import RandomTreesEmbedding

rte = RandomTreesEmbedding(
    n_estimators=100,
    max_depth=50,
    min_samples_split=20,
    min_samples_leaf=10,
    min_weight_fraction_leaf=0.0,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    # `min_impurity_split=None` is deliberately not passed: None was already
    # the default, and the keyword was removed in scikit-learn 1.0, where
    # passing it raises a TypeError.
    sparse_output=True,
    n_jobs=1,
    random_state=None,  # NOTE(review): unseeded, so the embedding differs between runs
    verbose=0,
    warm_start=False,
)

# fit() returns the estimator itself, not transformed data. Binding its result
# to `x_train` (as the previous code did) replaced the training matrix with the
# estimator and broke the transform below.
rte.fit(x_train)


def preprocess(x_in):
    """Encode `x_in` with the fitted embedding `rte`."""
    return rte.transform(x_in)


# NOTE: this rebinds x_train / x_submit to their encoded form, so the cell is
# not idempotent; re-run the cell that builds the raw arrays before re-running
# this one.
x_train, x_submit = map(preprocess, [x_train, x_submit])

## auto-sklearn

In [None]:
import autosklearn.classification
import sklearn.model_selection
import sklearn.metrics

In [None]:
# Auto-sklearn API
# http://automl.github.io/auto-sklearn/dev/api.html
#
# Cross-validation from
# https://github.com/automl/auto-sklearn/blob/master/example/example_crossvalidation.py

import time

# Classifier searched under a 3-fold cross-validation resampling strategy.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=30*60,
    per_run_time_limit=30,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 3},
)

# raw data
# Run fit and then refit on copies of the training data, printing a
# timestamp before each stage and once more at the end.
for stage, run_stage in (('start fit', automl.fit), ('start refit', automl.refit)):
    print(time.ctime(), stage)
    run_stage(x_train.copy(), y_train.copy())
print(time.ctime(), 'end')

# log(x+1)
# automl.fit(np.log(x_train.values+1), y_train.squeeze())

print(automl.sprint_statistics())

In [None]:
# The "validation" set is just the training set under another name, so the
# scores computed from it are in-sample.
x_valid, y_valid = x_train, y_train

# Display both shapes.
tuple(part.shape for part in (x_valid, y_valid))

In [None]:
# raw data
# Hard labels first, then class probabilities, for the validation inputs.
y_hat, y_prob = automl.predict(x_valid), automl.predict_proba(x_valid)

# log(x+1)
# y_hat = automl.predict(np.log(x_valid.values+1))
# y_prob = automl.predict_proba(np.log(x_valid.values+1))

print("Accuracy score", sklearn.metrics.accuracy_score(y_valid, y_hat))

# Keep only the probability column(s) whose class label equals 1.
# The boolean mask is built from auto-sklearn's private `_automl._classes`.
y_prob = y_prob[:, automl._automl._classes[0] == 1]

print("Log loss", sklearn.metrics.log_loss(y_valid, y_prob))

In [None]:
# print(automl.show_models())

## make a submission from automl

In [None]:
# Predict on `x_submit`, the submission feature matrix that went through the
# same preprocessing as `x_train`. The previous code passed the raw
# df['submit'][cols['features']].values instead, which does not match the
# (embedded) features the model was fitted on.
y_pred = automl.predict_proba(x_submit)

# Store the probability of class 1 in the target column of the submission
# frame; the mask comes from auto-sklearn's private `_automl._classes`.
df['submit'][cols['target'][0]] = y_pred[:, automl._automl._classes[0] == 1]

# test[cols_target[0]].head().reset_index()

In [None]:
# Upload frame: the 'Unnamed: 0' column (presumably the row identifier, to be
# confirmed against the raw file) plus the predicted target column.
upload_columns = ['Unnamed: 0', cols['target'][0]]

# The identifier column is renamed to an empty header.
df['upload'] = df['submit'][upload_columns].rename(columns={'Unnamed: 0': ''})

df['upload'].head()

In [None]:
# Hand the upload frame to the project's submission helper.
# NOTE(review): presumably writes the submission file; confirm in src.features.
from src.features import make_submission
make_submission(df['upload'])

## statsmodels GLM

- Idea from https://github.com/jthalstead/DrivenData---Blood-Donations/blob/master/blood_single_glm.R
- by user https://www.drivendata.org/users/jackh/
- Also note that all the 0.1311 submissions rely on the hack of using an external dataset from UCI, as listed [here](https://community.drivendata.org/t/using-uci-data-to-achieve-0-1311-on-lb-just-for-fun-here/883)

In [None]:
# Display the dtypes of the training features and target before fitting the GLM.
x_train.dtype, y_train.dtype

In [None]:
import statsmodels.api as sm

# statsmodels expects the design matrix as a dense array-like. After the
# Random Trees Embedding step `x_train` may be a scipy sparse matrix
# (sparse_output=True), so densify it when possible.
glm_exog = x_train.toarray() if hasattr(x_train, 'toarray') else x_train

# GLM's signature is GLM(endog, exog, ...): the response comes first and the
# design matrix second. The previous call passed them in the opposite order.
# NOTE(review): the Gamma family assumes a strictly positive response;
# confirm it suits this target.
gamma_model = sm.GLM(y_train, glm_exog, family=sm.families.Gamma())
gamma_results = gamma_model.fit()

print(gamma_results.summary())