In [1]:
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

## Read data

In [43]:
# Both raw CSVs live in ../data/raw relative to this notebook.
raw_dir = os.path.join(os.path.pardir, 'data', 'raw')
train = pd.read_csv(os.path.join(raw_dir, 'train.csv'))
test = pd.read_csv(os.path.join(raw_dir, 'test.csv'))

In [3]:
# The row-id column must not contain repeated values.
assert train['Unnamed: 0'].is_unique

# Equivalent de-duplication / check for the test set, currently disabled:
#test = test[~test['Unnamed: 0'].duplicated()]
#assert not test['Unnamed: 0'].duplicated().any()

In [4]:
# Promote the row-id column to the index of the training frame.
# Guarded so that re-running this cell (when the column has already been
# moved into the index) is a no-op instead of raising KeyError.
if 'Unnamed: 0' in train.columns:
    train = train.set_index('Unnamed: 0')
# `test` deliberately keeps 'Unnamed: 0' as a regular column: the submission
# cell selects it by name.
#test  = test.set_index('Unnamed: 0')

In [5]:
# Dimensions and column labels of the training frame.
(train.shape, train.columns)

((576, 5), Index(['Months since Last Donation', 'Number of Donations',
        'Total Volume Donated (c.c.)', 'Months since First Donation',
        'Made Donation in March 2007'],
       dtype='object'))

In [6]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0_level_0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
619,2,50,12500,98,1
664,0,13,3250,28,1
441,1,16,4000,35,1
160,2,20,5000,45,1
358,1,24,6000,77,0


In [44]:
# Dimensions and column labels of the test frame.
(test.shape, test.columns)

((200, 5),
 Index(['Unnamed: 0', 'Months since Last Donation', 'Number of Donations',
        'Total Volume Donated (c.c.)', 'Months since First Donation'],
       dtype='object'))

In [8]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0_level_0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
659,2,12,3000,52
276,21,7,1750,38
263,4,1,250,4
303,11,11,2750,38
83,4,12,3000,34


In [9]:
# Target column (kept as a one-element list so it can index a DataFrame),
# and every remaining column, alphabetically ordered, as the feature set.
cols_target = ['Made Donation in March 2007']
cols_features = sorted(set(train.columns) - set(cols_target))
cols_features

['Months since First Donation',
 'Months since Last Donation',
 'Number of Donations',
 'Total Volume Donated (c.c.)']

## train/validation split

In [59]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for validation. 0 keeps every row for training,
# leaving the validation frames empty.
test_size = 0.  # 0.3 FIXME
# Fixed seed so the row shuffle (and therefore the split) is repeatable.
split_seed = 0

if test_size > 0:
    x_train, x_valid, y_train, y_valid = train_test_split(
        train[cols_features], train[cols_target],
        test_size=test_size, random_state=split_seed)
else:
    # Current scikit-learn raises ValueError for test_size=0, so produce the
    # same result by hand: shuffle all rows into the training part and leave
    # empty validation frames with matching columns.
    shuffled = train.sample(frac=1, random_state=split_seed)
    x_train, y_train = shuffled[cols_features], shuffled[cols_target]
    x_valid, y_valid = x_train.iloc[:0], y_train.iloc[:0]

x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((576, 4), (0, 4), (576, 1), (0, 1))

## auto-sklearn

In [11]:
import autosklearn.classification
import sklearn.model_selection
import sklearn.metrics

In [None]:
# Auto-sklearn API
# http://automl.github.io/auto-sklearn/dev/api.html
#
# Cross-validation from
# https://github.com/automl/auto-sklearn/blob/master/example/example_crossvalidation.py
import time

# Search settings: 5-fold cross-validation; the time limits are left at the
# library defaults (the explicit values are kept below for reference).
cv_settings = dict(
    # time_left_for_this_task=60*60,
    # per_run_time_limit=60,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
automl = autosklearn.classification.AutoSklearnClassifier(**cv_settings)


def _fresh_training_arrays():
    """Return new copies of the raw feature matrix and the squeezed target."""
    return x_train.values.copy(), y_train.squeeze().copy()


# raw data
print(time.ctime(), 'start fit')
automl.fit(*_fresh_training_arrays())
# The linked cross-validation example calls refit after fit.
print(time.ctime(), 'start refit')
automl.refit(*_fresh_training_arrays())
print(time.ctime(), 'end')

# log(x+1)
# automl.fit(np.log(x_train.values+1), y_train.squeeze())

print(automl.sprint_statistics())



auto-sklearn results:
  Dataset name: 2d2dcee1b79ffb23ed1259f25c937587
  Metric: accuracy
  Best validation score: 0.779514
  Number of target algorithm runs: 33
  Number of successful target algorithm runs: 31
  Number of crashed target algorithm runs: 2
  Number of target algorithms that exceeded the memory limit: 0
  Number of target algorithms that exceeded the time limit: 0



In [65]:
# Point the validation names at the training data itself, so every metric
# computed from x_valid / y_valid below is an in-sample (training) score.
x_valid, y_valid = x_train, y_train
(x_valid.shape, y_valid.shape)

((576, 4), (576, 1))

In [73]:
# raw data/
y_hat = automl.predict(x_valid.values)
y_prob = automl.predict_proba(x_valid.values)

# log(x+1)
# y_hat = automl.predict(np.log(x_valid.values+1))
# y_prob = automl.predict_proba(np.log(x_valid.values+1))

print("Accuracy score", sklearn.metrics.accuracy_score(y_valid, y_hat))

y_prob = y_prob[:,automl._automl._classes[0] == 1]

print("Log loss", sklearn.metrics.log_loss(y_valid, y_prob))

# On raw data
# time_left_for_this_task=120, per_run_time_limit=20: Accuracy score 0.75, log loss: 0.55
# time_left_for_this_task=240, per_run_time_limit=20: same
#
# On log(x+1)
# time_left_for_this_task=120, per_run_time_limit=20: Accuracy score 0.70, log loss: 0.62

Accuracy score 0.8402777777777778
Log loss 0.4736615021872018


In [58]:
# print(automl.show_models())

## make a submission from automl

In [67]:
# Score the test rows and store the class-1 probability as a new column on
# `test`, named after the target column.
test_matrix = test[cols_features].values
y_pred = automl.predict_proba(test_matrix)
positive_class_selector = automl._automl._classes[0] == 1
test[cols_target[0]] = y_pred[:, positive_class_selector]

# test[cols_target[0]].head().reset_index()

In [68]:
# Submission frame: the row-id column (written with an empty header) plus
# the predicted probability column.
id_column = 'Unnamed: 0'
submission_columns = [id_column, cols_target[0]]
to_submit = test[submission_columns].rename(columns={id_column: ''})
to_submit.head(n=5)

Unnamed: 0,Unnamed: 1,Made Donation in March 2007
0,659,0.515962
1,276,0.465728
2,263,0.480432
3,303,0.482545
4,83,0.503988


In [69]:
import datetime as dt
from zipfile import ZipFile, ZIP_DEFLATED

# Write the submission as a timestamped CSV under ../data/interim and also
# as a zip archive next to it (same name with a '.zip' suffix appended).
out_dir = os.path.join(os.path.pardir, 'data', 'interim')
# Create the output directory on first use instead of failing in to_csv.
os.makedirs(out_dir, exist_ok=True)

fn = 'submission_%s.csv'%(dt.datetime.today().strftime('%Y%m%d_%H%M%S'))
fn = os.path.join(out_dir, fn)
#fn
to_submit.to_csv(fn, index=False)

with ZipFile('%s.zip'%fn, 'w', ZIP_DEFLATED) as myzip:
    # Store the CSV under its bare file name; without arcname the archive
    # member would be recorded with the '../data/interim/' path prefix.
    myzip.write(fn, arcname=os.path.basename(fn))