In [None]:
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

## Read data

In [None]:
# Load the raw train/test CSV files from ../data/raw (relative to the notebook).
train = pd.read_csv(os.path.join(os.pardir, 'data', 'raw', 'train.csv'))
test = pd.read_csv(os.path.join(os.pardir, 'data', 'raw', 'test.csv'))

In [None]:
# The 'Unnamed: 0' column becomes the index further down, so it has to be
# unique in both frames.
assert train['Unnamed: 0'].is_unique

# Rows of the test file that repeat an id are dropped (first occurrence kept).
test = test.drop_duplicates(subset='Unnamed: 0')
assert test['Unnamed: 0'].is_unique

In [None]:
# Use the id column as the row index of both frames.
# NOTE: this cell is not idempotent; after it has run, 'Unnamed: 0' is no longer
# a column, so re-running it without reloading the data raises a KeyError.
train = train.set_index('Unnamed: 0')
test  = test.set_index('Unnamed: 0')

In [None]:
# Quick sanity check: (rows, columns) and column names of the training frame.
train.shape, train.columns

In [None]:
# Peek at the first rows of the training frame.
train.head()

In [None]:
# Same sanity check for the (de-duplicated) test frame.
test.shape, test.columns

In [None]:
# Peek at the first rows of the test frame.
test.head()

In [None]:
# The label column is the prediction target; every other training column is
# used as a model input. Sorting gives a stable column order.
cols_target = ['Made Donation in March 2007']
cols_features = sorted(set(train.columns).difference(cols_target))
cols_features

## Plot

## Train/validation split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation.
# The split is seeded: without a fixed random_state every kernel restart draws
# a different partition, so the scores computed below would be neither
# reproducible nor comparable between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    train[cols_features],
    train[cols_target],
    test_size=0.3,
    random_state=0,
)
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

## sklearn SVM classification

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the untransformed features.
# random_state is fixed so the validation score is repeatable across runs.
clf = LinearSVC(random_state=0)
# y_train is a one-column DataFrame; squeeze() turns it into the 1-D label vector.
clf.fit(x_train, y_train.squeeze())
# Validation accuracy: ratio of number of correct predictions to total number.
clf.score(x_valid, y_valid.squeeze())

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on log(x + 1)-transformed features (the +1 keeps zeros finite).
# random_state is fixed so the validation score is repeatable across runs.
clf = LinearSVC(random_state=0)
clf.fit(np.log(x_train+1), y_train.squeeze())
# Validation accuracy: ratio of number of correct predictions to total number.
clf.score(np.log(x_valid+1), y_valid.squeeze())

In [None]:
from sklearn.svm import SVC

# Default-parameter SVC trained on the untransformed features.
clf = SVC().fit(x_train, y_train.squeeze())
# Validation accuracy (correct predictions / all predictions).
clf.score(x_valid, y_valid.squeeze())

In [None]:
from sklearn.svm import SVC

# Default-parameter SVC trained on log(x + 1)-transformed features.
clf = SVC().fit(np.log(x_train + 1), y_train.squeeze())
# Validation accuracy (correct predictions / all predictions); the validation
# features get the same transform as the training features.
clf.score(np.log(x_valid + 1), y_valid.squeeze())

## sklearn Decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the untransformed features.
# random_state is fixed so the fitted tree, and hence the score, is repeatable.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x_train, y_train.squeeze())
# Validation accuracy: ratio of number of correct predictions to total number.
clf.score(x_valid, y_valid.squeeze())

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on log(x + 1)-transformed features.
# random_state is fixed so the fitted tree, and hence the score, is repeatable.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(np.log(x_train+1), y_train.squeeze())
# Validation accuracy: ratio of number of correct predictions to total number.
clf.score(np.log(x_valid+1), y_valid.squeeze())

## sklearn ensemble method

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the untransformed features.
# A forest is built from random resamples/feature subsets, so the seed is
# fixed to make the validation score repeatable across runs.
clf = RandomForestClassifier(random_state=0)
clf.fit(x_train, y_train.squeeze())
# Validation accuracy: ratio of number of correct predictions to total number.
clf.score(x_valid, y_valid.squeeze())

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on log(x + 1)-transformed features.
# The seed is fixed to make the validation score repeatable across runs.
clf = RandomForestClassifier(random_state=0)
clf.fit(np.log(x_train+1), y_train.squeeze())
# Validation accuracy: ratio of number of correct predictions to total number.
clf.score(np.log(x_valid+1), y_valid.squeeze())

## sklearn random-forest embedding followed by naive Bayes

Based on the scikit-learn example: http://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_embedding.html#sphx-glr-auto-examples-ensemble-plot-random-forest-embedding-py
    

In [None]:
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB
import numpy as np

In [None]:
def calc_1(X, y, random_state=0):
    """Embed ``X`` with random trees and plot 2-D views of raw vs. embedded data.

    A ``RandomTreesEmbedding`` is fitted on ``X``. Both the raw features and
    the resulting embedding are then reduced to two components with
    ``TruncatedSVD`` and drawn side by side as scatter plots coloured by ``y``.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix, one row per sample.
    y : 1-D array-like
        One label per row of ``X``; only used to colour the scatter points.
    random_state : int, default 0
        Seed passed to the embedding and to both SVD projections, so that the
        returned embedding and the figure are reproducible.

    Returns
    -------
    X_transformed
        The embedding of ``X`` as returned by ``hasher.fit_transform(X)``.
    hasher : RandomTreesEmbedding
        The fitted embedder, to be reused for transforming held-out data.
    """
    # Use RandomTreesEmbedding to transform the data.
    hasher = RandomTreesEmbedding(n_estimators=10, random_state=random_state, max_depth=10)
    X_transformed = hasher.fit_transform(X)

    # Independent, seeded 2-D projections of the embedding and of the raw data
    # (used for visualisation only; neither projection is returned).
    X_reduced = TruncatedSVD(n_components=2, random_state=random_state).fit_transform(X_transformed)
    X_raw_2d = TruncatedSVD(n_components=2, random_state=random_state).fit_transform(X)

    # One row with two panels: raw data on the left, embedded data on the right.
    fig, (ax_raw, ax_embedded) = plt.subplots(1, 2, figsize=(9, 4))
    panels = (
        (ax_raw, X_raw_2d, "Original Data (2d)"),
        (ax_embedded, X_reduced,
         "Truncated SVD reduction (2d) of transformed data (%dd)" % X_transformed.shape[1]),
    )
    for ax, points, title in panels:
        ax.scatter(points[:, 0], points[:, 1], c=y, s=50, edgecolor='k')
        ax.set_title(title)
        # The SVD axes carry no interpretable units, so hide the ticks.
        ax.set_xticks(())
        ax.set_yticks(())

    fig.tight_layout()
    plt.show()

    return X_transformed, hasher

In [None]:
# Build one embedding from the raw training features and one from their
# log(x + 1) transform; keep each fitted embedder so the validation set can be
# transformed the same way later.
X_transformed_raw, hasher_raw = calc_1(     x_train.values, y_train.squeeze())
X_transformed_log, hasher_log = calc_1(np.log(x_train.values + 1), y_train.squeeze())

In [None]:
# Force a garbage-collection pass; the cell output is the value returned by gc.collect().
import gc
gc.collect()

In [None]:
def calc_2(X_transformed, y):
    """Fit three classifiers on the same features and return them.

    Despite the parameter name, this notebook calls the function both with
    random-trees embeddings and with plain (raw or log-transformed) features.

    Parameters
    ----------
    X_transformed : 2-D array-like or matrix
        Training features, one row per sample.
    y : 1-D array-like
        Training labels, one per row of ``X_transformed``.

    Returns
    -------
    tuple
        ``(nb, trees, rf)``: the fitted ``BernoulliNB``,
        ``ExtraTreesClassifier`` and ``RandomForestClassifier``, in that order.
    """
    # Both tree ensembles share the same size, depth limit and seed.
    ensemble_kwargs = dict(max_depth=30, n_estimators=100, random_state=0)

    fitted = (
        BernoulliNB(),
        ExtraTreesClassifier(**ensemble_kwargs),
        RandomForestClassifier(**ensemble_kwargs),
    )
    for estimator in fitted:
        estimator.fit(X_transformed, y)

    return fitted

In [None]:
# Baselines: the three classifiers fitted directly on the raw and on the
# log(x + 1)-transformed training features.
nb_raw, trees_raw, rf_raw = calc_2(       x_train.values   , y_train.squeeze())
nb_log, trees_log, rf_log = calc_2(np.log(x_train.values+1), y_train.squeeze())

# The same three classifiers fitted on the random-trees embeddings ("_t_").
nb_t_raw, trees_t_raw, rf_t_raw = calc_2(X_transformed_raw, y_train.squeeze())
nb_t_log, trees_t_log, rf_t_log = calc_2(X_transformed_log, y_train.squeeze())

In [None]:
# 1-D validation labels shared by all scoring cells below.
y2=y_valid.squeeze()

# Embed the validation features with the embedders fitted on the training set
# (transform only, no refitting), mirroring the raw / log variants above.
X2_transformed_raw = hasher_raw.transform(       x_valid.values)
X2_transformed_log = hasher_log.transform(np.log(x_valid.values + 1))

In [None]:
# Validation accuracy of (naive Bayes, extra trees, random forest) on raw features.
tuple(model.score(x_valid.values, y2) for model in (nb_raw, trees_raw, rf_raw))

In [None]:
# Validation accuracy of (naive Bayes, extra trees, random forest) on
# log(x + 1)-transformed features.
x_in = np.log(x_valid.values+1)
tuple(model.score(x_in, y2) for model in (nb_log, trees_log, rf_log))

In [None]:
# Validation accuracy of (naive Bayes, extra trees, random forest) on the
# embedding built from raw features.
tuple(
    model.score(X2_transformed_raw, y2)
    for model in (nb_t_raw, trees_t_raw, rf_t_raw)
)

In [None]:
# Validation accuracy of (naive Bayes, extra trees, random forest) on the
# embedding built from log(x + 1)-transformed features.
tuple(
    model.score(X2_transformed_log, y2)
    for model in (nb_t_log, trees_t_log, rf_t_log)
)

## auto-sklearn

In [None]:
import autosklearn.classification
import sklearn.model_selection
import sklearn.metrics

In [None]:
# Auto-sklearn on the raw features. The two limits are passed as
# time_left_for_this_task=120 and per_run_time_limit=10
# (presumably seconds — confirm against the auto-sklearn documentation).
automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=120, per_run_time_limit=10)
automl.fit(x_train.values, y_train.squeeze())
y_hat = automl.predict(x_valid.values)
# NOTE(review): y_valid is passed here as the one-column DataFrame, whereas the
# other scoring cells use y_valid.squeeze().
print("Accuracy score", sklearn.metrics.accuracy_score(y_valid, y_hat))

In [None]:
# print(automl.show_models())

### Try on log-transformed features

In [None]:
# Same auto-sklearn search on log(x + 1)-transformed features; wall-clock
# timestamps are printed before and after to show how long the cell took.
import time
print(time.ctime(), 'start')
automl_log = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=120, per_run_time_limit=10)
automl_log.fit(np.log(x_train.values+1), y_train.squeeze())
# The validation features must get the same transform as the training features.
y_hat2 = automl_log.predict(np.log(x_valid.values+1))
print(time.ctime(), 'end')
print("Accuracy score", sklearn.metrics.accuracy_score(y_valid, y_hat2))

## make a submission from automl

In [None]:
# Class probabilities for the real test set, from the model fitted on RAW
# features (automl, not automl_log), so the test features are passed untransformed.
y_pred = automl.predict_proba(test[cols_features].values)
y_pred.shape
# NOTE(review): predict_proba presumably returns one column per class; if so,
# the single-column assignment below would need one column selected first
# (e.g. the positive-class column) — confirm via the shape shown above.
#test[cols_target[0]] = y_pred
#test.head()