In [10]:
# Imports
# The star import stays first so that the explicit imports below keep
# precedence over any same-named objects exported by `helpers`.
from helpers import *

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

### Numpy Print Options ###
# Show longer and wider array dumps than NumPy does out of the box.
np.set_printoptions(
    precision=2,     # digits printed after the decimal point
    linewidth=180,   # NumPy default: 75
    edgeitems=10,    # NumPy default: 3
    threshold=2000,  # NumPy default: 1000
)

In [11]:
# Column name -> dtype mapping passed to pd.read_csv for both cleaned CSVs:
# 8-bit unsigned integers for the numeric columns, pandas categoricals for
# the rest.
dtypes = dict(
    survived=np.uint8,
    honor='category',
    sex='category',
    clare=np.uint8,
    age=np.uint8,
    fare=np.uint8,
    cabin='category',
    origin='category',
    alone='category',
    port='category',
    sisp=np.uint8,
    pach=np.uint8,
)

In [36]:
# Both cleaned CSVs are read with the same options: header in the first row,
# first column used as the index, column dtypes taken from `dtypes`.
_read_opts = dict(dtype=dtypes, header=0, index_col=0)
train_raw = pd.read_csv('data/cleaned_train.csv', **_read_opts)
test_raw = pd.read_csv('data/cleaned_test.csv', **_read_opts)

In [37]:
# One-hot encode the categorical columns of each split.
train = pd.get_dummies(train_raw)
test = pd.get_dummies(test_raw)

# Per-sex subsets, selected through the dummy columns created above.
train_men = train[train['sex_male'] == 1]
train_women = train[train['sex_female'] == 1]
test_men = test[test['sex_male'] == 1]
test_women = test[test['sex_female'] == 1]

# Feature matrices (everything except the target) ...
X = train.drop(columns=['survived'])
M = train_men.drop(columns=['survived'])
W = train_women.drop(columns=['survived'])

# ... and the matching target vectors.
y = train['survived']
m = train_men['survived']
w = train_women['survived']

In [38]:
def pct(v):
    """Scale ``v`` by 100 and truncate to an int (e.g. 0.789 -> 78)."""
    scaled = v * 100
    return int(scaled)

def test_data(X, y, title, clf, sampler=None, splits=3, random_state=None):
    """Cross-validate ``clf`` on ``(X, y)`` and print per-fold and averaged scores.

    For every fold the classifier is refit on the training part (resampled
    first when ``sampler`` is given), the six largest feature importances are
    printed, and accuracy plus the F1 scores of classes ``'0'`` and ``'1'``
    are collected. The final line printed holds the fold averages as
    truncated integer percentages, ordered ``[accuracy, F1 class 0, F1 class 1]``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc`` and the
        column names label the feature importances.
    y : pandas.Series
        Target, aligned row-for-row with ``X``.
    title : str
        Label printed next to the averaged scores.
    clf : estimator
        Object providing ``fit``, ``predict`` and ``feature_importances_``.
        It is refit in place on every fold.
    sampler : object, optional
        Object providing ``fit_resample(X, y)``; applied to the training part
        of each fold only, never to the held-out part.
    splits : int, default 3
        Number of folds.
    random_state : int or None, default None
        Seed forwarded to ``KFold``. ``None`` keeps the previous behaviour of
        a different shuffle on every call.
    """
    kfold = KFold(n_splits=splits, shuffle=True, random_state=random_state)
    avgs = []
    for train_i, test_i in kfold.split(X):
        X_train, X_test = X.iloc[train_i], X.iloc[test_i]
        y_train, y_test = y.iloc[train_i], y.iloc[test_i]
        if sampler is not None:
            X_train, y_train = sampler.fit_resample(X_train, y_train)
        model = clf.fit(X_train, y_train)
        preds = model.predict(X_test)
        # Six most important features, printed as "<percent><name cut to 10 chars>".
        top = sorted(zip(model.feature_importances_, X.columns), reverse=True)[:6]
        print(' '.join('{1}{0:.10}'.format(k, pct(v)) for v, k in top))
        report = metrics.classification_report(y_test, preds, output_dict=True)
        avgs.append((metrics.accuracy_score(y_test, preds), report['0']['f1-score'], report['1']['f1-score']))
    # Average over the folds actually run; this used to divide by a fixed 3,
    # which gave wrong averages whenever splits != 3.
    avg = [pct(sum(lst) / len(avgs)) for lst in zip(*avgs)]
    print(avg, ' ac 0f 1f', '='*5, title, '='*5)


def tests():
    """Cross-validate every configured model on the full training set.

    The boosted ensembles are evaluated first, then a ``'=='`` separator is
    printed, then the single decision trees. Models and the sampler are read
    from the notebook's module-level names.
    """
    boosted_runs = (
        ('All', abc, None),
        ('All Samp', abc, samp),
        ('All Weighted', wabc, None),
    )
    for title, clf, sampler in boosted_runs:
        test_data(X, y, title, clf, sampler=sampler)

    print('==')

    tree_runs = (
        ('Single Tree', dtc, None),
        ('Single Weighted Tree', wdtc, None),
        ('Single Tree 2', dtc2, None),
        ('Single Weighted Tree 2', wdtc2, None),
        ('Single Samp Tree 2', dtc2, samp),
    )
    for title, clf, sampler in tree_runs:
        test_data(X, y, title, clf, sampler=sampler)

In [39]:
# Hyper-parameters shared by the depth-1 stumps and by the depth-unbounded
# trees; within each pair only class_weight differs (the "w" prefix marks
# the class_weight='balanced' variant).
_stump_params = dict(max_depth=1, criterion='entropy', min_samples_split=2, min_samples_leaf=10)
_tree_params = dict(max_depth=None, criterion='entropy', min_samples_split=50, min_samples_leaf=10)

dtc = DecisionTreeClassifier(class_weight=None, **_stump_params)
wdtc = DecisionTreeClassifier(class_weight='balanced', **_stump_params)
dtc2 = DecisionTreeClassifier(class_weight=None, **_tree_params)
wdtc2 = DecisionTreeClassifier(class_weight='balanced', **_tree_params)

# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
# 1.4), while the first positional parameter is the weak learner either way.
abc = AdaBoostClassifier(dtc, n_estimators=50, learning_rate=.1)
wabc = AdaBoostClassifier(wdtc, n_estimators=50, learning_rate=.1)

# Random oversampling of the minority class only.
samp = RandomOverSampler(sampling_strategy='minority')

In [40]:
# Run every evaluation configuration listed in tests(). Scores vary from run
# to run: neither the K-fold shuffle nor the random oversampler is seeded.
tests()

30clare 28honor_mr 16age 10sex_female 6honor_mrs 6cabin_0
32honor_mr 24clare 10age 8sex_male 8fare 8cabin_1
40honor_mr 28clare 12cabin_1 6sex_female 6origin_oth 6cabin_0
[78, 82, 71]  ac 0f 1f ===== All =====
42honor_mr 32clare 12cabin_1 6sex_male 6cabin_0 2sex_female
28honor_mr 24clare 10fare 10age 8cabin_0 6sex_male
34honor_mr 30clare 8origin_oth 8cabin_1 8cabin_0 6sex_female
[78, 80, 74]  ac 0f 1f ===== All Samp =====
34honor_mr 28clare 10cabin_1 8sex_male 8cabin_0 4sex_female
34honor_mr 26clare 12age 10cabin_0 6fare 4honor_mrs
34honor_mr 28clare 10sex_male 10cabin_1 8cabin_0 8age
[77, 80, 73]  ac 0f 1f ===== All Weighted =====
==
100honor_mr 0sex_male 0sex_female 0origin_oth 0origin_eng 0honor_mrs
100honor_mr 0sex_male 0sex_female 0origin_oth 0origin_eng 0honor_mrs
100honor_mr 0sex_male 0sex_female 0origin_oth 0origin_eng 0honor_mrs
[79, 82, 73]  ac 0f 1f ===== Single Tree =====
100honor_mr 0sex_male 0sex_female 0origin_oth 0origin_eng 0honor_mrs
100honor_mr 0sex_male 0sex_female 0

In [28]:
# Output Test
# Fit the final tree on an oversampled copy of the full training set. The
# resampled data gets its own names so the module-level X / y stay untouched
# and earlier cells (e.g. the cross-validation) still see the original data
# when they are re-run.
X_final, y_final = samp.fit_resample(X, y)
model = dtc2.fit(X_final, y_final)
# train and test are one-hot encoded separately, so their dummy columns may
# differ in set or order. Align test to the training layout before
# predicting: columns missing from test are filled with 0, extras are dropped.
test_aligned = test.reindex(columns=X.columns, fill_value=0)
preds = model.predict(test_aligned)

In [29]:
# One-column frame of predictions keyed by the test index, with that index
# labelled 'PassengerId'.
res = pd.DataFrame({'Survived': preds}, index=test.index)
res.index.name = 'PassengerId'

In [30]:
# Write the predictions to CSV, including the index column and header row.
output_path = 'preds/preds_tree_oversampled_extra.csv'
res.to_csv(output_path, index=True, header=True)