In [None]:
# Imports
import csv
import random
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

### Numpy Print Options ###
# Wider / longer array reprs than numpy's defaults (default value noted per key)
_PRINT_OPTIONS = {
    'precision': 2,
    'linewidth': 180,   # default 75
    'edgeitems': 10,    # default 3
    'threshold': 2000,  # default 1000
}
np.set_printoptions(**_PRINT_OPTIONS)

In [None]:
# Read data into data frame
# Both CSVs are read with their first column as the row index
train = pd.read_csv('cleaned_train.csv', sep=',', header=0, index_col=0)
test = pd.read_csv('cleaned_test.csv', sep=',', header=0, index_col=0)

# Per-sex subsets (independent copies): sex == 0 feeds the "men" frames,
# sex == 1 the "women" frames
by_sex = ('sex == 0', 'sex == 1')
men, women = (train.query(rule).copy() for rule in by_sex)
men_test, women_test = (test.query(rule).copy() for rule in by_sex)

# Targets: the 'survived' column of each training frame
y, y_men, y_women = (frame['survived'] for frame in (train, men, women))

# Features: every column after the first one of each training frame
X_train, X_men, X_women = (frame.iloc[:, 1:].copy() for frame in (train, men, women))
X_test = test  # same object as `test`, not a copy

# Train and test features stacked row-wise
X_all = pd.concat((X_train, X_test), axis=0)

In [None]:
# Impute missing values

# fare=6 => fare=0 : missing fare was 3rd class old Mr. w/out cabin, alone, ticket 4
def impute_fare(df):
    """Replace fare code 6 with 0 in ``df['fare']``, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``fare`` column.

    Returns
    -------
    None
    """
    # Assign the replaced column back instead of calling replace(inplace=True)
    # on `df.fare`: the chained in-place form acts on an intermediate Series and
    # does not update `df` once pandas Copy-on-Write is active.
    df['fare'] = df['fare'].replace(6, 0)

# port=3 => port=1 : rows with port code 3 were 2nd class old women w/cabin fare 5 ticket 6
# (the original note called these "missing fare"; presumably missing *port* - TODO confirm)
def impute_port(df):
    """Replace port code 3 with 1 in ``df['port']``, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``port`` column.

    Returns
    -------
    None
    """
    # Assign back rather than replace(inplace=True) on `df.port`; the chained
    # in-place call does not reach `df` under pandas Copy-on-Write.
    df['port'] = df['port'].replace(3, 1)

# origin=0 => none

# age=14 => mean age for group with same pclass honor sisp pach cabin
def impute_age(df):
    """Overwrite placeholder ages (code 14) in place with a group mean age.

    For every row whose ``age`` equals 14, the mean ``age`` is taken over the
    rows with a different age value that match it, trying in order:

    1. same ``pclass``, ``honor``, ``sisp``, ``pach`` and ``cabin``;
    2. same ``pclass`` and ``honor``;
    3. every row whose age is not 14 (last resort, so the mean is never NaN).

    The mean is rounded to the nearest integer before being stored.  If no row
    has an age other than 14 there is nothing to average and ``df`` is left
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``pclass``, ``honor``, ``sisp``, ``pach``,
        ``cabin`` and ``age``.

    Returns
    -------
    None
    """
    group_cols = ['pclass', 'honor', 'sisp', 'pach', 'cabin']
    # Reference rows are fixed up front, so imputed values never feed later means
    known = df[df.age != 14]
    if known.empty:
        return
    known_keys = known[group_cols].values

    for pid, obs in df[df.age == 14].iterrows():
        matches = known[(known_keys == obs[group_cols].values).all(axis=1)]
        if matches.empty:
            matches = known[(known.pclass == obs.pclass) & (known.honor == obs.honor)]
        if matches.empty:
            # Without this the mean below is NaN and int(round(nan)) raises
            matches = known
        df.loc[pid, 'age'] = int(round(matches.age.mean()))

# Derived features

# Travelling alone? sisp=0 and pach=0
def add_alone(df):
    """Add an integer ``alone`` column to ``df`` and return ``df``.

    ``alone`` is 1 where both ``sisp`` and ``pach`` are 0, otherwise 0.
    The column is written onto the frame that was passed in.
    """
    no_sisp = df['sisp'].eq(0)
    no_pach = df['pach'].eq(0)
    df['alone'] = (no_sisp & no_pach).astype(int)
    return df

# How many family members?
def add_family(df):
    """Add a ``family`` column equal to ``sisp + pach`` and return ``df``.

    The column is written onto the frame that was passed in.
    """
    family_size = df['sisp'].add(df['pach'])
    df['family'] = family_size
    return df

# Give female children separate honorific
# honor == 4 (Miss) and age < 5 ==> honor = 6
def add_girls_honor(df):
    """Set ``honor`` to 6, in place, on rows where ``honor == 4`` and ``age < 5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``honor`` and ``age`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for consistency with the other
        ``add_*`` helpers (callers that ignore the return value are unaffected).
    """
    # One .loc assignment on the frame itself. Assigning through
    # `df.loc[pid].honor` writes to a temporary row object, so the frame
    # is not reliably updated.
    is_girl = (df['honor'] == 4) & (df['age'] < 5)
    df.loc[is_girl, 'honor'] = 6
    return df

# Better class: combine pclass and cabin
def add_pclabin(df):
    """Add a ``pclabin`` column equal to ``pclass + 3 * cabin`` and return ``df``.

    The column is written onto the frame that was passed in.
    """
    cabin_part = df['cabin'] * 3
    df['pclabin'] = df['pclass'] + cabin_part
    return df

def add_magic(df):
    """Add a ``magic`` flag column (1 by default, 0 for rule matches) and return ``df``.

    ``magic`` is set to 0 for:
    * rows with ``sex == 0`` that satisfy the first rule string below, and
    * rows with ``sex == 1`` that satisfy the second rule string below.
    Every other row keeps ``magic == 1``.  The column is written onto the
    frame that was passed in.
    """
    df['magic'] = 1

    sex0_rows = df.query('sex == 0')
    sex1_rows = df.query('sex == 1')

    sex0_hits = sex0_rows.query('honor == 3 or (pclass == 1 and age < 11) or age < 4 or fare == 5 or cabin == 1 or pclabin == 4')
    sex1_hits = sex1_rows.query('(pclass == 0 and (origin == 2 or origin == 3 or origin == 5)) or (pclass == 0 and alone == 0) or sisp == 3 or honor == 6')

    # Clear the flag for every index label that matched either rule
    for hit_labels in (sex0_hits.index, sex1_hits.index):
        df.loc[df.index.isin(hit_labels), 'magic'] = 0
    return df

def adjust_features(old_df):
    """Return a deep copy of ``old_df`` with imputations and derived columns applied.

    The input frame is not modified.  Steps run in a fixed order: the in-place
    fixes (fare, port, age, girls' honorific) first, then the derived columns
    (family, alone, pclabin, magic), since later steps read columns produced
    by earlier ones.
    """
    df = old_df.copy(deep=True)

    # These helpers modify `df` directly; their return value is not used
    for fix_in_place in (impute_fare, impute_port, impute_age, add_girls_honor):
        fix_in_place(df)

    # These helpers hand the frame back, so thread it through
    for derive in (add_family, add_alone, add_pclabin, add_magic):
        df = derive(df)

    return df

In [None]:
def counts(df):
    """Return per-column value counts of ``df``.

    The result has one row per distinct value and one column per column of
    ``df``; values absent from a column show up as NaN there.
    """
    return df.apply(pd.Series.value_counts, axis='index')

def get_errors(X, y, model):
    """Return the rows of ``X`` (with their true label) that ``model`` mispredicts.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows, positionally aligned with ``y``.
    y : pandas.Series
        True labels, one per row of ``X``.
    model : object
        Fitted estimator exposing ``predict(X)`` that returns one label per row.

    Returns
    -------
    pandas.DataFrame
        The misclassified rows of ``X`` with ``y`` appended as the last column,
        sorted by that label column.  The number of errors is also printed.
    """
    if X.shape[0]:
        # One batched predict call instead of one call per row
        predicted = np.asarray(model.predict(X)).ravel()
        wrong = np.flatnonzero(predicted != y.values)
    else:
        # Nothing to predict; many estimators reject empty input
        wrong = np.array([], dtype=int)

    errs = pd.concat([X.iloc[wrong], y.iloc[wrong]], axis=1, join='outer')
    print('Errors:', errs.shape[0])
    # Sort on the appended label column, whatever `y` is named
    return errs.sort_values(errs.columns[-1])

def test_data(X, y, label, clf, splits=4):
    """Print per-fold accuracy and classification reports for ``clf``.

    ``X`` and ``y`` are cut into ``splits`` consecutive (unshuffled) folds with
    ``KFold``.  For each fold ``clf`` is re-fitted on the remaining rows and
    scored on the held-out rows.  ``label`` is printed once as a banner.
    Nothing is returned; note that ``clf`` itself is fitted (and left fitted
    on the last fold's training rows).
    """
    banner = '=' * 10
    print('\n', banner, label, banner)

    for fit_rows, held_out_rows in KFold(n_splits=splits).split(X):
        fitted = clf.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        truth = y.iloc[held_out_rows]
        guesses = fitted.predict(X.iloc[held_out_rows])
        #print(' '.join('{0: <5.5}{1}'.format(k, v) for v, k in sorted(zip(fitted.feature_importances_, X.columns), reverse=True)))
        print(metrics.accuracy_score(truth, guesses))
        print(metrics.classification_report(truth, guesses))

def sample_features(X, y, clf, population, repeat, fixed=None, size=6):
    """Cross-validate ``clf`` on random subsets of the columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Full feature frame.
    y : pandas.Series
        Labels aligned with ``X``.
    clf : estimator
        Classifier handed to ``test_data`` for each sample.
    population : iterable of int
        Column positions (for ``X.iloc``) to draw from.
    repeat : int
        Number of random subsets to evaluate.
    fixed : int, optional
        Column position that is added to every subset.
    size : int, default 6
        Number of columns drawn from ``population`` per subset.

    Raises
    ------
    ValueError
        From ``random.sample`` if ``size`` exceeds the population size.
    """
    candidates = list(population)  # random.sample needs a sequence
    for _ in range(repeat):
        chosen = random.sample(candidates, size)
        if fixed is not None:
            chosen.append(fixed)
        X_sample = X.iloc[:, chosen]
        # test_data requires both a label and the classifier; previously the
        # classifier was passed in the label slot and `clf` was missing.
        label = 'Features: ' + ', '.join(map(str, X_sample.columns))
        test_data(X_sample, y, label=label, clf=clf)

In [None]:
# Weak learner for boosting: a depth-1 decision tree split on Gini impurity
tree = DecisionTreeClassifier(max_depth=1, criterion='gini')

# The weak learner is passed positionally because its keyword name differs
# between scikit-learn releases (`base_estimator` in older ones, `estimator`
# in newer ones); the first positional slot is the same in both.
# random_state pins the boosting run so repeated runs give the same model.
abc = AdaBoostClassifier(tree, n_estimators=50, learning_rate=.1, random_state=0)

# Unweighted logistic regression
lrc = LogisticRegression(class_weight=None)

In [None]:
# Run the same feature adjustment over every frame used further down
train_adj, men_adj, women_adj = map(adjust_features, (train, men, women))
men_adj_test, women_adj_test = map(adjust_features, (men_test, women_test))

# Adjust the full training features first, then split them by sex code
X_adj = adjust_features(X_train)
M_adj, W_adj = (X_adj.query(rule) for rule in ('sex == 0', 'sex == 1'))
print('Men:', M_adj.shape, 'Women:', W_adj.shape)

In [None]:
# The same four columns are removed from every modelling frame
DROPPED_COLUMNS = ['pach', 'alone', 'sex', 'cabin']
M_drop, W_drop, men_drop_test, women_drop_test = (
    frame.drop(columns=DROPPED_COLUMNS)
    for frame in (M_adj, W_adj, men_adj_test, women_adj_test)
)

In [None]:
# Cross-validate AdaBoost on the women-only feature frame
test_data(W_drop, y_women, label='Women Adjusted', clf=abc)

# W_pad / y_pad are only created by cells further down the notebook, so on a
# fresh top-to-bottom run they do not exist yet.  Skip instead of raising
# NameError; re-run this cell after the padding cells to get the comparison.
# FIXME: the cleaner fix is to move this call below the cell that builds W_pad.
if 'W_pad' in globals() and 'y_pad' in globals():
    test_data(W_pad, y_pad, label='Pad Adjusted', clf=abc)
else:
    print('Pad Adjusted: skipped (W_pad / y_pad have not been built yet)')

In [None]:
# Count plot over men_adj: 'fare' on the x axis, bars coloured by 'survived',
# one panel per value of 'magic'
g = sns.catplot(
    data=men_adj,
    kind='count',
    x='fare',
    hue='survived',
    col='magic',
    palette='husl',
)

In [None]:
# Fit a clone of the template estimator: `abc.fit(...)` returns `abc` itself,
# so every later fit of `abc` would silently replace this model.
abc_men = clone(abc).fit(M_drop, y_men)
# Predictions on the same rows the model was trained on (training-set fit)
abc_preds_men = abc_men.predict(M_drop)

In [None]:
# Fit a clone of the template estimator: `abc.fit(...)` returns `abc` itself,
# so fitting it directly would make this name an alias of every other `abc` fit.
abc_women = clone(abc).fit(W_drop, y_women)
# Predictions on the same rows the model was trained on (training-set fit)
abc_preds_women = abc_women.predict(W_drop)

In [None]:
# Training rows of the women frame that the women's AdaBoost model gets wrong
err_women = get_errors(X=W_drop, y=y_women, model=abc_women)

In [None]:
# W_pad / y_pad are built by cells further down, so they are missing on a fresh
# top-to-bottom run.  Skip rather than raise NameError; re-run after those cells.
# FIXME: the cleaner fix is to move this cell below the one that builds W_pad.
if 'W_pad' in globals() and 'y_pad' in globals():
    # Fit a clone: fitting `abc` itself would overwrite any model that is an
    # alias of `abc` (fit returns the estimator it was called on).
    err_new = get_errors(W_pad, y_pad, clone(abc).fit(W_pad, y_pad))
else:
    err_new = None
    print('err_new: skipped (W_pad / y_pad have not been built yet)')

In [None]:
# Display the misclassified rows collected in err_new
err_new

In [None]:
# One fresh id per misclassified row, numbered upward from FIRST_NEW_PID.
# The count follows len(err_women) so it cannot drift from the number of
# errors; the (1, n) shape is kept because later code uses `new_pids.T`.
FIRST_NEW_PID = 1400  # presumably above every existing passenger id - TODO confirm
new_pids = np.arange(FIRST_NEW_PID, FIRST_NEW_PID + len(err_women)).reshape(1, -1)

In [None]:
# Duplicate the misclassified rows under the new ids.  Column names and dtypes
# are taken from err_women itself, so this cannot fall out of step with the
# feature set the way a hand-typed column list can.
new_obs = err_women.copy()
new_obs.index = pd.Index(np.ravel(new_pids), name='pid')

In [None]:
# Women's features with their label attached as an extra column
temp = pd.concat(
    [W_drop, y_women],
    axis='columns',
    join='outer',
)

In [None]:
# Append the re-labelled copies in new_obs below the original rows
temp2 = pd.concat(
    [temp, new_obs],
    axis='index',
    join='outer',
)

In [None]:
# Split the padded table back into features (all but the last column, copied)
# and the 'survived' label column
W_pad, y_pad = temp2.iloc[:, :-1].copy(), temp2['survived']

In [None]:
# Template encoder.  handle_unknown='ignore' encodes categories that were not
# present at fit time as all-zero instead of raising when the test frames are
# transformed.
encoder = OneHotEncoder(handle_unknown='ignore')

# One independently fitted encoder per group.  `encoder.fit(...)` returns
# `encoder` itself, so fitting the template twice would leave both names
# pointing at a single encoder fitted on the women's frame only.
encoder_men = clone(encoder).fit(M_drop)
encoder_women = clone(encoder).fit(W_drop)

In [None]:
# One-hot encode the train and test frames of each group with that group's encoder
M_hot, men_hot_test = (
    encoder_men.transform(frame) for frame in (M_drop, men_drop_test)
)
W_hot, women_hot_test = (
    encoder_women.transform(frame) for frame in (W_drop, women_drop_test)
)

In [None]:
# Fit a clone of the template: `lrc.fit(...)` returns `lrc` itself, so a later
# fit of `lrc` on other data would silently replace this model.
lrc_men = clone(lrc).fit(M_hot, y_men)
# Predictions for the men's test rows
lrc_preds_men = lrc_men.predict(men_hot_test)

In [None]:
# Fit a clone of the template: `lrc.fit(...)` returns `lrc` itself, so fitting
# it directly would make this name an alias of every other `lrc` fit.
lrc_women = clone(lrc).fit(W_hot, y_women)
# Predictions for the women's test rows
lrc_preds_women = lrc_women.predict(women_hot_test)

In [None]:
# AdaBoost predictions on the men's training features
abc_pm = abc_men.predict(X=M_drop)

In [None]:
# Logistic-regression predictions on the men's one-hot training features
lrc_pm = lrc_men.predict(X=M_hot)

In [None]:
# AdaBoost predictions on the women's training features
abc_pw = abc_women.predict(X=W_drop)

In [None]:
# Logistic-regression predictions on the women's one-hot training features
lrc_pw = lrc_women.predict(X=W_hot)

In [None]:
def _same_as_previous(previous_name, current):
    """Compare ``current`` with the snapshot stored under ``previous_name``.

    Returns the element-wise ``previous == current`` result, or None when no
    snapshot exists yet (the ``*_prev`` names are only assigned by a later
    cell, so they are missing on a fresh top-to-bottom run).
    """
    previous = globals().get(previous_name)
    return None if previous is None else previous == current


abc_pm_eq = _same_as_previous('abc_pm_prev', abc_pm)
lrc_pm_eq = _same_as_previous('lrc_pm_prev', lrc_pm)
abc_pw_eq = _same_as_previous('abc_pw_prev', abc_pw)
lrc_pw_eq = _same_as_previous('lrc_pw_prev', lrc_pw)
#print(abc_pm_eq, lrc_pm_eq, abc_pw_eq, lrc_pw_eq)

In [None]:
# Keep independent copies of the current predictions for comparison on a later run
abc_pm_prev, lrc_pm_prev, abc_pw_prev, lrc_pw_prev = (
    current.copy() for current in (abc_pm, lrc_pm, abc_pw, lrc_pw)
)

In [None]:
# Training-set accuracy, printed in the order:
# men/AdaBoost, men/logistic, women/AdaBoost, women/logistic
for truth, predicted in (
    (y_men, abc_pm),
    (y_men, lrc_pm),
    (y_women, abc_pw),
    (y_women, lrc_pw),
):
    print(metrics.accuracy_score(truth, predicted))

In [None]:
# Row ids of the test frames, used as the PassengerId column
pids_men = men_drop_test.index
pids_women = women_drop_test.index

# Plain comma-separated output with no quoting
csv.register_dialect('ints', delimiter=',', escapechar=None, quoting=csv.QUOTE_NONE)

# mode='w' replaces any existing file (no append-then-truncate needed) and
# newline='' lets the csv writer control line endings, as the csv module
# requires; otherwise blank rows can appear on some platforms.
with open('preds10.csv', mode='w', newline='') as pred:
    fieldnames = ['PassengerId', 'Survived']
    writer = csv.writer(pred, dialect='ints')
    writer.writerow(fieldnames)
    # Men's rows first, then women's, each paired with its logistic-regression prediction
    writer.writerows(zip(pids_men, lrc_preds_men))
    writer.writerows(zip(pids_women, lrc_preds_women))