# Loading modules

In [None]:
from IPython.display import display, HTML

# Visualisation tools
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns

# Processing tools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score

# sklearn models
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
# Read the three input CSV files from the local data/ directory into
# DataFrames, in the order: gender_sub, train, test.
gender_sub, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/gender_submission.csv',
        'data/train.csv',
        'data/test.csv',
    )
)

# Having a look at inputs

In [None]:
# Quick overview of the training frame: its dimensions, the number of rows
# per 'Survived' value, and the number of distinct values in selected columns.
print(train.shape)
print(train.groupby('Survived').size())

# Only *report* the distinct-value counts. Assigning the count back into the
# column (as was previously done for 'Cabin') would overwrite every row with
# the same constant and destroy the feature.
# nunique(dropna=False) counts NaN as one value, matching len(Series.unique()).
for column in ('Cabin', 'Name', 'Ticket', 'Embarked'):
    print(column + ':', train[column].nunique(dropna=False))


In [None]:
# Preview the first rows of each loaded table; display() renders its
# arguments one after another, in the order given.
display(gender_sub.head(), train.head(), test.head())

# Now let's do some quick exploratory analysis with graphs.

In [None]:
# Exploratory plots: age distribution for all passengers versus the
# Survived == 1 and Survived == 0 subsets, followed by the overall Age and
# Fare distributions.

# Apply the seaborn style before creating the figure so the first plot
# picks it up as well.
sns.set(color_codes=True)

display(train.groupby('Survived')['Survived'].count())
train_survived = train[train['Survived'] == 1]
# Rows with Survived == 0. This previously filtered on == 1, which made the
# "Dead" curve an exact copy of the "Survived" curve.
train_ded = train[train['Survived'] == 0]

# Show a preview only; dumping the whole frame floods the cell output.
display(train.head())

fig = plt.figure()
sns.distplot(train['Age'].dropna(), label="All")
sns.distplot(train_ded['Age'].dropna(), label="Dead")
sns.distplot(train_survived['Age'].dropna(), label="Survived")
plt.legend()
plt.show()

# Drop missing ages here too, consistent with the overlay plot above.
sns.distplot(train['Age'].dropna())
plt.show()
sns.distplot(train['Fare'])
plt.show()

# Selecting and encoding features/labels

In [None]:
# Re-read the CSVs so that `train` is rebuilt from the file contents,
# independent of anything earlier cells did to it.
gender_sub, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/gender_submission.csv',
        'data/train.csv',
        'data/test.csv',
    )
)

# The 'Name' column is not used as a feature.
train = train.drop(columns=['Name'])
display(train.head())

# Missing ages are replaced by the mean age; every other missing value is
# replaced by the placeholder string '-1'.
mean = train['Age'].mean()
train = train.assign(Age=train['Age'].fillna(mean)).fillna('-1')

# Replace each listed column with integer codes from its own LabelEncoder.
categories_to_encode = ['Cabin', 'Embarked', 'Ticket', 'Sex']
for c in categories_to_encode:
    le = LabelEncoder()
    le.fit(train[c])
    train[c] = le.transform(train[c])

# Pairwise correlation of all columns, shown as a heatmap, followed by each
# column's correlation with 'Survived' in descending order.
corr = train.corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)
plt.show()
print(corr.loc[:, "Survived"].sort_values(ascending=False))

# Build the model inputs: drop the excluded columns, then split the
# 'Survived' column off as the label array.
features_to_drop = ['PassengerId', 'SibSp']
features = train.drop(columns=features_to_drop)
labels = features.pop('Survived').values


# Testing different classifier models.

In [None]:
# Compare several classifiers on the same features/labels using k-fold
# cross-validation, then plot and print the per-model accuracy.
#
# Every estimator that accepts a seed gets random_state=0 so that re-running
# the cell reproduces the same scores (GaussianNB takes no seed).
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    DecisionTreeClassifier(random_state=0),
    #KNeighborsClassifier(),
    #LinearSVC(),
    #SGDClassifier(loss='hinge', class_weight='balanced'),
    #MultinomialNB(),
    GaussianNB(),
    MLPClassifier(random_state=0),
    #GradientBoostingClassifier(),
    LogisticRegression(random_state=0),
]
CV = 5  # Folds in data

# Collect one (model_name, fold_idx, accuracy) record per model and fold,
# and build the DataFrame once at the end.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy)
        for fold_idx, accuracy in enumerate(accuracies)
    )
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the fold accuracies per model, with the individual fold
# scores overlaid as points.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
print(cv_df.groupby('model_name')['accuracy'].mean())

In [None]:
# Hold out 20% of the rows, fit a logistic regression on the remaining 80%,
# and report the accuracy on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=0,
)

# Seeded the same way as the LogisticRegression in the model comparison.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# accuracy_score's signature is (y_true, y_pred). Accuracy itself is
# symmetric, so the value is unchanged, but the documented order is used.
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)