In [0]:
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Show the value of EVERY top-level expression in a cell, not only the last
# one. Later cells rely on this: they interleave print() headings with bare
# expressions and expect each expression's value to be rendered.
InteractiveShell.ast_node_interactivity = "all"

# Global plot styling.
# NOTE(review): both calls change global plotting defaults, so the seaborn
# 'white' style presumably overrides part of 'ggplot' — confirm the intended look.
plt.style.use('ggplot')
sns.set_style('white')

In [0]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate and create the PyDrive client.
# Statement order matters: the user must be authenticated before the
# application-default credentials are requested.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default Google credentials instead of running
# PyDrive's own auth flow.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used by the next cell to fetch the CSV files.
drive = GoogleDrive(gauth)

In [0]:
# Get the file
# Each CSV is addressed by a hard-coded Drive file id and written to the
# notebook's working directory under the local name passed to GetContentFile.
# NOTE(review): the ids are only usable by accounts with access to those files
# — confirm sharing settings before handing the notebook to someone else.
downloaded_train = drive.CreateFile({'id':'1tXxl8wCuQyd5ldORKeBCyRy3n-kJ8Xbu'}) 
downloaded_train.GetContentFile('Kaggle_Titanic_train.csv')

downloaded_test = drive.CreateFile({'id':'1EQ0F86NM2QIByUX40A3YgtoqkKMMY40v'}) 
downloaded_test.GetContentFile('Kaggle_Titanic_test.csv')

In [0]:
# Read file as panda dataframe
# NOTE: pandas is already imported in the first cell; this repeat is harmless.
import pandas as pd
# Load the two CSVs written by the download cell into DataFrames.
df_train = pd.read_csv('Kaggle_Titanic_train.csv')
df_test = pd.read_csv('Kaggle_Titanic_test.csv')

In [0]:
# Inspect
# For each frame: the first two rows, then info()'s printed column summary.
# The head(2) calls are not the last expression of the cell, so they are only
# rendered because the first cell sets ast_node_interactivity = "all".
df_train.head(2)
df_train.info()

df_test.head(2)
df_test.info()

In [0]:
# General EDA
# One headed summary per variable of interest. Every result is rendered with
# an explicit display() call, so the cell shows all of them even when IPython
# is not configured to echo every expression.
from IPython.display import display


def _show_hist(values, axis_label):
    """Draw a labelled histogram of `values` and render it immediately.

    Wrapping the plot in a function keeps the Axes object from being echoed
    as cell output (the original `hist(), plt.show()` tuple was echoed).
    """
    ax = values.hist()
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Count')
    plt.show()


print('Prop Survived')
display(df_train.Survived.mean())
print('\n')

print('Ticket Class')
display(df_train.Pclass.value_counts())
print('\n')

print('Sex')
display(df_train.Sex.value_counts())
print('\n')

print('Age Dist')
_show_hist(df_train.Age, 'Age')
print('\n')

print('#siblings')
display(df_train.SibSp.value_counts())
print('\n')

print('#parents')
display(df_train.Parch.value_counts())
print('\n')

print('Fare Dist')
_show_hist(df_train.Fare, 'Fare')
print('\n')

print('Embarked')
display(df_train.Embarked.value_counts())

In [0]:
# Numerical EDA
# describe() with default arguments summarises the numeric columns only.
df_train.describe()

In [0]:
# Correlation heatmap
# Correlation is only defined for numeric columns. Select them explicitly:
# recent pandas no longer drops string columns (Name, Sex, Ticket, ...)
# silently inside corr() and raises instead.
numeric_cols = df_train.select_dtypes(include=[np.number])
sns.heatmap(numeric_cols.corr())

In [0]:
# Plot distns of Age vs Sex & Survived
# One row of axes per Sex value; within each row, one Age density curve per
# Survived value (hue).
facet = sns.FacetGrid(df_train , aspect=4, hue='Survived', row='Sex')
# NOTE(review): newer seaborn releases deprecate kdeplot's `shade=` in favour
# of `fill=` — confirm against the installed version before changing.
facet.map(sns.kdeplot, 'Age', shade=True)
facet.add_legend()
# Limit the x-axis to 0 .. oldest observed age.
facet.set(xlim=(0, df_train.Age.max()))

In [0]:
# Plot distns of Fare vs Survived
# One Fare density curve per Survived value, drawn on a log-scaled x-axis.
# NOTE(review): `size=` was renamed `height=` and `shade=` deprecated in favour
# of `fill=` in newer seaborn releases — confirm against the installed version.
facet = sns.FacetGrid(df_train , aspect=2, size=4, hue='Survived')
facet.map(sns.kdeplot, 'Fare', shade=True)
facet.add_legend()

# A log axis cannot start at 0 (matplotlib ignores or clips a non-positive
# limit), so start at the smallest strictly positive fare instead. The scale
# is applied before the limits so they are interpreted on the log axis.
min_positive_fare = df_train.Fare[df_train.Fare > 0].min()
facet.set(xscale='log').set(xlim=(min_positive_fare, df_train.Fare.max()))

In [0]:
# Plot distns of Survived vs categoricals
# One bar chart of mean Survived per level for each categorical feature.
# Looping replaces five copy-pasted stanzas and stops the FacetGrid repr from
# being echoed once per plot.
for cat_col in ['Embarked', 'Sex', 'Pclass', 'SibSp', 'Parch']:
    # NOTE(review): `size=` was renamed `height=` in newer seaborn releases —
    # confirm against the installed version before changing.
    facet = sns.FacetGrid(df_train , aspect=1, size=4)
    facet.map(sns.barplot, cat_col, 'Survived')

In [0]:
# Extract Titles from 'Name' variable
# The title is the text between the first comma and the following full stop,
# i.e. names are expected to look like "Surname, Title. Given names".
def _title_from_name(full_name):
    """Return the stripped text between the first ',' and the next '.'."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


raw_titles = df_train.Name.apply(_title_from_name)

# Agglomerate Titles to more generic categories
# (titles missing from this mapping become NaN)
Title_Dict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}

df_train['Title'] = raw_titles.map(Title_Dict)

# 'Name' is no longer needed once the title is extracted; remove it in place.
del df_train['Name']

df_train.head(2)

In [0]:
# Create family size
# FamSize = siblings/spouses + parents/children travelling with the passenger.
df_train['FamSize'] = df_train.SibSp + df_train.Parch

# Aggregate SibSp & Parch & FamSize vars
# Each count is bucketed into four 0/1 indicator columns: exactly 0, exactly 1,
# exactly 2, and 3-or-more.
df_train['SibSp_0'] = df_train.SibSp.map(lambda x: 1 if x==0 else 0)
df_train['SibSp_1'] = df_train.SibSp.map(lambda x: 1 if x==1 else 0)
df_train['SibSp_2'] = df_train.SibSp.map(lambda x: 1 if x==2 else 0)
df_train['SibSp_3+'] = df_train.SibSp.map(lambda x: 1 if x>=3 else 0)


df_train['Parch_0'] = df_train.Parch.map(lambda x: 1 if x==0 else 0)
df_train['Parch_1'] = df_train.Parch.map(lambda x: 1 if x==1 else 0)
df_train['Parch_2'] = df_train.Parch.map(lambda x: 1 if x==2 else 0)
df_train['Parch_3+'] = df_train.Parch.map(lambda x: 1 if x>=3 else 0)


# NOTE(review): the four FamSize_* columns below are computed from Parch, not
# from the FamSize column created above, so they are exact duplicates of
# Parch_*. This looks like a copy-paste slip (presumably df_train.FamSize was
# intended). When fixing it, keep any parallel feature-engineering code for the
# test set in sync, otherwise train and test features will disagree.
df_train['FamSize_0'] = df_train.Parch.map(lambda x: 1 if x==0 else 0)
df_train['FamSize_1'] = df_train.Parch.map(lambda x: 1 if x==1 else 0)
df_train['FamSize_2'] = df_train.Parch.map(lambda x: 1 if x==2 else 0)
df_train['FamSize_3+'] = df_train.Parch.map(lambda x: 1 if x>=3 else 0)

# The raw counts are dropped in place; only the indicator columns remain.
# Re-running this cell therefore fails because SibSp/Parch no longer exist.
df_train.drop(['SibSp', 'Parch', 'FamSize'], axis=1, inplace=True)

In [0]:
# Extract info from the Ticket variable

# df_train.Ticket.value_counts()

def cleanTicket(ticket):
    """Return the first non-numeric token of a ticket string, normalised.

    The ticket is lower-cased, '.' and '/' are removed, and the result is
    split on whitespace. The first token that is not made up purely of digits
    is returned; if there is no such token, 'XXX' is returned.
    """
    normalised = ticket.lower().replace('/', '').replace('.', '')
    for token in normalised.split():
        if not token.isdigit():
            return token
    return 'XXX'

# Tag each passenger with the cleaned ticket prefix ('XXX' when the ticket has
# no non-numeric token), then expand it into 0/1 prefix-family flags.
df_train['Ticket_clean'] = df_train.Ticket.map(cleanTicket)
prefix = df_train.Ticket_clean.str

# Families identified by the first letter of the prefix.
for flag_suffix, first_letter in [('a', 'a'), ('c', 'c'), ('f', 'f'), ('line', 'l'), ('p', 'p')]:
    df_train['Ticket_clean_' + flag_suffix] = prefix.startswith(first_letter).astype('int64')

# Prefixes starting with 's' are split in two: 'sc...' versus every other 's...'.
df_train['Ticket_clean_sc'] = prefix.startswith('sc').astype('int64')
df_train['Ticket_clean_so'] = (prefix.startswith('s') & ~prefix.startswith('sc')).astype('int64')
df_train['Ticket_clean_w'] = prefix.startswith('w').astype('int64')

# Only the flags are kept; the raw ticket and the helper column go.
df_train.drop(['Ticket', 'Ticket_clean'], axis=1, inplace=True)

In [0]:
# Extract info from the Cabin variable

# df_train.Cabin.value_counts()

def cleanCabin(cabin, unknown='U'):
    """Return the first character of a cabin string.

    Parameters
    ----------
    cabin : str
        Raw cabin value. Missing values (NaN / None) and empty strings are
        accepted.
    unknown : str, default 'U'
        Placeholder returned when `cabin` is missing or empty.

    Returns
    -------
    The first character of `cabin`, or `unknown` when there is none.
    """
    try:
        return cabin[0]
    except (TypeError, IndexError):
        # TypeError: NaN/None are not subscriptable; IndexError: empty string.
        return unknown


# Missing cabins become the placeholder 'U' before the first character is
# taken. fillna() is used as a plain expression instead of `inplace=True` on
# `df_train.Cabin`: mutating a column through attribute access is chained
# assignment, which does not reach the parent frame when pandas Copy-on-Write
# is active and would leave NaNs for cleanCabin to choke on.
df_train['Cabin_clean'] = df_train.Cabin.fillna('U').map(cleanCabin)

# Only the first character is kept; drop the raw cabin string.
df_train.drop('Cabin', axis=1, inplace=True)

In [0]:
# Convert categoricals into dummies
# drop_first=True omits the first level of every categorical, so that level is
# represented by all of its dummy columns being 0.
categorical_cols = ['Sex', 'Pclass', 'Embarked', 'Title', 'Cabin_clean']
df_train_cat = pd.get_dummies(df_train, columns=categorical_cols, drop_first=True)

# Passenger ids identify rows rather than describe them: make them the index.
df_train_cat = df_train_cat.set_index('PassengerId')

df_train_cat.info()

In [0]:
# Train-test split w/in df_train
from sklearn.model_selection import train_test_split

# Features are every column except the target. The target is taken from the
# SAME frame as the features so both carry the PassengerId index; previously
# it came from df_train, whose index differs.
features = df_train_cat.drop('Survived', axis=1)
target = df_train_cat['Survived']

# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.3, random_state=42)

In [0]:
# First pipeline w/ LogReg
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement SimpleImputer has the same default (mean imputation). Bind
# whichever exists to the name `Imputer`, which later cells also use.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Mean-impute missing values, then fit a default logistic regression. Inside
# the pipeline the imputer is fitted on the training split only.
pipeline = make_pipeline(Imputer(), LogisticRegression())
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Test accuracy of basic LogReg model
print('\n')
print('Accuracy score: ', accuracy_score(y_test, y_pred))

In [0]:
# Random forest for feature importances
from sklearn.metrics import confusion_matrix

# Learn the imputation statistics from the training split ONLY and re-use them
# on the held-out split. Re-fitting on X_test would let test-set statistics
# leak into the evaluation.
imp = Imputer()
X_train_imp = imp.fit_transform(X_train)
X_test_imp = imp.transform(X_test)

# Seeded so the reported accuracy and importances are repeatable.
# (`tree` is kept as the variable name although the model is a forest.)
tree = RandomForestClassifier(random_state=42)
tree.fit(X_train_imp, y_train)
y_pred = tree.predict(X_test_imp)
print('\n')
print('Accuracy score: ', accuracy_score(y_test, y_pred))

print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred))
print('\n')

# Feature Importances
# Names come from the un-imputed training frame, which is left intact above
# so this cell can be re-run.
feat_import = pd.Series(data=tree.feature_importances_, index=X_train.columns)\
                  .sort_values(ascending=True)
plt.figure()
feat_import.plot.barh()
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance from RandomForestClf')

In [0]:
# Create submission data
#
# Training and test features are produced by ONE function applied to the raw
# CSVs, so the two feature sets cannot drift apart. The model is then refit
# on every labelled row and used to predict the test set.
# Relies on Title_Dict, cleanTicket, cleanCabin, Imputer and
# RandomForestClassifier from earlier cells.

SUBMISSION_CATEGORICALS = ['Sex', 'Pclass', 'Embarked', 'Title', 'Cabin_clean']


def _engineer_features(raw):
    """Return a copy of a raw Titanic frame with the engineered columns added.

    Adds Title, the SibSp_*/Parch_*/FamSize_* and Ticket_clean_* 0/1 flags and
    Cabin_clean, then drops Name, SibSp, Parch, Ticket and Cabin. `raw` itself
    is not modified.
    """
    feats = raw.copy()

    # Title: text between the first comma and the next full stop, grouped
    # into broader categories (unmapped titles become NaN).
    feats['Title'] = feats.Name.apply(lambda x: x.split(',')[1].split('.')[0].strip()).map(Title_Dict)

    # 0 / 1 / 2 / 3-or-more flags. FamSize_* are derived from SibSp + Parch,
    # not from Parch alone.
    fam_size = feats.SibSp + feats.Parch
    for prefix, counts in (('SibSp', feats.SibSp), ('Parch', feats.Parch), ('FamSize', fam_size)):
        feats[prefix + '_0'] = (counts == 0).astype('int64')
        feats[prefix + '_1'] = (counts == 1).astype('int64')
        feats[prefix + '_2'] = (counts == 2).astype('int64')
        feats[prefix + '_3+'] = (counts >= 3).astype('int64')

    # Ticket prefix families; 's' prefixes are split into 'sc...' and the rest.
    ticket_prefix = feats.Ticket.map(cleanTicket).str
    for suffix, first_letter in (('a', 'a'), ('c', 'c'), ('f', 'f'), ('line', 'l'), ('p', 'p')):
        feats['Ticket_clean_' + suffix] = ticket_prefix.startswith(first_letter).astype('int64')
    feats['Ticket_clean_sc'] = ticket_prefix.startswith('sc').astype('int64')
    feats['Ticket_clean_so'] = (ticket_prefix.startswith('s') & ~ticket_prefix.startswith('sc')).astype('int64')
    feats['Ticket_clean_w'] = ticket_prefix.startswith('w').astype('int64')

    # First character of the cabin; missing cabins become 'U' first.
    feats['Cabin_clean'] = feats.Cabin.fillna('U').map(cleanCabin)

    return feats.drop(['Name', 'SibSp', 'Parch', 'Ticket', 'Cabin'], axis=1)


def _align_categories(train_feats, test_feats, columns):
    """Give each listed column identical category levels in both frames (in place).

    The levels are those observed in training. get_dummies then emits the same
    dummy columns, with the same dropped first level, for train and test even
    when a level is absent from one of them. Test values never seen in
    training become NaN, i.e. all-zero dummies.
    """
    for col in columns:
        levels = sorted(train_feats[col].dropna().unique())
        train_feats[col] = pd.Categorical(train_feats[col], categories=levels)
        test_feats[col] = pd.Categorical(test_feats[col], categories=levels)


df_test = pd.read_csv('Kaggle_Titanic_test.csv')
train_labelled = pd.read_csv('Kaggle_Titanic_train.csv')

train_feats = _engineer_features(train_labelled.drop('Survived', axis=1))
test_feats = _engineer_features(df_test)
_align_categories(train_feats, test_feats, SUBMISSION_CATEGORICALS)

# Convert categoricals into dummies
train_model_cat = pd.get_dummies(train_feats, columns=SUBMISSION_CATEGORICALS, drop_first=True)\
                    .set_index('PassengerId')
df_test_cat = pd.get_dummies(test_feats, columns=SUBMISSION_CATEGORICALS, drop_first=True)\
                .set_index('PassengerId')
assert list(train_model_cat.columns) == list(df_test_cat.columns), \
    'train/test feature columns differ'


# Make predictions for submission
# The imputer is fitted on the training rows only and re-used on the test
# rows, so test-set statistics never influence the model inputs.
imp = Imputer()
X_model = imp.fit_transform(train_model_cat)
y_model = train_labelled.Survived  # same row order as train_model_cat
X_submission = imp.transform(df_test_cat)
# Seeded so the submission is reproducible.
tree = RandomForestClassifier(random_state=42)
tree.fit(X_model, y_model)
y_submission = tree.predict(X_submission)

In [0]:
# Two-column frame: passenger ids (the index of df_test_cat) paired with the
# predictions in y_submission.
df_submission = pd.DataFrame({'PassengerId': df_test_cat.index, 'Survived': y_submission})
# index=False keeps the DataFrame's row index out of the CSV.
df_submission.to_csv('titanic_submission.csv', index = False)

# Colab-specific: pass the written CSV to files.download.
from google.colab import files
files.download('titanic_submission.csv')