In [53]:
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops
import pandas as pd
from sklearn import datasets, svm, tree, preprocessing, metrics
import sklearn.ensemble as ske

In [22]:
# Both CSV files are comma separated with the column names on the first row.
csv_read_options = {'sep': ',', 'header': 0}

# df      -- contents of train.csv
# df_pred -- contents of test.csv
df = pd.read_csv('C:/GitHub/kaggle/titanic/data/train.csv', **csv_read_options)
df_pred = pd.read_csv('C:/GitHub/kaggle/titanic/data/test.csv', **csv_read_options)

In [23]:
def preprocess_dataframe(df, print_info=False):
    """
    Description:

    Performs preprocessing on the titanic data: drops unused columns, encodes
    Sex as 0/1, one hot encodes Pclass and Embarked, fills missing Age/Fare
    values with the column mean and min-max scales the numeric columns to [0, 1].

    The 'Survived' column is optional, so the same function can be used on a
    frame that carries no labels.

    PassengerId - Id
    Survived    - Survived  (0 = No; 1 = Yes)
    Pclass      - Passenger Class  (1 = 1st; 2 = 2nd; 3 = 3rd)
    Name        - Name
    Sex         - Sex
    Age         - Age
    SibSp       - Number of Siblings/Spouses Aboard
    Parch       - Number of Parents/Children Aboard
    Ticket      - Ticket Number
    Fare        - Passenger Fare (British pound)
    Cabin       - Cabin code
    Embarked    - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

    Arguments:
    df -- Dataset, Pandas DataFrame (the caller's frame is not modified)
    print_info -- If True, prints survival rates (only when 'Survived' is present),
                  data info, missing value counts and correlations

    Returns:
    df -- Preprocessed dataset, new Pandas DataFrame
    """

    has_labels = 'Survived' in df.columns

    # Survival rates have to be taken before the one hot encoding, while Pclass and
    # Embarked still exist as single columns. 'Survived' is selected before averaging
    # so that the non-numeric columns never take part in the mean.
    if print_info and has_labels:
        survival_by_pclass = df.groupby('Pclass')['Survived'].mean()
        survival_by_embark = df.groupby('Embarked')['Survived'].mean()

    # Drop columns with data deemed not relevant for learning
    # Name     - Gender already has its own column. Only thing that might be interesting here is the title
    # Ticket   - Ticket does not really say much, price is already included which says the most
    # Cabin    - Data is bad and it is hard to translate the numbers into something useful without more info
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Encode sex into binary (0 = male, 1 = female)
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0})

    # Split classes with one hot encoding
    # Pclass   - splits into (1 = Pclass_1, 2 = Pclass_2, 3 = Pclass_3)
    # Embarked - splits into (C = Embarked_C, Q = Embarked_Q, S = Embarked_S)
    df = pd.get_dummies(df, columns=['Pclass', 'Embarked'])

    # Missing Age values are replaced with the average; Fare is filled the same way so a
    # frame with a missing fare does not carry NaN into the features.
    # Assigning the result back (instead of a chained inplace fillna) is what guarantees
    # that the frame itself is updated.
    # Might also consider dividing age into classes of age brackets
    for column in ('Age', 'Fare'):
        df[column] = df[column].fillna(df[column].mean())

    # Normalize the data (min-max scaling)
    norm_vals = ['Age', 'SibSp', 'Parch', 'Fare']
    col_min = df[norm_vals].min()
    col_range = df[norm_vals].max() - col_min
    # A constant column has a zero range; divide by 1 instead so it scales to 0 rather than NaN
    col_range = col_range.where(col_range != 0, 1)
    df[norm_vals] = (df[norm_vals] - col_min) / col_range

    if print_info:
        if has_labels:
            print('--------------------------------------------------------------------------------------')
            print('SURVIVAL RATE')
            print('--------------------------------------------------------------------------------------')
            print('Overall survival rate: ' + str(df['Survived'].mean()))
            print()
            print(survival_by_pclass)
            print()
            print(survival_by_embark)
        print('--------------------------------------------------------------------------------------')
        print('DATA INFO')
        print('--------------------------------------------------------------------------------------')
        # DataFrame.info() writes its report itself and returns None, so it is not wrapped in print()
        df.info()
        print('--------------------------------------------------------------------------------------')
        print('MISSING VALUES')
        print('--------------------------------------------------------------------------------------')
        print(df.isnull().sum())
        print('--------------------------------------------------------------------------------------')
        print('CORRELATIONS')
        print('--------------------------------------------------------------------------------------')
        print(df.corr())
        print('--------------------------------------------------------------------------------------')

    return df

In [24]:
def split_data(df, train_frac=0.8, random_state=42):
    """
    Description:
    Splits the data into test/training set for simple validation and separates
    the 'Survived' label column from the feature columns.

    Arguments:
    df -- Dataset, pandas dataframe containing a 'Survived' column
    train_frac -- Fraction of the rows sampled into the training set (default 0.8)
    random_state -- Seed passed to DataFrame.sample so the split is repeatable (default 42)

    Returns:
    X_train -- Training features, numpy array (m_train, n_features)
    y_train -- Training labels, numpy array (m_train, )
    X_test -- Test features, numpy array (m_test, n_features)
    y_test -- Test labels, numpy array (m_test, )
    """

    # The test rows are found by dropping the sampled index labels. Renumbering the rows
    # first makes every label unique, so a frame with repeated index labels does not lose
    # extra rows in the drop. The sampled rows themselves are unaffected by the renumbering.
    rows = df.reset_index(drop=True)

    df_train = rows.sample(frac=train_frac, random_state=random_state)
    df_test = rows.drop(df_train.index)

    X_train = df_train.drop(['Survived'], axis=1).values
    y_train = df_train['Survived'].values

    X_test = df_test.drop(['Survived'], axis=1).values
    y_test = df_test['Survived'].values

    return X_train, y_train, X_test, y_test

In [25]:
# Preprocess the training frame without the diagnostic printout, then split it
# into training/test features and labels.
df_processed = preprocess_dataframe(df, print_info=False)
X_train, y_train, X_test, y_test = split_data(df_processed)

# Report the shape of each part of the split
for split_name, split_values in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print("{} shape: {}".format(split_name, split_values.shape))

X_train shape: (713, 11)
y_train shape: (713,)
X_test shape: (178, 11)
y_test shape: (178,)


In [45]:
def trainDecisionTreeClassifier(X_train, y_train, X_test, y_test, max_depth=10, random_state=None):
    """
    Description:
    Builds a decision tree classifier, fits it on the training set and
    prints its score on the test set

    Arguments:
    X_train -- Training features, numpy array (m_train, n_features)
    y_train -- Training labels, numpy array (m_train, )
    X_test -- Test features, numpy array (m_test, n_features)
    y_test -- Test labels, numpy array (m_test, )
    max_depth -- Maximum depth of the tree (default 10)
    random_state -- Optional seed for the estimator; pass an int to make the
                    fitted tree repeatable between runs (default None)

    Returns:
    dt_clf -- Fitted classifier, sklearn DecisionTreeClassifier
    """
    dt_clf = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    dt_clf.fit(X_train, y_train)

    # score() is evaluated on the held-out split, not on the data the tree was fitted on
    print(dt_clf.score(X_test, y_test))

    return dt_clf

In [48]:
# Fit the decision tree on the training split; the test-set score is printed by the call
dt_clf = trainDecisionTreeClassifier(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

0.8089887640449438


In [54]:
def randomForestClassifier(X_train, y_train, X_test, y_test, n_estimators=50, random_state=None):
    """
    Description:
    Builds a random forest classifier, fits it on the training set and
    prints its score on the test set

    Arguments:
    X_train -- Training features, numpy array (m_train, n_features)
    y_train -- Training labels, numpy array (m_train, )
    X_test -- Test features, numpy array (m_test, n_features)
    y_test -- Test labels, numpy array (m_test, )
    n_estimators -- Number of trees in the forest (default 50)
    random_state -- Optional seed for the estimator; pass an int to make the
                    fitted forest repeatable between runs (default None)

    Returns:
    rf_clf -- Fitted classifier, sklearn RandomForestClassifier
    """
    rf_clf = ske.RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf_clf.fit(X_train, y_train)

    # score() is evaluated on the held-out split, not on the data the forest was fitted on
    print(rf_clf.score(X_test, y_test))

    return rf_clf

In [55]:
# Fit the random forest on the training split; the test-set score is printed by the call
rf_clf = randomForestClassifier(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

0.8146067415730337


In [61]:
def gradientBoostingClassifier(X_train, y_train, X_test, y_test, n_estimators=50, random_state=None):
    """
    Description:
    Builds a gradient boosting classifier, fits it on the training set and
    prints its score on the test set

    Arguments:
    X_train -- Training features, numpy array (m_train, n_features)
    y_train -- Training labels, numpy array (m_train, )
    X_test -- Test features, numpy array (m_test, n_features)
    y_test -- Test labels, numpy array (m_test, )
    n_estimators -- Number of boosting stages (default 50)
    random_state -- Optional seed for the estimator; pass an int to make the
                    fitted model repeatable between runs (default None)

    Returns:
    gb_clf -- Fitted classifier, sklearn GradientBoostingClassifier
    """
    gb_clf = ske.GradientBoostingClassifier(n_estimators=n_estimators, random_state=random_state)
    gb_clf.fit(X_train, y_train)

    # score() is evaluated on the held-out split, not on the data the model was fitted on
    print(gb_clf.score(X_test, y_test))

    return gb_clf

In [62]:
# Fit the gradient boosting model on the training split; the test-set score is printed by the call
gb_clf = gradientBoostingClassifier(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

0.8370786516853933
