In [172]:
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops
import pandas as pd
import sklearn as sk
import sklearn.ensemble as ske
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [173]:
# Removes a warning in sklearn that will be fixed during an update mid 2018
import warnings

if __name__ == '__main__':
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    le = sk.preprocessing.LabelEncoder()
    le.fit([1, 2, 2, 6])
    le.transform([1, 1, 2, 6])
    le.inverse_transform([0, 0, 1, 2])

In [174]:
df = pd.read_csv('C:/GitHub/kaggle/titanic/data/train.csv', sep=',', header=0)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [175]:
def preprocess_dataframe(df, prediction_data=False, print_info=False):
    """
    Description:
    
    Performs preprocessing on the titanic data
    
    PassengerId - Id (Only available on training data)
    Survived    - Survived  (0 = No; 1 = Yes)
    Pclass      - Passenger Class  (1 = 1st; 2 = 2nd; 3 = 3rd)
    Name        - Name
    Sex         - Sex
    Age         - Age
    Sibsp       - Number of Siblings/Spouses Aboard
    Parch       - Number of Parents/Children Aboard
    Ticket      - Ticket Number
    Fare        - Passenger Fare (British pound)
    Cabin       - Cabin code
    Embarked    - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
    
    Arguments:
    df -- Dataset, Pandas DataFrame
    
    Returns:
    df -- Dataset, Pandas DataFrame
    """

    # Drop columns with data deemed not relevant for learning
    # Name     - Gender already has its' own column. Only thing that might be interesting here is the title
    # Ticket   - Ticket does not really say much, price and class are already included which says the most
    # Cabin    - Data is bad and it is hard to translate the numbers into something useful without more info
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    # Encode sex into binary (0 = male, 1 = female)
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0})

    # Gather info on the significance of these classes for survival
    # Class was, as expected, significance for surival with the rates (1st - 63%, 2nd - 47%, 3rd - 24%)
    # Embarked was suprisingly significant, C - Cherbourg had 55% surivial rate when the mean was just 38%
    if not prediction_data:
        survival_by_plcass = df.groupby('Pclass').mean()['Survived']
        survival_by_embark = df.groupby('Embarked').mean()['Survived']
    
    # Split classes with one hot encoding
    # Pclass   - splits into (1 = Pclass_1, 2 = Pclass_2, 3 = Pclass_3)
    # Embarked - splits into (C = Embarked_C, Q = Embarked_Q, S = Embarked_S)
    df = pd.get_dummies( df, columns = ['Pclass', 'Embarked'])

    # Age has missing values which is replaced with average
    # Might also consider dividing age into classes of age brackets
    df['Age'].fillna((df['Age'].mean()), inplace = True)
    df['Fare'].fillna((df['Fare'].mean()), inplace = True)
    
    # Normalize the data
    norm_vals = ['Age', 'SibSp', 'Parch', 'Fare']
    df[norm_vals]=(df[norm_vals]-df[norm_vals].min())/(df[norm_vals].max()-df[norm_vals].min())
    
    if print_info:
        if not prediction_data:
            print('--------------------------------------------------------------------------------------')
            print('SURVIVAL RATE')
            print('--------------------------------------------------------------------------------------')
            print('Overall survival rate: ' + str(df['Survived'].mean()))
            print()
            print(survival_by_plcass)
            print()
            print(survival_by_embark)
        print('--------------------------------------------------------------------------------------')
        print('DATA INFO')
        print('--------------------------------------------------------------------------------------')
        print(df.info())
        print('--------------------------------------------------------------------------------------')
        print('MISSING VALUES')
        print('--------------------------------------------------------------------------------------')
        print(df.isnull().sum())
        print('--------------------------------------------------------------------------------------')
        print('CORRELATIONS')
        print('--------------------------------------------------------------------------------------')
        print(df.corr())
        print('--------------------------------------------------------------------------------------')

    return df

In [176]:
def split_data(df):
    """
    Description:
    Splits the data into test/training set for simple validation

    Arguments:
    df -- Dataset, pandas dataframe
        
    Returns:
    train -- Training samples, pandas dataframe
    test -- Test samples, pandas dataframe
    """

    df_train = df.sample(frac = 0.8, random_state = 42)
    df_test = df.drop(df_train.index)  
    
    X_train = df_train.drop(['Survived'], axis=1).values
    y_train = df_train['Survived'].values
    
    X_test = df_test.drop(['Survived'], axis=1).values
    y_test = df_test['Survived'].values
    
    return X_train, y_train, X_test, y_test

In [177]:
df_processed = preprocess_dataframe(df, False, False)

X_train, y_train, X_test, y_test = split_data(df_processed)

print ("X_train shape: " + str(X_train.shape))
print ("y_train shape: " + str(y_train.shape))
print ("X_test shape: " + str(X_test.shape))
print ("y_test shape: " + str(y_test.shape))

df_processed.head(3)

X_train shape: (713, 11)
y_train shape: (713,)
X_test shape: (178, 11)
y_test shape: (178,)


Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0,0,0.271174,0.125,0.0,0.014151,0,0,1,0,0,1
1,1,1,0.472229,0.125,0.0,0.139136,1,0,0,1,0,0
2,1,1,0.321438,0.0,0.0,0.015469,0,0,1,0,0,1


In [178]:
def trainDecisionTreeClassifier(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a decision tree classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    dt_clf -- Classifier, sklearn DecisionTreeClassifier
    """
    dt_clf = sk.tree.DecisionTreeClassifier(max_depth=10)
    dt_clf.fit (X_train, y_train)
    print(dt_clf.score (X_test, y_test))
    
    return dt_clf

In [179]:
dt_clf = trainDecisionTreeClassifier(X_train, y_train, X_test, y_test)

0.7865168539325843


In [180]:
def randomForestClassifier(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a random forest classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    rf_clf -- Classifier, sklearn RandomForestClassifier
    """
    rf_clf = ske.RandomForestClassifier(n_estimators=50)
    rf_clf.fit (X_train, y_train)
    print(rf_clf.score (X_test, y_test))
    
    return rf_clf

In [181]:
rf_clf = randomForestClassifier(X_train, y_train, X_test, y_test) 

0.8258426966292135


In [182]:
def gradientBoostingClassifier(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a gradient boosting classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    rf_clf -- Classifier, sklearn GradiantBoostingClassifier
    """
    gb_clf = ske.GradientBoostingClassifier(n_estimators=50)
    gb_clf.fit (X_train, y_train)
    print(gb_clf.score (X_test, y_test))
    
    return gb_clf

In [183]:
gb_clf = gradientBoostingClassifier(X_train, y_train, X_test, y_test) 

0.8370786516853933


In [184]:
def logisticRegressionClassifier(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a logistic regression classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    rf_clf -- Classifier, sklearn LogisticRegression
    """
    lr_clf = LogisticRegression()
    lr_clf.fit (X_train, y_train)
    print(lr_clf.score (X_test, y_test))
    
    return lr_clf

In [185]:
lr_clf = logisticRegressionClassifier(X_train, y_train, X_test, y_test)

0.8033707865168539


In [186]:
def svmClassifier(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a support vector machine classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    svm_clf -- Classifier, sklearn SVC
    """
    svm_clf = sk.svm.SVC(probability=True)
    svm_clf.fit (X_train, y_train)
    print(svm_clf.score (X_test, y_test))
    
    return svm_clf

In [187]:
svm_clf = svmClassifier(X_train, y_train, X_test, y_test) 

0.8089887640449438


In [188]:
def naiveBayesClassifier(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a gaussian naive bayes classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    svm_clf -- Classifier, sklearn GaussianNB
    """
    nb_clf = GaussianNB()
    nb_clf.fit (X_train, y_train)
    print(nb_clf.score (X_test, y_test))
    
    return nb_clf

In [189]:
nb_clf = naiveBayesClassifier(X_train, y_train, X_test, y_test)

0.7303370786516854


In [190]:
def kNeighborsClassifier(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a k-nearest neighbors classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    svm_clf -- Classifier, sklearn KNeighborsClassifier
    """
    knn_clf = KNeighborsClassifier(n_neighbors=6)
    knn_clf.fit (X_train, y_train)
    print(knn_clf.score (X_test, y_test))
    
    return knn_clf

In [191]:
knn_clf = kNeighborsClassifier(X_train, y_train, X_test, y_test)

0.8370786516853933


In [192]:
def ensambleVotingClassifier(clfs, X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains an ensamble of classifiers which them vote together

    Arguments:
    clfs -- Classifiers with labels, List Tuple(String, clf)
    
    Returns:
    e_clf -- Classifier, sklearn VotingClassifier
    """
    
    e_clf = ske.VotingClassifier(estimators=clfs, voting='hard') # Hard voting where majority rules
    e_clf.fit (X_train, y_train)
    
    for label, clf in clfs:
        scores = cross_val_score(clf, X_test, y_test, cv=20, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))    
    
    scores = cross_val_score(e_clf, X_test, y_test, cv=20, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'Voting Ensamble')) 
    
    return e_clf    

In [193]:
clfs = ([('Decision Tree', dt_clf), ('Random Forest', rf_clf), ('Gradiant Boosting', gb_clf), 
         ('Logistic Regression', lr_clf), ('SVM', svm_clf), ('Naive Bayes', nb_clf), ('K-Nearest', knn_clf)])

e_clf = ensambleVotingClassifier(clfs, X_train, y_train, X_test, y_test)

Accuracy: 0.81 (+/- 0.10) [Decision Tree]
Accuracy: 0.80 (+/- 0.10) [Random Forest]
Accuracy: 0.78 (+/- 0.12) [Gradiant Boosting]
Accuracy: 0.80 (+/- 0.11) [Logistic Regression]
Accuracy: 0.81 (+/- 0.10) [SVM]
Accuracy: 0.76 (+/- 0.16) [Naive Bayes]
Accuracy: 0.79 (+/- 0.12) [K-Nearest]
Accuracy: 0.80 (+/- 0.12) [Voting Ensamble]


In [194]:
def predict(df, clf, export_path):
    """
    Description:
    Makes predictions X -> y and exports to csv

    Arguments:
    df -- Data to predict from, pandas DataFrame
    clf -- classifier, Classifier, sklearn classifier object
    export_path -- Path and name of file, String
        
    Returns:
    df_pred -- prediction, pandas DataFrame
    """
    
    # Extract Ids
    y1 = df['PassengerId'].values
    
    # Make predictions
    df_processed = preprocess_dataframe(df, prediction_data=True, print_info=False)  
    X = df_processed.values
    y2 = clf.predict(X)
    
    # Combine ids and predictions
    y = np.column_stack((y1, y2))
    
    # Restore pandas df
    df_pred = pd.DataFrame(y)
    df_pred.columns = ["PassengerId", "Survived"]
    
    # Export
    df_pred.to_csv(export_path, sep=',', index=False)
    
    return df_pred

In [195]:
df_pred = pd.read_csv('C:/GitHub/kaggle/titanic/data/test.csv', sep=',', header=0)
df_pred = predict(df_pred, rf_clf, 'C:/GitHub/kaggle/titanic/predictions/predictions_random_forest.csv')
df_pred.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
