In [219]:
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops
import pandas as pd
import sklearn as sk
import sklearn.ensemble as ske
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [220]:
# Removes a warning in sklearn that will be fixed during an update mid 2018
import warnings

if __name__ == '__main__':
    warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    le = sk.preprocessing.LabelEncoder()
    le.fit([1, 2, 2, 6])
    le.transform([1, 1, 2, 6])
    le.inverse_transform([0, 0, 1, 2])

In [246]:
df = pd.read_csv('C:/GitHub/kaggle/titanic/data/train.csv', sep=',', header=0)
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [222]:
def substrings_in_string(whole, subs): 
    
    for x in subs:
        if x in str(whole): 
            return x
        
    return 'Unknown'

In [223]:
def extract_title(full_name):
    
    full_name  = str(full_name)
    
    x = full_name.split(", ")
    x = x[1]
    x = x.split('.')
    x = x[0]
    
    return x

In [224]:
def simplify_titles(x):
    
    title=x['Title']
    
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Sir'
    elif title in ['the Countess', 'Mme', 'Dona']:
        return 'Lady'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title == 'Dr':
        if x['Sex'] == 0:
            return 'Mrs'
        else:
            return 'Mr'
    else:
        return title
    

In [225]:
def preprocess_dataframe(df, prediction_data=False, print_info=False):
    """
    Description:
    
    Performs preprocessing on the titanic data
    
    PassengerId - Id (Only available on training data)
    Survived    - Survived  (0 = No; 1 = Yes)
    Pclass      - Passenger Class  (1 = 1st; 2 = 2nd; 3 = 3rd)
    Name        - Name
    Sex         - Sex
    Age         - Age
    Sibsp       - Number of Siblings/Spouses Aboard
    Parch       - Number of Parents/Children Aboard
    Ticket      - Ticket Number
    Fare        - Passenger Fare (British pound)
    Cabin       - Cabin code
    Embarked    - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
    
    Arguments:
    df -- Dataset, Pandas DataFrame
    
    Returns:
    df -- Dataset, Pandas DataFrame
    """

    # Encode sex into binary (0 = male, 1 = female)
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 0})
    
    # Turning Cabin number into Deck
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck' ]= df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
    
    # Calculate family size
    #df['FamilySize'] = df['SibSp'] + df['Parch']
    
    df['Title']= df['Name'].map(lambda x: extract_title(x))
    if not prediction_data:
        unique_titles = df['Title'].unique()
        survival_by_title = df.groupby('Title').mean()['Survived']
        
    df['Title']=df.apply(simplify_titles, axis=1)
    if not prediction_data:
        unique_titles_simplified = df['Title'].unique()
        survival_by_title_simplified = df.groupby('Title').mean()['Survived']
    

    # Gather info on the significance of these classes for survival
    # Class was, as expected, significance for surival with the rates (1st - 63%, 2nd - 47%, 3rd - 24%)
    # Embarked was suprisingly significant, C - Cherbourg had 55% surivial rate when the mean was just 38%
    if not prediction_data:
        survival_by_plcass = df.groupby('Pclass').mean()['Survived']
        survival_by_deck = df.groupby('Deck').mean()['Survived']
        survival_by_embark = df.groupby('Embarked').mean()['Survived']
    
    # Split classes with one hot encoding
    # Pclass   - splits into (1 = Pclass_1, 2 = Pclass_2, 3 = Pclass_3)
    # Embarked - splits into (C = Embarked_C, Q = Embarked_Q, S = Embarked_S)
    # Deck - splits into decks with letters
    df = pd.get_dummies(df, columns = ['Pclass', 'Embarked', 'Deck', 'Title'])

    # Age has missing values which is replaced with average
    # Might also consider dividing age into classes of age brackets
    df['Age'].fillna((df['Age'].mean()), inplace = True)
    df['Fare'].fillna((df['Fare'].mean()), inplace = True)

    # Drop columns with data deemed not relevant for learning
    # Name     - Gender already has its' own column. Only thing that might be interesting here is the title
    # Ticket   - Ticket does not really say much, price and class are already included which says the most
    # Cabin    - Replaced by Deck
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1) #, 'SibSp', 'Parch'], axis=1)
    
    if prediction_data:
        m = len(df['Age'])
        df['Deck_T'] = pd.Series(np.zeros(m, dtype=int), index=df.index)
    
    # Normalize the data
    norm_vals = ['Age', 'Fare'] #, 'FamilySize']
    df[norm_vals] = (df[norm_vals] - df[norm_vals].min())/(df[norm_vals].max() - df[norm_vals].min())
    
    if print_info:
        if not prediction_data:
            print('--------------------------------------------------------------------------------------')
            print('SURVIVAL RATE')
            print('--------------------------------------------------------------------------------------')
            print('Overall survival rate: ' + str(df['Survived'].mean()))
            print()
            print(survival_by_plcass)
            print()
            print(survival_by_embark)
            print()
            print(survival_by_deck)
            print()
            print(survival_by_title)
            print()
            print(survival_by_title_simplified)
        print('--------------------------------------------------------------------------------------')
        print('TITLES')
        print('--------------------------------------------------------------------------------------')
        print('All titels: ')
        print(unique_titles)
        print()
        print('Simplied titels: ')
        print(unique_titles_simplified)
        print('--------------------------------------------------------------------------------------')
        print('SUMS')
        print('--------------------------------------------------------------------------------------')
        print(df.sum())
        print('--------------------------------------------------------------------------------------')
        print('DATA INFO')
        print('--------------------------------------------------------------------------------------')
        print(df.info())
        print('--------------------------------------------------------------------------------------')
        print('MISSING VALUES')
        print('--------------------------------------------------------------------------------------')
        print(df.isnull().sum())
        print('--------------------------------------------------------------------------------------')
        print('CORRELATIONS')
        print('--------------------------------------------------------------------------------------')
        print(df.corr())
        print('--------------------------------------------------------------------------------------')

    return df

In [226]:
def split_data(df):
    """
    Description:
    Splits the data into test/training set for simple validation

    Arguments:
    df -- Dataset, pandas dataframe
        
    Returns:
    train -- Training samples, pandas dataframe
    test -- Test samples, pandas dataframe
    """

    df_train = df.sample(frac = 0.8, random_state = 42)
    df_test = df.drop(df_train.index)  
    
    X_train = df_train.drop(['Survived'], axis=1).values
    y_train = df_train['Survived'].values
    
    X_test = df_test.drop(['Survived'], axis=1).values
    y_test = df_test['Survived'].values
    
    return X_train, y_train, X_test, y_test

In [227]:
df_processed = preprocess_dataframe(df, False, True)

X_train, y_train, X_test, y_test = split_data(df_processed)

print ("X_train shape: " + str(X_train.shape))
print ("y_train shape: " + str(y_train.shape))
print ("X_test shape: " + str(X_test.shape))
print ("y_test shape: " + str(y_test.shape))

df_processed.head(5)

--------------------------------------------------------------------------------------
SURVIVAL RATE
--------------------------------------------------------------------------------------
Overall survival rate: 0.3838383838383838

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

Deck
A          0.466667
B          0.744681
C          0.593220
D          0.757576
E          0.757576
F          0.583333
G          0.500000
T          0.000000
Unknown    0.299854
Name: Survived, dtype: float64

Title
Capt            0.000000
Col             0.500000
Don             0.000000
Dr              0.428571
Jonkheer        0.000000
Lady            1.000000
Major           0.500000
Master          0.575000
Miss            0.697802
Mlle            1.000000
Mme             1.000000
Mr              0.156673
Mrs             0.792000
Ms              1.000000
Rev             0.000000
Sir    

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,...,Deck_F,Deck_G,Deck_T,Deck_Unknown,Title_Lady,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Sir
0,0,0,0.271174,1,0,0.014151,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,1,1,0.472229,1,0,0.139136,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,1,1,0.321438,0,0,0.015469,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
3,1,1,0.434531,1,0,0.103644,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0.434531,0,0,0.015713,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0


In [228]:
def decision_tree_clf(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a decision tree classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    dt_clf -- Classifier, sklearn DecisionTreeClassifier
    """
    
    dt_clf = sk.tree.DecisionTreeClassifier(max_depth=20)
    dt_clf.fit (X_train, y_train)
    print(dt_clf.score (X_test, y_test))
    
    return dt_clf

In [229]:
dt_clf = decision_tree_clf(X_train, y_train, X_test, y_test)

0.7921348314606742


In [230]:
def random_forest_clf(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a random forest classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    rf_clf -- Classifier, sklearn RandomForestClassifier
    """
    
    rf_clf = ske.RandomForestClassifier(n_estimators=50)
    rf_clf.fit (X_train, y_train)
    print(rf_clf.score (X_test, y_test))
    
    return rf_clf

In [231]:
rf_clf = random_forest_clf(X_train, y_train, X_test, y_test) 

0.8370786516853933


In [232]:
def gradient_boosting_clf(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a gradient boosting classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    gb_clf -- Classifier, sklearn GradiantBoostingClassifier
    """
    
    gb_clf = ske.GradientBoostingClassifier(n_estimators=50)
    gb_clf.fit (X_train, y_train)
    print(gb_clf.score (X_test, y_test))
    
    return gb_clf

In [233]:
gb_clf = gradient_boosting_clf(X_train, y_train, X_test, y_test) 

0.8314606741573034


In [234]:
def logistic_regression_clf(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a logistic regression classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    rf_clf -- Classifier, sklearn LogisticRegression
    """
    
    lr_clf = LogisticRegression()
    lr_clf.fit (X_train, y_train)
    print(lr_clf.score (X_test, y_test))
    
    return lr_clf

In [235]:
lr_clf = logistic_regression_clf(X_train, y_train, X_test, y_test)

0.8202247191011236


In [236]:
def support_vector_machine_clf(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a support vector machine classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    svm_clf -- Classifier, sklearn SVC
    """
    
    svm_clf = sk.svm.SVC(probability=True)
    svm_clf.fit (X_train, y_train)
    print(svm_clf.score (X_test, y_test))
    
    return svm_clf

In [237]:
svm_clf = support_vector_machine_clf(X_train, y_train, X_test, y_test) 

0.8314606741573034


In [238]:
def naive_bayes_clf(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a gaussian naive bayes classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    svm_clf -- Classifier, sklearn GaussianNB
    """
    
    nb_clf = GaussianNB()
    nb_clf.fit (X_train, y_train)
    print(nb_clf.score (X_test, y_test))
    
    return nb_clf

In [239]:
nb_clf = naive_bayes_clf(X_train, y_train, X_test, y_test)

0.7303370786516854


In [240]:
def k_neighbors_clf(X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains a k-nearest neighbors classifier

    Arguments:
    X_train -- Training features,numpy matrix (m, 11)
    y_train -- Training features,numpy matrix (m, )
    X_test -- Test features,numpy matrix (m, 11)
    y_test -- Test features,numpy matrix (m, )
        
    Returns:
    svm_clf -- Classifier, sklearn KNeighborsClassifier
    """
    
    knn_clf = KNeighborsClassifier(n_neighbors=6)
    knn_clf.fit (X_train, y_train)
    print(knn_clf.score (X_test, y_test))
    
    return knn_clf

In [241]:
knn_clf = k_neighbors_clf(X_train, y_train, X_test, y_test)

0.8595505617977528


In [242]:
def ensamble_voting_clf(clfs, X_train, y_train, X_test, y_test):
    """
    Description:
    Builds and trains an ensamble of classifiers which them vote together

    Arguments:
    clfs -- Classifiers with labels, List Tuple(String, clf)
    
    Returns:
    e_clf -- Classifier, sklearn VotingClassifier
    """
    
    e_clf = ske.VotingClassifier(estimators=clfs, voting='hard') # Hard voting where majority rules
    e_clf.fit (X_train, y_train)
    
    for label, clf in clfs:
        scores = cross_val_score(clf, X_test, y_test, cv=20, scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))    
    
    scores = cross_val_score(e_clf, X_test, y_test, cv=20, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'Voting Ensamble')) 
    
    return e_clf    

In [243]:
clfs = ([('Decision Tree', dt_clf), ('Random Forest', rf_clf), ('Gradiant Boosting', gb_clf), 
         ('Logistic Regression', lr_clf), ('SVM', svm_clf), ('Naive Bayes', nb_clf), ('K-Nearest', knn_clf)])

e_clf = ensamble_voting_clf(clfs, X_train, y_train, X_test, y_test)

Accuracy: 0.78 (+/- 0.11) [Decision Tree]
Accuracy: 0.81 (+/- 0.10) [Random Forest]
Accuracy: 0.81 (+/- 0.12) [Gradiant Boosting]
Accuracy: 0.83 (+/- 0.09) [Logistic Regression]
Accuracy: 0.82 (+/- 0.12) [SVM]
Accuracy: 0.74 (+/- 0.11) [Naive Bayes]
Accuracy: 0.79 (+/- 0.12) [K-Nearest]
Accuracy: 0.82 (+/- 0.08) [Voting Ensamble]


In [244]:
def predict(df, clf, export_path):
    """
    Description:
    Makes predictions X -> y and exports to csv

    Arguments:
    df -- Data to predict from, pandas DataFrame
    clf -- classifier, Classifier, sklearn classifier object
    export_path -- Path and name of file, String
        
    Returns:
    df_pred -- prediction, pandas DataFrame
    """
    
    # Extract Ids
    y1 = df['PassengerId'].values
    
    # Make predictions
    df_process = preprocess_dataframe(df, prediction_data=True, print_info=False) 
    X = df_process.values
    y2 = clf.predict(X)
    
    # Combine ids and predictions
    y = np.column_stack((y1, y2))
    
    # Restore pandas df
    df_pred = pd.DataFrame(y)
    df_pred.columns = ["PassengerId", "Survived"]
    
    # Export
    df_pred.to_csv(export_path, sep=',', index=False)
    
    return df_pred

In [245]:
df_pred = pd.read_csv('C:/GitHub/kaggle/titanic/data/test.csv', sep=',', header=0)
df_pred = predict(df_pred, lr_clf, 'C:/GitHub/kaggle/titanic/predictions/predictions_logistic_regression.csv')
df_pred.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
