In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [None]:
# Read the Titanic train and test CSV files.
#main_df = pd.read_csv('../input/titanicdataset-traincsv/train.csv')
test_df = pd.read_csv('../input/titanic/test.csv')
main_df = pd.read_csv('../input/titanic/train.csv')

# NOTE: plain assignment binds a second name to the same DataFrame object;
# no copy is made here.
unmodified_df = main_df

# Show five random training rows.
main_df.sample(5)

In [None]:
# FORMATTING TRAINING SET
# Replace the training column labels positionally with the names used by the
# rest of the notebook.
main_df.columns = ['Passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
                   'parch', 'ticket', 'fare', 'cabin', 'embarked']

# FORMATTING TESTING SET
# Same labels as above, minus the 'survived' target column.
test_df.columns = ['Passengerid', 'pclass', 'name', 'sex', 'age', 'sibsp',
                   'parch', 'ticket', 'fare', 'cabin', 'embarked']

# Snapshot the renamed-but-otherwise-untouched training data. A plain
# `unmodified_df = main_df` only aliases the frame, so every column added to
# or overwritten in main_df later would also show up in the "unmodified"
# baseline. The copy is taken after the rename so the baseline still exposes
# the 'survived' label.
unmodified_df = main_df.copy()

main_df.info()

In [None]:
# Keywords behind each 0/1 title indicator column. Names are lower-cased
# before matching, so the keywords are lower-case too.
title_keywords = {
    'contains_mr': ['mr', 'mister'],
    'contains_mrs': ['mrs'],
    'contains_ms': ['ms', 'miss', 'mlle', 'mme'],
    'contains_master': ['master'],
    'contains_sir': ['sir', 'jonkheer', 'col', 'major', 'don'],
    'contains_rev': ['rev', 'reverend'],
    'contains_lady': ['lady', 'dona', 'the countess'],
    'contains_dr': ['dr', 'doctor'],
    'contains_col': ['col'],
}

for dataframe in [main_df, test_df]:
    lowered_names = dataframe['name'].str.lower()

    for column, keywords in title_keywords.items():
        # Match whole words only. A bare substring search makes 'mr' fire on
        # every "mrs" and lets short keywords such as 'don', 'col' or 'dr'
        # match inside ordinary names. The group is non-capturing so pandas
        # does not warn about unused match groups.
        pattern = r'\b(?:' + '|'.join(keywords) + r')\b'

        # Cast the boolean mask straight to 0/1. A LabelEncoder fitted per
        # frame would map an all-True column to 0, giving train and test
        # different encodings. na=False treats a missing name as "no title".
        dataframe[column] = lowered_names.str.contains(pattern, na=False).astype(int)

In [None]:
# Display the raw 'cabin' column of the training frame.
main_df.loc[:, 'cabin']

In [None]:
# Print the training frame's column summary (dtypes and non-null counts).
main_df.info()

In [None]:
# Feature Engineering
frames = [main_df, test_df]

for dataset in frames:

    # Joining all family members together (sibsp + parch, plus 1).
    dataset['family'] = dataset['sibsp'] + dataset['parch'] + 1

    # Flag missing ages BEFORE imputing. Comparing the filled column with a
    # hard-coded 28.01 only works when this frame's median age happens to be
    # 28, so any frame with a different median would get an all-zero flag.
    dataset['missing_age'] = dataset['age'].isna().astype(int)
    dataset['age'] = dataset['age'].fillna(dataset['age'].median())

    # Fill missing fares with this frame's median fare.
    dataset['fare'] = dataset['fare'].fillna(dataset['fare'].median())

    # First character of the cabin string; 'x' stands in for a missing cabin.
    dataset['cabin_pp'] = dataset['cabin'].fillna('x').str[0]
    dataset['missing_cabin'] = (dataset['cabin_pp'] == 'x').astype(int)

    dataset['embarked'] = dataset['embarked'].fillna('S')

    # 1 when the ticket is exactly '1601', otherwise 0.
    dataset['ticket_cn'] = (dataset['ticket'] == '1601').astype(int)

    # Family-size buckets as 0/1 indicators.
    # NOTE(review): a family of exactly 4 falls into none of these buckets;
    # confirm that gap is intended.
    dataset['no_fam'] = (dataset['family'] == 1).astype(int)
    dataset['fam_less_than_4'] = ((dataset['family'] > 1) & (dataset['family'] < 4)).astype(int)
    dataset['fam_greater_than_4'] = (dataset['family'] > 4).astype(int)

# Encode the string categoricals with ONE encoder per column, fitted on the
# values of both frames. Fitting a separate encoder per frame assigns codes by
# each frame's own sorted categories, so the same category (for example the
# missing-cabin marker 'x') can receive different integers in train and test
# whenever the two frames do not contain exactly the same set of values.
for column in ['sex', 'embarked', 'cabin_pp']:
    encoder = LabelEncoder().fit(pd.concat([frame[column] for frame in frames]))
    for dataset in frames:
        dataset[column] = encoder.transform(dataset[column])

main_df.sample(5)

In [None]:
# Correlate the numeric columns only. main_df still carries string columns
# ('name', 'ticket', 'cabin'); DataFrame.corr() silently skipped those on old
# pandas but raises on pandas >= 2.0, where numeric_only defaults to False.
# Selecting numeric dtypes first behaves the same on every pandas version.
corr = main_df.select_dtypes('number').corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Show five random rows of the test frame.
test_df.sample(5)

In [None]:
# Columns left out of the model inputs:
#   - 'Passengerid' is a row identifier rather than a passenger attribute, so
#     a model fitted on it can only learn row order;
#   - 'sibsp' / 'parch' are already combined into 'family';
#   - 'name', 'cabin', 'ticket' are raw strings the estimators cannot consume.
# (The previous `for dataset in [...]: dataset = dataset.select_dtypes(...)`
# loop only rebound its loop variable and never changed either frame, so it
# has been removed.)
dropped_columns = ['Passengerid', 'sibsp', 'parch', 'name', 'cabin', 'ticket']

y = main_df['survived']
X = main_df.drop(columns=['survived'] + dropped_columns)
test_df_sample = test_df.drop(columns=dropped_columns)

# The scaler and models work on positional arrays, so train and test must
# expose the same features in the same order, with no missing values left.
assert list(X.columns) == list(test_df_sample.columns), "train/test feature columns differ"
assert not X.isna().any().any(), "training features still contain NaN"
assert not test_df_sample.isna().any().any(), "test features still contain NaN"

# TRAIN TEST SPLIT

In [None]:
# Hold out 33% of the labelled rows for evaluation; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=(2**32 - 1),
)

In [None]:
# Feature magnitudes matter to several of the models below, so standardize:
# learn the scaling from the training split only, then apply that same
# transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# DecisionTreeClassifier

In [None]:
# Depth-3 decision tree using the entropy criterion, scored on the held-out split.
dtc = DecisionTreeClassifier(
    max_depth=3,
    criterion='entropy',
)
dtc.fit(X_train, y_train)
pred_dtc = dtc.predict(X_test)

# Let's see how it performed!
accuracy = dtc.score(X_test, y_test)
print(classification_report(y_test, pred_dtc))
print(confusion_matrix(y_test, pred_dtc))
print(f'Decision Tree Classifier Accuracy: {accuracy}')

# Random Forest Classifier

In [None]:
# 200-tree random forest using the entropy criterion. random_state is fixed so
# the reported metrics and the predictions are repeatable between runs.
rfc = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42) #best for medium-sized datasets # max_depth=10
rfc.fit(X_train, y_train)

# TEST THE TRAINING DATA
pred_rfc_train = rfc.predict(X_train)
pred_rfc = rfc.predict(X_test)

# The forest was fitted on standardized features, so the unlabeled test
# features must pass through the same fitted scaler before prediction;
# feeding them in raw compares unscaled values against scaled split points.
# Assumes test_df_sample still holds the unscaled features at this point
# (true on a top-to-bottom run).
pred_rfc_final = rfc.predict(sc.transform(test_df_sample))

# Let's see how it performed!
print(classification_report(y_test, pred_rfc))
print(confusion_matrix(y_test, pred_rfc))
accuracy = rfc.score(X_test, y_test)
print(f'Random Forest Classifier Accuracy: {accuracy}')

In [None]:
# Predict on the unlabeled test set. The forest was trained on standardized
# features, so the test features go through the same fitted scaler first.
# Assumes test_df_sample still holds the unscaled features at this point
# (true on a top-to-bottom run).
pred_rfc_final = rfc.predict(sc.transform(test_df_sample))
submission = pd.DataFrame({'PassengerId': test_df['Passengerid'], 'Survived': pred_rfc_final})

# index=False keeps the file to the two declared columns; by default pandas
# would also write the row index as an extra unnamed first column.
submission.to_csv('./horrigan_submission_rfc_v2.csv', index=False)

In [None]:
# Display the most recently built submission frame.
submission

# Logistic Regression

In [None]:
# Logistic regression with default settings, fitted on the standardized split.
lrc = LogisticRegression()
lrc.fit(X_train, y_train)
pred_lrc = lrc.predict(X_test)

# The model was fitted on standardized features, so the unlabeled test
# features must pass through the same fitted scaler before prediction.
# Assumes test_df_sample still holds the unscaled features at this point
# (true on a top-to-bottom run).
pred_lrc_final = lrc.predict(sc.transform(test_df_sample))

# Let's see how it performed!
print(classification_report(y_test, pred_lrc))
print(confusion_matrix(y_test, pred_lrc))
accuracy = lrc.score(X_test, y_test)
print(f'Logistic Regression Classifier Accuracy: {accuracy}')

submission = pd.DataFrame({'PassengerId': test_df['Passengerid'], 'Survived': pred_lrc_final})
# index=False keeps the file to the two declared columns; by default pandas
# would also write the row index as an extra unnamed first column.
submission.to_csv('./horrigan_submission_lrc.csv', index=False)

# SVM Classifier

In [None]:
# Support vector classifier with default settings.
# The estimator is bound to `svc` rather than `svm`: assigning it to `svm`
# overwrote the imported sklearn `svm` module, so running the cell a second
# time failed on `svm.SVC()`.
svc = SVC() # best on smaller numbers
svc.fit(X_train, y_train)
pred_svm = svc.predict(X_test)

# Scale the unlabeled test features into a separate variable. Rebinding
# test_df_sample to its scaled version would scale it again on every re-run.
test_scaled = sc.transform(test_df_sample)
pred_svm_final = svc.predict(test_scaled)

submission = pd.DataFrame({'PassengerId': test_df['Passengerid'], 'Survived': pred_svm_final})
# index=False keeps the file to the two declared columns; by default pandas
# would also write the row index as an extra unnamed first column.
submission.to_csv('./horrigan_submission_svm.csv', index=False)

print(classification_report(y_test, pred_svm))
print(confusion_matrix(y_test, pred_svm))
accuracy = svc.score(X_test, y_test)
print(f'Support Vector Machine Classifier Accuracy: {accuracy}')

# Neural Networks

In [None]:
# Small multi-layer perceptron: three hidden layers of five units each.
# random_state is fixed because the network's fit is stochastic; without a
# seed the reported accuracy changes from run to run.
mlpc = MLPClassifier(hidden_layer_sizes=(5,5,5), max_iter=500, random_state=42)
mlpc.fit(X_train, y_train)
pred_mlpc = mlpc.predict(X_test)
print(classification_report(y_test, pred_mlpc))
print(confusion_matrix(y_test, pred_mlpc))
accuracy = mlpc.score(X_test, y_test)
print(f'Neural Network Classifier Accuracy: {accuracy}')

In [None]:
from sklearn.metrics import accuracy_score

# Held-out accuracy of each fitted classifier, computed from the predictions
# made in the cells above.
svm_score, rfc_score, mlpc_score, dtc_score = (
    accuracy_score(y_test, predictions)
    for predictions in (pred_svm, pred_rfc, pred_mlpc, pred_dtc)
)

# Side-by-side summary.
for label, score in [
    ('Support Vector Machine Classifier', svm_score),
    ('Random Forest Classifier', rfc_score),
    ('Neural Network Classifier', mlpc_score),
    ('Decision Tree Classifier', dtc_score),
]:
    print(f'{label}: {score}')

In [None]:
import warnings


def evaluate_models(frame, label, estimators):
    """Fit every estimator on one dataset and return its held-out accuracy.

    The frame is reduced to its numeric columns, rows containing NaN are
    dropped, the rows are split 67/33 with a fixed seed, and the features are
    standardized with a scaler fitted on the training part only.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing the target column and the candidate features.
    label : str
        Name of the target column.
    estimators : list of (str, estimator) tuples
        Display name and scikit-learn estimator; each estimator is (re)fitted
        in place.

    Returns
    -------
    list of (str, float)
        (display name, accuracy on the held-out part), in input order.
    """
    numeric = frame.select_dtypes('number').dropna()
    targets = np.array(numeric[label])
    features = np.array(numeric.drop(label, axis=1))

    # Everything stays local to this function so the comparison does not
    # overwrite the notebook-level X / y / X_train / X_test / sc variables
    # used by the cells above.
    feat_train, feat_test, tgt_train, tgt_test = train_test_split(
        features, targets, test_size=0.33, random_state=(2**32 - 1))
    scaler = StandardScaler()
    feat_train = scaler.fit_transform(feat_train)
    feat_test = scaler.transform(feat_test)

    results = []
    # Silence warnings only while fitting, instead of switching them off for
    # the rest of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for model_name, model in estimators:
            model.fit(feat_train, tgt_train)
            results.append((model_name, model.score(feat_test, tgt_test)))
    return results


# Stochastic estimators get a fixed random_state so the comparison is repeatable.
models = [
    ('Logistic Regression Classifier:', LogisticRegression()),
    #('Naive Bayes Gaussian NB:', GaussianNB()),
    ('Support Vector Machine Classifier:', SVC()),
    ('KNeighbors Classifier:', KNeighborsClassifier()),
    ('Decision Tree Classifier:', DecisionTreeClassifier(random_state=42)),
    ('Neural Network Classifier', MLPClassifier(hidden_layer_sizes=(10,10,10), max_iter=600, random_state=42)),
    ('Random Forest Classifier', RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42)),]

# NOTE(review): this comparison is only meaningful if unmodified_df is an
# independent copy taken before feature engineering -- verify where it is
# created.
for eval_name, eval_frame, eval_label in [('UNMODIFIED', unmodified_df, 'survived'), ('FEATURE ENGINEERED SET', main_df, 'survived')]:
    print(eval_name)
    for model_name, model_accuracy in evaluate_models(eval_frame, eval_label, models):
        print(model_name, model_accuracy)

    print('------ BREAK -------')

In [None]:
#model = RandomForestClassifier(n_estimators=100,)

#n_estim = range(100,1000,100)
#criterion = ['entropy','gini']

#param_grid = {"n_estimators" :n_estim,'criterion':criterion}
#model_rfc = GridSearchCV(model,param_grid = param_grid, cv=5, scoring="accuracy", n_jobs=4, verbose = 1)
#model_rfc.fit(X_train,y_train)

# Best score
#print(model_rfc.best_score_)

#best estimator
#model_rfc.best_params_

In [None]:
#model_rfc.best_params_
# Read the SVM submission file back from disk and display it.
df = pd.read_csv('./horrigan_submission_svm.csv')
df

In [None]:
# When NA values are filled, add an extra indicator column recording that the value was filled.
# Use actual one-hot encoding (OHE).
# Tree-based feature engineering can introduce more noise.