# Titanic Data Analysis

### Import Data

In [1]:
import pandas as pd
import pylab as P
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error

data = pd.read_csv('Data/titanic_full.csv', header=0)

### Data Cleaning

In [2]:
data['nSex'] = data['sex'].map({'female':0, 'male':1}).astype(int)
data['nEmbarked'] = data['embarked'].map({'S':0, 'C':1, 'Q':2})
data.loc[(data.nEmbarked.isnull())] = 0
data['nFamilySize'] = data['sibsp'] + data['parch']
median_ages = np.zeros((2,3))
for i in range(0,2):
	for j in range(0,3):
		median_ages[i,j] = data[(data['nSex'] == i) & (data['pclass'] == j+1)]['age'].dropna().median()
data['nAge'] = data['age']
for i in range(0,2):
	for j in range(0,3):
		data.loc[(data.age.isnull()) & (data.nSex == i) & (data.pclass == j+1),'nAge'] = median_ages[i,j]
median_fare = np.zeros((1,3))
for i in range(0,1):
	for j in range(0,3):
		median_fare[i,j] = data[(data['pclass'] == j+1) & (data['fare'] != 0)]['age'].median()
data['nFare'] = data['fare']
for i in range(0,1):
	for j in range(0,3):
		data.loc[((data.fare.isnull()) | (data.fare == 0)) & (data.pclass == j+1),'nFare'] = median_fare[i,j]

### Creating Model 1 (Random Forrest)

In [3]:
model1traindata = data[['survived','nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model1traindata = model1traindata.astype(int)
model1traindata = model1traindata.values

model1testdata = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model1testdata = model1testdata.astype(int)
model1testdata = model1testdata.values

model1forest = RandomForestClassifier(n_estimators = 1300)
model1forest = model1forest.fit(model1traindata[0::,1::],model1traindata[0::,0])
model1output = pd.DataFrame(model1forest.predict(model1testdata))
model1output['nSex'] = data[['nSex']]
model1output['nEmbarked'] = data[['nEmbarked']]
model1output['pclass'] = data[['pclass']]

 ### Creating Model 2 (Logistic Regression)

In [4]:
model2traindataX = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model2traindataX = model2traindataX.astype(int)
model2traindataX = model2traindataX.values
model2traindatay = data[['survived']]
model2traindatay = np.ravel(model2traindatay)

model2testdata = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model2testdata = model2testdata.astype(int)
model2testdata = model2testdata.values

model2LogisticRegression = LogisticRegression()
model2LogisticRegression = model2LogisticRegression.fit(model2traindataX,model2traindatay)
model2output = pd.DataFrame(model2LogisticRegression.predict(model2testdata))
model2output['nSex'] = data[['nSex']]
model2output['nEmbarked'] = data[['nEmbarked']]
model2output['pclass'] = data[['pclass']]

### Creating Model 3 (Support Vector Machines)  

In [5]:
model3traindataX = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model3traindataX = model3traindataX.astype(int)
model3traindataX = model3traindataX.values
model3traindatay = data[['survived']]
model3traindatay = np.ravel(model3traindatay)

model3testdata = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model3testdata = model3testdata.astype(int)
model3testdata = model3testdata.values

model3LogisticRegression = SVC()
model3LogisticRegression = model3LogisticRegression.fit(model3traindataX,model3traindatay)
model3output = pd.DataFrame(model3LogisticRegression.predict(model2testdata))
model3output['nSex'] = data[['nSex']]
model3output['nEmbarked'] = data[['nEmbarked']]
model3output['pclass'] = data[['pclass']]

### Model Evaluation

In [6]:
actual = data[['survived']]

model1 = model1output[0]
model2 = model2output[0]
model3 = model3output[0]
ratioTable = pd.DataFrame({'model1': pd.Series([float(0)], index=['Accuracy','F1 Binary','Mean Absolute Error'])})
ratioTable.model1[0] = accuracy_score(actual, model1)
ratioTable.model1[1] = f1_score(actual, model1, average='binary')
ratioTable.model1[2] = mean_absolute_error(actual, model1)
ratioTable['model2'] = float(0)
ratioTable.model2[0] = accuracy_score(actual, model2)
ratioTable.model2[1] = f1_score(actual, model2, average='binary')
ratioTable.model2[2] = mean_absolute_error(actual, model2)
ratioTable['model3'] = float(0)
ratioTable.model3[0] = accuracy_score(actual, model3)
ratioTable.model3[1] = f1_score(actual, model3, average='binary')
ratioTable.model3[2] = mean_absolute_error(actual, model3)

print ratioTable

                       model1    model2    model3
Accuracy             0.945760  0.791444  0.881589
F1 Binary            0.927179  0.711111  0.829857
Mean Absolute Error  0.054240  0.208556  0.118411


### Confusion Matrix

In [7]:
print 'Model 1 Confusion Matrix'
print confusion_matrix(actual['survived'], model1output[0])
print 'Model 2 Confusion Matrix'
print confusion_matrix(actual['survived'], model2output[0])
print 'Model 3 Confusion Matrix'
print confusion_matrix(actual['survived'], model3output[0])

Model 1 Confusion Matrix
[[786  25]
 [ 46 452]]
Model 2 Confusion Matrix
[[700 111]
 [162 336]]
Model 3 Confusion Matrix
[[776  35]
 [120 378]]


### Print to File

In [8]:
predictions_file = open("model1.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model1output))
predictions_file.close()

predictions_file = open("model2.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model2output))
predictions_file.close()

predictions_file = open("model3.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model3output))
predictions_file.close()