# Titanic Data Analysis

### Import Data

In [1]:
import pandas as pd
import pylab as P
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

data = pd.read_csv('Data/titanic_full.csv', header=0)

### Data Cleaning

In [2]:
data['nSex'] = data['sex'].map({'female':0, 'male':1}).astype(int)
data['nEmbarked'] = data['embarked'].map({'S':0, 'C':1, 'Q':2})
data.loc[(data.nEmbarked.isnull())] = 0
data['nFamilySize'] = data['sibsp'] + data['parch']
median_ages = np.zeros((2,3))
for i in range(0,2):
	for j in range(0,3):
		median_ages[i,j] = data[(data['nSex'] == i) & (data['pclass'] == j+1)]['age'].dropna().median()
data['nAge'] = data['age']
for i in range(0,2):
	for j in range(0,3):
		data.loc[(data.age.isnull()) & (data.nSex == i) & (data.pclass == j+1),'nAge'] = median_ages[i,j]
median_fare = np.zeros((1,3))
for i in range(0,1):
	for j in range(0,3):
		median_fare[i,j] = data[(data['pclass'] == j+1) & (data['fare'] != 0)]['age'].median()
data['nFare'] = data['fare']
for i in range(0,1):
	for j in range(0,3):
		data.loc[((data.fare.isnull()) | (data.fare == 0)) & (data.pclass == j+1),'nFare'] = median_fare[i,j]

### Getting Data for Cross Validation

In [3]:
actual = data[['survived','nSex','nEmbarked','pclass']]
ratioTable = pd.DataFrame({'actual': pd.Series([float(0)], index=['overall','men','women','class 1','class 2','class 3','embarked S', 'embarked C', 'embarked Q'])})
ratioTable.actual[0] = float(data.survived.sum())/data.shape[0]
ratioTable.actual[1] = float(data[(data['nSex'] == 1)]['survived'].sum())/data.shape[0]
ratioTable.actual[2] = float(data[(data['nSex'] == 0)]['survived'].sum())/data.shape[0]
ratioTable.actual[3] = float(data[(data['pclass'] == 1)]['survived'].sum())/data.shape[0]
ratioTable.actual[4] = float(data[(data['pclass'] == 2)]['survived'].sum())/data.shape[0]
ratioTable.actual[5] = float(data[(data['pclass'] == 3)]['survived'].sum())/data.shape[0]
ratioTable.actual[6] = float(data[(data['nEmbarked'] == 0)]['survived'].sum())/data.shape[0]
ratioTable.actual[7] = float(data[(data['nEmbarked'] == 1)]['survived'].sum())/data.shape[0]
ratioTable.actual[8] = float(data[(data['nEmbarked'] == 2)]['survived'].sum())/data.shape[0]

### Creating Model 1 (Random Forrest)

In [4]:
model1traindata = data[['survived','nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model1traindata = model1traindata.astype(int)
model1traindata = model1traindata.values

model1testdata = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model1testdata = model1testdata.astype(int)
model1testdata = model1testdata.values

model1forest = RandomForestClassifier(n_estimators = 1300)
model1forest = model1forest.fit(model1traindata[0::,1::],model1traindata[0::,0])
model1output = pd.DataFrame(model1forest.predict(model1testdata))
model1output['nSex'] = data[['nSex']]
model1output['nEmbarked'] = data[['nEmbarked']]
model1output['pclass'] = data[['pclass']]

 ### Creating Model 2 (Logistic Regression)

In [5]:
model2traindataX = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model2traindataX = model2traindataX.astype(int)
model2traindataX = model2traindataX.values
model2traindatay = data[['survived']]
model2traindatay = np.ravel(model2traindatay)

model2testdata = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model2testdata = model2testdata.astype(int)
model2testdata = model2testdata.values

model2LogisticRegression = LogisticRegression()
model2LogisticRegression = model2LogisticRegression.fit(model2traindataX,model2traindatay)
model2output = pd.DataFrame(model2LogisticRegression.predict(model2testdata))
model2output['nSex'] = data[['nSex']]
model2output['nEmbarked'] = data[['nEmbarked']]
model2output['pclass'] = data[['pclass']]

### Creating Model 3 (Support Vector Machines)  

In [6]:
model3traindataX = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model3traindataX = model3traindataX.astype(int)
model3traindataX = model3traindataX.values
model3traindatay = data[['survived']]
model3traindatay = np.ravel(model3traindatay)

model3testdata = data[['nSex','nEmbarked','nFamilySize','nAge','nFare','pclass']]
model3testdata = model3testdata.astype(int)
model3testdata = model3testdata.values

model3LogisticRegression = SVC()
model3LogisticRegression = model3LogisticRegression.fit(model3traindataX,model3traindatay)
model3output = pd.DataFrame(model3LogisticRegression.predict(model2testdata))
model3output['nSex'] = data[['nSex']]
model3output['nEmbarked'] = data[['nEmbarked']]
model3output['pclass'] = data[['pclass']]

### Cross Validation

In [7]:
ratioTable['model1'] = float(0)
ratioTable.model1[0] = float(model1output[0].sum())/model1output.shape[0]
ratioTable.model1[1] = float(model1output[(model1output['nSex'] == 1)][0].sum())/model1output.shape[0]
ratioTable.model1[2] = float(model1output[(model1output['nSex'] == 0)][0].sum())/model1output.shape[0]
ratioTable.model1[3] = float(model1output[(model1output['pclass'] == 1)][0].sum())/model1output.shape[0]
ratioTable.model1[4] = float(model1output[(model1output['pclass'] == 2)][0].sum())/model1output.shape[0]
ratioTable.model1[5] = float(model1output[(model1output['pclass'] == 3)][0].sum())/model1output.shape[0]
ratioTable.model1[6] = float(model1output[(model1output['nEmbarked'] == 0)][0].sum())/model1output.shape[0]
ratioTable.model1[7] = float(model1output[(model1output['nEmbarked'] == 1)][0].sum())/model1output.shape[0]
ratioTable.model1[8] = float(model1output[(model1output['nEmbarked'] == 2)][0].sum())/model1output.shape[0]

ratioTable['model2'] = float(0)
ratioTable.model2[0] = float(model2output[0].sum())/model2output.shape[0]
ratioTable.model2[1] = float(model2output[(model2output['nSex'] == 1)][0].sum())/model2output.shape[0]
ratioTable.model2[2] = float(model2output[(model2output['nSex'] == 0)][0].sum())/model2output.shape[0]
ratioTable.model2[3] = float(model2output[(model2output['pclass'] == 1)][0].sum())/model2output.shape[0]
ratioTable.model2[4] = float(model2output[(model2output['pclass'] == 2)][0].sum())/model2output.shape[0]
ratioTable.model2[5] = float(model2output[(model2output['pclass'] == 3)][0].sum())/model2output.shape[0]
ratioTable.model2[6] = float(model2output[(model2output['nEmbarked'] == 0)][0].sum())/model2output.shape[0]
ratioTable.model2[7] = float(model2output[(model2output['nEmbarked'] == 1)][0].sum())/model2output.shape[0]
ratioTable.model2[8] = float(model2output[(model2output['nEmbarked'] == 2)][0].sum())/model2output.shape[0]

ratioTable['model3'] = float(0)
ratioTable.model3[0] = float(model3output[0].sum())/model3output.shape[0]
ratioTable.model3[1] = float(model3output[(model3output['nSex'] == 1)][0].sum())/model3output.shape[0]
ratioTable.model3[2] = float(model3output[(model3output['nSex'] == 0)][0].sum())/model3output.shape[0]
ratioTable.model3[3] = float(model3output[(model3output['pclass'] == 1)][0].sum())/model3output.shape[0]
ratioTable.model3[4] = float(model3output[(model3output['pclass'] == 2)][0].sum())/model3output.shape[0]
ratioTable.model3[5] = float(model3output[(model3output['pclass'] == 3)][0].sum())/model3output.shape[0]
ratioTable.model3[6] = float(model3output[(model3output['nEmbarked'] == 0)][0].sum())/model3output.shape[0]
ratioTable.model3[7] = float(model3output[(model3output['nEmbarked'] == 1)][0].sum())/model3output.shape[0]
ratioTable.model3[8] = float(model3output[(model3output['nEmbarked'] == 2)][0].sum())/model3output.shape[0]

print ratioTable

              actual    model1    model2    model3
overall     0.380443  0.361345  0.341482  0.315508
men         0.122995  0.091673  0.017571  0.072574
women       0.257448  0.269672  0.323911  0.242934
class 1     0.151261  0.146677  0.126050  0.145913
class 2     0.090909  0.090909  0.080214  0.074102
class 3     0.138273  0.123759  0.133690  0.095493
embarked S  0.232238  0.212376  0.198625  0.177998
embarked C  0.114591  0.112299  0.097785  0.103896
embarked Q  0.033613  0.036669  0.045073  0.033613


### Confusion Matrix

In [8]:
print 'Model 1 Confusion Matrix'
print confusion_matrix(actual['survived'], model1output[0])
print 'Model 2 Confusion Matrix'
print confusion_matrix(actual['survived'], model2output[0])
print 'Model 3 Confusion Matrix'
print confusion_matrix(actual['survived'], model3output[0])

Model 1 Confusion Matrix
[[788  23]
 [ 48 450]]
Model 2 Confusion Matrix
[[700 111]
 [162 336]]
Model 3 Confusion Matrix
[[776  35]
 [120 378]]


### Print to File

In [9]:
predictions_file = open("model1.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model1output))
predictions_file.close()

predictions_file = open("model2.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model2output))
predictions_file.close()

predictions_file = open("model3.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model3output))
predictions_file.close()