# Titanic Data Analysis

### Import Data

In [1]:
# Import Libraries
import pandas as pd
import pylab as P
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt

### Data Cleaning

In [2]:
# Importing Titanic Data
data = pd.read_csv('Data/titanic_full.csv', header=0)

# Converting to numbers - Sex, Family Size, and Embarked
data['nSex'] = data['sex'].map({'female':0, 'male':1})
dummies = pd.get_dummies(data['embarked']).astype(float)
data = pd.concat([data, dummies], axis=1)
data['nFamilySize'] = data['sibsp'] + data['parch']

# To Fill in Age according to Sex and Class
median_ages = np.zeros((2,3))
for i in range(0,2):
	for j in range(0,3):
		median_ages[i,j] = data[(data['nSex'] == i) & (data['pclass'] == j+1)]['age'].dropna().median()
data['nAge'] = data['age']
for i in range(0,2):
	for j in range(0,3):
		data.loc[(data.age.isnull()) & (data.nSex == i) & (data.pclass == j+1),'nAge'] = median_ages[i,j]
median_fare = np.zeros((1,3))

# To Fill in Fare according to Class
for i in range(0,1):
	for j in range(0,3):
		median_fare[i,j] = data[(data['pclass'] == j+1) & (data['fare'] != 0)]['age'].median()
data['nFare'] = data['fare']
for i in range(0,1):
	for j in range(0,3):
		data.loc[((data.fare.isnull()) | (data.fare == 0)) & (data.pclass == j+1),'nFare'] = median_fare[i,j]

### Creating Testing Data

In [3]:
#Comment 
#Split up data set (70 Train, 30 Split - sklearn train test split)
train, test = train_test_split( data, test_size=0.3, random_state=42)

traindata = train[['survived','nSex','S', 'C', 'Q','nFamilySize','nAge','nFare','pclass']]
traindata = traindata.values
traindataX = train[['nSex','S', 'C', 'Q','nFamilySize','nAge','nFare','pclass']]
traindataX = traindataX.values
traindataY = train[['survived']]
traindataY = np.ravel(traindataY)

testdataF = pd.DataFrame(test[['nSex','S', 'C', 'Q','nFamilySize','nAge','nFare','pclass']])
testdata = testdataF.values

### Creating Model 1 (Random Forest)

In [4]:
#Comment 
#Split up data set (70 Train, 30 Split - sklearn train test split)
forest = RandomForestClassifier(n_estimators = 1300)
forest = forest.fit(traindata[0::,1::],traindata[0::,0])
forestOutput = pd.DataFrame(forest.predict(testdata))

 ### Creating Model 2 (Logistic Regression)

In [5]:
logisticRegression = LogisticRegression()
logisticRegression = logisticRegression.fit(traindataX,traindataY)
logisticRegressionOutput = pd.DataFrame(logisticRegression.predict(testdata))

### Creating Model 3 (Support Vector Machines)  

In [6]:
supportVC = SVC()
supportVC = supportVC.fit(traindataX,traindataY)
supportVCOutput = pd.DataFrame(supportVC.predict(testdata))

### Model Evaluation

In [7]:
# Saving the True Surviving Data
actual = test[['survived']]

# Tale showing the different evaluations of each Model
model1 = forestOutput[0]
model2 = logisticRegressionOutput[0]
model3 = supportVCOutput[0]
ratioTable = pd.DataFrame({'Random_Forrest': pd.Series([float(0)], index=['Accuracy','F1 Binary','Mean Absolute Error', 'R^2'])})
ratioTable.Random_Forrest[0] = accuracy_score(actual, model1)
ratioTable.Random_Forrest[1] = f1_score(actual, model1, average='binary')
ratioTable.Random_Forrest[2] = mean_absolute_error(actual, model1)
ratioTable.Random_Forrest[3] = r2_score(actual, model1)
ratioTable['Logistic_Regression'] = float(0)
ratioTable.Logistic_Regression[0] = accuracy_score(actual, model2)
ratioTable.Logistic_Regression[1] = f1_score(actual, model2, average='binary')
ratioTable.Logistic_Regression[2] = mean_absolute_error(actual, model2)
ratioTable.Logistic_Regression[3] = r2_score(actual, model2)
ratioTable['Support_Vector_Machines'] = float(0)
ratioTable.Support_Vector_Machines[0] = accuracy_score(actual, model3)
ratioTable.Support_Vector_Machines[1] = f1_score(actual, model3, average='binary')
ratioTable.Support_Vector_Machines[2] = mean_absolute_error(actual, model3)
ratioTable.Support_Vector_Machines[3] = r2_score(actual, model3)

print ratioTable

                     Random_Forrest  Logistic_Regression  \
Accuracy                   0.788804             0.801527   
F1 Binary                  0.739812             0.740000   
Mean Absolute Error        0.211196             0.198473   
R^2                        0.138340             0.190247   

                     Support_Vector_Machines  
Accuracy                            0.661578  
F1 Binary                           0.523297  
Mean Absolute Error                 0.338422  
R^2                                -0.380732  


### Feature Importance for Forrest

In [12]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for f in range(8):
    print("%d. %s (%f)" % (f + 1, testdataF.columns.values[f], importances[indices[f]]))
    
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(8), importances[indices],color="r", yerr=std[indices], align="center")
plt.xticks(range(8), testdataF.columns.values)
plt.xlim([-1, 8])
#plt.show()

Feature ranking:
1. nSex (0.294356)
2. S (0.272687)
3. C (0.234407)
4. Q (0.081855)
5. nFamilySize (0.079585)
6. nAge (0.018361)
7. nFare (0.013006)
8. pclass (0.005742)
None


### Confusion Matrix

In [None]:
print 'Model 1 Confusion Matrix'
print confusion_matrix(actual['survived'], model1output[0])
print 'Model 2 Confusion Matrix'
print confusion_matrix(actual['survived'], model2output[0])
print 'Model 3 Confusion Matrix'
print confusion_matrix(actual['survived'], model3output[0])

### Print to File

In [None]:
predictions_file = open("model1.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model1output))
predictions_file.close()

predictions_file = open("model2.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model2output))
predictions_file.close()

predictions_file = open("model3.csv", "wb")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["Survived"])
open_file_object.writerows(zip(model3output))
predictions_file.close()