In [13]:
import numpy as np
import pandas as pd
from titanic_data_cleanup import cleanup
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.linear_model import SGDClassifier as SGD
from sklearn import cross_validation

## Data cleanup

In [2]:
#Load Data
df=pd.read_csv('train_data_cleaned.csv')
df_test=cleanup('test.csv')
df.pop('Unnamed: 0')
df.pop('PassengerId')
df.head(3)

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,Gender,AgeFill,AgeIsNull
0,0,3,1,0,7.25,1,22.0,0
1,1,1,1,0,71.2833,0,38.0,0
2,1,3,0,0,7.925,0,26.0,0


In [3]:
df_test.head(3)

Unnamed: 0,PassengerId,Pclass,SibSp,Parch,Fare,Gender,AgeFill,AgeIsNull
0,892,3,0,0,7.8292,1,34.5,0
1,893,3,1,0,7.0,0,47.0,0
2,894,2,0,0,9.6875,1,62.0,0


In [5]:
# Convert data into a usable form
train_X=df.values[:,1:]
train_y=df['Survived'].values
test_X=df_test.values[:,1:]
test_y=np.zeros(len(test_X),dtype=int)
print(train_X[:5])
print(train_y[:5])
print(test_X[:5])
print(test_y[:5])

[[  3.       1.       0.       7.25     1.      22.       0.    ]
 [  1.       1.       0.      71.2833   0.      38.       0.    ]
 [  3.       0.       0.       7.925    0.      26.       0.    ]
 [  1.       1.       0.      53.1      0.      35.       0.    ]
 [  3.       0.       0.       8.05     1.      35.       0.    ]]
[0 1 1 1 0]
[[  3.       0.       0.       7.8292   1.      34.5      0.    ]
 [  3.       1.       0.       7.       0.      47.       0.    ]
 [  2.       0.       0.       9.6875   1.      62.       0.    ]
 [  3.       0.       0.       8.6625   1.      27.       0.    ]
 [  3.       1.       1.      12.2875   0.      22.       0.    ]]
[0 0 0 0 0]


In [6]:
print(train_X.shape) # relatively small dataset -> batch training

(891, 7)


## Run Logistic Regression

In [7]:
# logistic regression with l2(ridge) regularization
clf_LR = LR(C=1.0).fit(train_X,train_y)

#Parameter tuning via grid search
best_score = clf_LR.score(train_X,train_y)
best_C = 1
C_values = [2**i for i in range(-20,20)]
for current_C in C_values:
    clf_LR = LR(C=current_C).fit(train_X,train_y)
    # new score
    current_score =\
    cross_validation.cross_val_score(clf_LR,train_X,train_y,cv=5).mean()
    if best_score <= current_score: # if improved, proceed
        best_C = current_C
        best_score = current_score


# Final training set accuracy
print('training set accuracy:',best_score)

#Test set accuracy
pred_test = clf_LR.predict(test_X)
print('test set accuracy:'+str(np.mean(pred_test==test_y)))

('training set accuracy:', 0.80022446689113358)
test set accuracy:0.623501199041


## Run Random Forest

In [11]:
# Fit the model with default option
clf_RF = RF(n_estimators=100)
clf_RF = clf_RF.fit(train_X, train_y)

#Training set accuracy
score_train = cross_validation.cross_val_score(clf_RF,train_X,train_y,cv=5).mean()
print('training set accuracy:'+str(score_train))

#Test set accuracy
pred_test = clf_RF.predict(test_X)
print('test set accuracy:'+str(np.mean(pred_test==test_y)))

training set accuracy:0.819394372696
test set accuracy:0.649880095923


## Run SVM

### 1. Linear SVM

In [10]:
clf_SVM_linear = SVC(kernel='linear')

#Parameter tuning via grid search
best_score = clf_SVM_linear.score(train_X,train_y)
best_C = 1
C_values = [2**i for i in range(-20,20)]
for current_C in C_values:
    clf_LR = LR(C=current_C).fit(train_X,train_y)
    # new score
    current_score =\
    cross_validation.cross_val_score(clf_LR,train_X,train_y,cv=5).mean()
    if best_score <= current_score: # if improved, proceed
        best_C = current_C
        best_score = current_score

score_train = cross_validation.cross_val_score(clf_SVM_linear,train_X,train_y,cv=5).mean()
print('training set accuracy:'+str(score_train))

#Test set prediction
pred_test = clf_SVM_linear.predict(test_X)
print('test set accuracy:'+str(np.mean(pred_test==test_y)))

training set accuracy:0.786756453423
test set accuracy:0.635491606715


In [11]:
#cross validation
#Learning curve
#parameter tuning

### 2. Kernel SVM

In [12]:
clf_SVM_RBF = SVC(kernel='rbf')
clf_SVM_RBF = clf_SVM_RBF.fit(train_X, train_y)

#Training set accuracy
pred_train = clf_SVM_RBF.predict(train_X)
print('training set accuracy:'+str(np.mean(pred_train==train_y)))

#Test set prediction
pred_test = clf_SVM_RBF.predict(test_X)
print('training set accuracy:'+str(np.mean(pred_test==test_y)))

training set accuracy:0.895622895623
training set accuracy:0.62829736211
