In [1]:
import pandas
import numpy as np

In [2]:
# Data loading
train_dataframe = pandas.read_csv('train.csv')

In [3]:
train_dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def prepr(dataframe):
    # Not needed: 'PassengerId', 'Name'
    # Not yet treated: 'Ticket', 'Cabin'
    # Features: 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
    # Labels: 'Survived'
    
    # Sex
    mp = {'male': 1, 'female': -1}
    dataframe['Sex'] = train_dataframe['Sex'].map(lambda x : mp.get(x) if x in mp else x)
    
    # Embarked
    mp = {'C': -1, 'S': 0, 'Q': 1}
    dataframe['Embarked'] = train_dataframe['Embarked'].map(lambda x : mp.get(x) if x in mp else x)
    
    # need to find NaN
    dataframe.fillna(0,inplace=True)
    
    return dataframe[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [5]:
X_train_orig = prepr(train_dataframe).as_matrix()
Y_train = train_dataframe['Survived'].as_matrix().ravel()

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train_orig)
X_train = scaler.transform(X_train_orig)

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split

param_grid = [{'kernel': ['rbf'], 'C': [10, 100, 1000], 'gamma' : [1e-2, 1e-3, 1e-4]},
              {'kernel': ['linear'], 'C': [1, 10, 100]}]

X_train_a, X_train_v, Y_train_a, Y_train_v = train_test_split(X_train, Y_train)
gcv = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=4)
gcv.fit(X_train_a, Y_train_a)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid=[{'kernel': ['rbf'], 'C': [10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001]}, {'kernel': ['linear'], 'C': [1, 10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
mu = gcv.cv_results_['mean_test_score']
std = gcv.cv_results_['std_test_score']
pars = gcv.cv_results_['params']
for (m,s,p) in zip(mu,std,pars):
    print('%.2f (+-%.2f): %s' %(m,s,p))

0.80 (+-0.04): {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.79 (+-0.04): {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.74 (+-0.01): {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.81 (+-0.04): {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.79 (+-0.04): {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.79 (+-0.04): {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.81 (+-0.03): {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.80 (+-0.04): {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.79 (+-0.04): {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.79 (+-0.04): {'C': 1, 'kernel': 'linear'}
0.79 (+-0.04): {'C': 10, 'kernel': 'linear'}
0.79 (+-0.04): {'C': 100, 'kernel': 'linear'}


In [9]:
pred = gcv.predict(X_train_v)

In [10]:
from sklearn.metrics import classification_report, accuracy_score
print('hyperparams: %s' % gcv.best_params_)
print(classification_report(pred,Y_train_v))
print('accuracy %.2f' % accuracy_score(pred,Y_train_v))

hyperparams: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
             precision    recall  f1-score   support

          0       0.92      0.80      0.86       155
          1       0.65      0.84      0.73        68

avg / total       0.84      0.81      0.82       223

accuracy 0.81
