In [88]:
from __future__ import division
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

In [89]:
# DataFrame.from_csv was deprecated and then removed from pandas; read_csv
# is the supported loader. PassengerId is the first column of both files
# and is used as the row index, as before.
data_train = pd.read_csv('train.csv', index_col='PassengerId')
data_test = pd.read_csv('test.csv', index_col='PassengerId')

In [90]:
# First rows, summary statistics of the numeric columns, and the number of
# missing values per column. A single display() call renders the frames as
# rich tables instead of the plain-text dump print() produces.
display(
    data_train.head(),
    data_train.describe(),
    data_train.isnull().sum(),
)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


         Survived      Pclass         Age       SibSp       Parch        Fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64


In [91]:
# Features are every column except the target. `.ix` has been removed from
# pandas; dropping the target by name does not depend on column position
# and returns a new frame, so later preprocessing never writes into a
# slice of data_train.
data_train_X = data_train.drop('Survived', axis=1)
data_train_y = data_train['Survived']

In [92]:
# Preview the feature columns once the target has been split off.
display(data_train_X.head(n=5))

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [93]:
def split_cabin_string(x):
    """Split a cabin value into its first character and the remainder.

    Parameters
    ----------
    x : object
        A cabin value. Strings are split; any other value (for example the
        NaN pandas uses for a missing entry) is passed through unchanged.

    Returns
    -------
    tuple
        ``(x[:1], x[1:])`` for a string -- ``'C85'`` gives ``('C', '85')``
        and ``''`` gives ``('', '')`` -- otherwise ``(x, x)``.
    """
    # isinstance also accepts str subclasses (e.g. numpy.str_), which an
    # exact type comparison would wrongly treat as "not a string".
    if isinstance(x, str):
        # Slicing rather than indexing so an empty string does not raise
        # IndexError.
        return x[:1], x[1:]
    return x, x
    
def preprocess_data(df):
    """Turn a raw passenger frame into a numeric feature frame.

    The input frame is left untouched; a new DataFrame is returned.

    Steps:
      * drop the ``Name`` and ``Ticket`` columns;
      * reduce ``Cabin`` to its first character (via ``split_cabin_string``);
      * one-hot encode ``Pclass``, ``Sex``, the cabin letter and
        ``Embarked`` (a missing value gets no indicator column, so such a
        row is all-zero within that group);
      * replace missing ``Age`` values -- and missing ``Fare`` values when
        that column is present -- with the column mean of this frame.

    Note that ``pd.get_dummies`` only creates columns for categories that
    occur in ``df``, so two different frames can come back with different
    column sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name``, ``Ticket``, ``Pclass``, ``Sex``, ``Age``,
        ``Cabin`` and ``Embarked``; any other columns are kept as they are.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the ``Pclass_*``,
        ``Sex_*``, ``Cabin_*`` and ``Embark_*`` indicator columns.
    """
    # A column-wise map replaces the row-wise apply: only Cabin is needed.
    cabin_letter = df['Cabin'].map(lambda cabin: split_cabin_string(cabin)[0])

    pclass_encoded = pd.get_dummies(df['Pclass'], prefix='Pclass')
    sex_encoded = pd.get_dummies(df['Sex'], prefix='Sex')
    cabin_encoded = pd.get_dummies(cabin_letter, prefix='Cabin')
    embarked_encoded = pd.get_dummies(df['Embarked'], prefix='Embark')

    # drop() without inplace returns a new frame, so the caller's DataFrame
    # is never modified and the function can be called again on the same
    # raw input.
    result = (
        df.drop(['Name', 'Ticket', 'Pclass', 'Sex', 'Cabin', 'Embarked'], axis=1)
        .join(pclass_encoded)
        .join(sex_encoded)
        .join(cabin_encoded)
        .join(embarked_encoded)
    )

    # Assign the filled column back instead of writing through a chained
    # ``df['Age'].loc[...]`` selection, which may act on a temporary copy.
    result['Age'] = result['Age'].fillna(result['Age'].mean())
    # Estimators such as SVC reject NaN inputs, so give Fare the same
    # treatment; this is a no-op when Fare has no missing values.
    if 'Fare' in result.columns:
        result['Fare'] = result['Fare'].fillna(result['Fare'].mean())

    return result

# Build the processed features from the raw training frame each time, so
# re-running this cell gives the same result instead of feeding an
# already-processed data_train_X back into preprocess_data.
data_train_X = preprocess_data(data_train.drop('Survived', axis=1))

In [94]:
# Preview the raw test set.
data_test.head(n=5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [96]:
# pd.get_dummies only creates indicator columns for categories present in
# the frame it is given, so the encoded test frame may lack (or add)
# columns relative to the training features. Align it to the training
# columns -- same set, same order, absent indicators filled with 0 -- so
# the fitted model sees the features it was trained on.
data_test = preprocess_data(data_test).reindex(
    columns=data_train_X.columns, fill_value=0
)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Support-vector classifier tuned by an exhaustive grid search over the
# kernel type and the penalty parameter C, using 9-fold cross-validation
# on the training data.
reg = SVC()
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10, 100],
}
reg_gs = GridSearchCV(estimator=reg, param_grid=parameters, cv=9)
reg_gs.fit(data_train_X, data_train_y)

# Predict on the preprocessed test set with the fitted search object.
y_pred = reg_gs.predict(data_test)

In [None]:
# Report the best mean cross-validated score and the parameter combination
# that achieved it. Fitted GridSearchCV attributes end in an underscore
# (best_score_, best_params_).
print("Best score: {}".format(reg_gs.best_score_))
print("Best params: {}".format(reg_gs.best_params_))