In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")

from sklearn.preprocessing import StandardScaler
try:
    # scikit-learn >= 0.18: sklearn.grid_search was deprecated there and removed in 0.20
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # fall back for scikit-learn < 0.18, where only the legacy module exists
    from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC

## Index
* [search best C](#search-best-C)
* [train again with whole data](#train-again-with-whole-data)
* [save predictor](#save-predictor)
* [generate submission](#generate-submission)

### search best C

In [15]:
# Columns fed to the classifier; kept in one list so the exact same selection
# can be applied to any other frame with this schema.
feature_names = [
    "Pclass", "Age", "SibSp", "Parch", "Fare", "IsMale",
    "Ticket-4digit", "Ticket-5digit", "Ticket-6digit",
]

# Load the preprocessed training table, indexed by passenger id.
train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")

# Split the table into the feature matrix and the target column.
Xtrain = train_df.loc[:, feature_names]
ytrain = train_df.loc[:, "Survived"]

In [16]:
# Learn the per-column scaling statistics from the training features, then apply
# them. The fitted `scaler` object is kept so other data can be transformed with
# the same statistics.
scaler = StandardScaler().fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)

In [17]:
# Candidate regularisation strengths: log-spaced values from 1e-4 to 1e4
# (np.logspace's default number of points).
Cs = np.logspace(-4, 4)
svc = LinearSVC(dual=False)

# n_jobs is kept at 1 rather than -1: multiprocessing cannot run inside the
# IPython interactive environment on Windows.
# NOTE(review): Xtrain_scaled was standardised on the full training set before this
# search, so each validation fold contributed to the scaling statistics. Wrapping the
# scaler and classifier in a Pipeline would avoid that — confirm whether it matters here.
searchcv = GridSearchCV(
    estimator=svc,
    param_grid={"C": Cs},
    n_jobs=1,
    cv=10,
)
searchcv.fit(Xtrain_scaled, ytrain)

GridSearchCV(cv=10, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   1.45635e-04,   2.12095e-04,   3.08884e-04,
         4.49843e-04,   6.55129e-04,   9.54095e-04,   1.38950e-03,
         2.02359e-03,   2.94705e-03,   4.29193e-03,   6.25055e-03,
         9.10298e-03,   1.32571e-02,   1.93070e-02,   2.81177e-02,
         4.0949...    1.52642e+03,   2.22300e+03,   3.23746e+03,   4.71487e+03,
         6.86649e+03,   1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [18]:
# Best mean cross-validated score reached by any C in the grid (10-fold, default scorer).
searchcv.best_score_

0.80134680134680136

In [19]:
# Pull out the C value that achieved the best cross-validated score and echo it.
bestc = searchcv.best_params_["C"]
bestc

0.013257113655901081

### train again with whole data

In [20]:
# Train a fresh classifier on the complete (scaled) training set using the selected C.
# `fit` returns the estimator itself, so the chained form binds the fitted model to `svc`.
svc = LinearSVC(C=bestc, dual=False).fit(Xtrain_scaled, ytrain)
svc

LinearSVC(C=0.013257113655901081, class_weight=None, dual=False,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
     tol=0.0001, verbose=0)

In [21]:
# Score on the very data the model was trained on — a sanity check, not a held-out estimate.
svc.score(Xtrain_scaled,ytrain)

0.80134680134680136

### save predictor

In [22]:
# Persist the fitted classifier to "svc.pkl" through the project-local `common` helper module.
# NOTE(review): only `svc` is handed to the helper; the fitted `scaler` is not, so presumably
# anyone loading the file must reproduce the same scaling — confirm against common.dump_predictor.
import common
common.dump_predictor("svc.pkl",svc)

### generate submission

In [23]:
# Load the preprocessed test table and select the same feature columns used for training.
test_df = pd.read_csv("test_processed.csv", index_col="PassengerId")
Xtest = test_df.loc[:, feature_names]

# Apply the scaler fitted on the training data (transform only, no refit),
# then predict with the classifier trained on the full training set.
Xtest_scaled = scaler.transform(Xtest)
predictions = svc.predict(Xtest_scaled)

# Hand the passenger ids and predicted labels to the project helper that writes the submission file.
common.make_submission(Xtest.index, predictions, "submit_svc.csv")