In [1]:
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.svm as svm
import sklearn.neighbors as nb
import sklearn.cross_validation as cv
import sklearn.grid_search as gs
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn
import matplotlib  # ipymd-skip
matplotlib.rcParams['figure.dpi'] = matplotlib.rcParams['savefig.dpi'] = 144  # ipymd-skip

In [2]:
train = pd.read_csv('data/titanic.csv')  # Load the CSV file into a DataFrame.
train.head(3)  # Display the first 3 rows.

In [3]:
# Keep only the columns used below, with the target ('Survived') last.
# Take an explicit copy so the column assignment that follows modifies an
# independent frame instead of a selection derived from `train` (assigning
# on such a selection is what triggers pandas' SettingWithCopyWarning).
data = train[['Sex', 'Age', 'Pclass', 'Survived']].copy()
data['Sex'] = data['Sex'] == 'female'  # Encode as boolean: True for 'female'.
data = data.dropna()  # Drop rows that have missing values.

In [4]:
# Cast every column to int32 and pull out the underlying NumPy array.
# NOTE(review): the integer cast truncates any fractional values (e.g. in
# 'Age', if it is stored as float) -- confirm this is intended.
data_np = data.astype(np.int32).values
# Every column but the last is a feature; the last column is the target.
X, y = data_np[:, :-1], data_np[:, -1]

In [5]:
# Hold out 5% of the samples for testing. The split is seeded so that a fresh
# "Restart & Run All" reproduces the same partition (and therefore the same
# scores) instead of drawing a new random split on every run.
(X_train, X_test, y_train, y_test) = cv.train_test_split(
    X, y, test_size=.05, random_state=0)

In [6]:
def eval_model(model):
    """Fit ``model`` on the training split and print a short test report.

    The data is not passed in: the function reads the notebook-level
    variables ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    The report consists of the model's class name, an underline of the same
    length, the first 20 actual and predicted test targets, and the value
    returned by ``model.score(X_test, y_test)`` formatted with three
    decimals, followed by a blank line.

    Parameters
    ----------
    model : object exposing ``fit``, ``predict`` and ``score``
        The estimator to train and evaluate; it is fitted in place.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    # Train first, then predict and score on the held-out samples.
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    test_score = model.score(X_test, y_test)

    # Header: class name underlined with dashes of matching length.
    title = model.__class__.__name__
    print(title)
    print('-' * len(title))

    # Only the first 20 targets are shown to keep the output compact.
    print("   Actual:", y_test[:20])
    print("Predicted:", predictions[:20])
    print("Score: {0:.3f}".format(test_score))
    print()
    return model

In [7]:
# Classifiers to compare; each one is constructed with no arguments.
models = [
    nb.KNeighborsClassifier(),
    svm.LinearSVC(),
    lm.LogisticRegression(),
    ]

In [8]:
# Fit every candidate in turn and print its report on the held-out split.
for model in models:
    eval_model(model)

KNeighborsClassifier
--------------------
   Actual: [0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0]
Predicted: [0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 1 0]
Score: 0.833

LinearSVC
---------
   Actual: [0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0]
Predicted: [0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0]
Score: 0.833

LogisticRegression
------------------
   Actual: [0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0]
Predicted: [0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0]
Score: 0.778

In [9]:
# Grid-search LogisticRegression over 100 values of C, log-spaced between
# 1e-2 and 1e2, and report the search object through eval_model. `model` is
# bound to that same GridSearchCV object, since eval_model returns its argument.
model = eval_model(gs.GridSearchCV(lm.LogisticRegression(),
                                   {'C': np.logspace(-2., 2., 100)}))

GridSearchCV
------------
   Actual: [0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0]
Predicted: [0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0]
Score: 0.778

In [10]:
# Show the parameter setting selected by the grid search.
model.best_params_

{'C': 1.831}