In [21]:
%matplotlib inline
import csv
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors
try:
    # scikit-learn >= 0.18 provides train_test_split here
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the (since removed) module
    from sklearn.cross_validation import train_test_split
from tqdm import tqdm

# Test under a variety of conditions and save the model that performs best overall

In [22]:
# Standardization (z-score)
def norm(x, axis = None):
    """Return ``x`` shifted by its mean and divided by its standard deviation.

    Args:
        x: Array-like object exposing ``.mean(axis=...)`` and supporting
            arithmetic with the reduced result (e.g. a NumPy array or a
            pandas DataFrame).
        axis: Forwarded unchanged to both ``x.mean`` and ``np.std``; the
            default ``None`` leaves the choice of reduction axis to them.

    Returns:
        ``(x - mean) / std``. The reduced statistics are combined with ``x``
        as-is (no ``keepdims``), and there is no guard against a zero
        standard deviation.
    """
    centered = x - x.mean(axis=axis)
    spread = np.std(x, axis=axis)
    return centered / spread

In [23]:
# Wrapped in a function so that different conditions can be tried easily
def test_knn(k=10, test_size=0.4, random_state=0):
    """Search k-NN settings on the Titanic training data and return the best one.

    Reads ``data/train.csv``, keeps the rows where ``Survived`` and the five
    numeric features are all present, splits them into a training part and a
    validation part, standardizes the features, and fits one
    ``KNeighborsClassifier`` for every combination of weighting scheme
    (``'uniform'``, ``'distance'``) and neighbour count ``1..k``.

    Args:
        k: Largest number of neighbours to try (inclusive).
        test_size: Passed to ``train_test_split`` as the validation share.
        random_state: Passed to ``train_test_split`` to seed the split.

    Returns:
        Tuple ``(best_score, best_clf)``: the highest ``clf.score(...) * 100``
        seen on the validation part and the fitted classifier that achieved
        it. On ties the later candidate wins. If ``k < 1`` nothing is fitted
        and ``(0, None)`` is returned.
    """
    feature_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

    train_data = pd.read_csv("data/train.csv")
    train_data = train_data[['Survived'] + feature_cols].dropna()
    titanic_X = train_data[feature_cols]
    titanic_y = train_data['Survived']
    train_X, test_X, train_y, test_y = train_test_split(
        titanic_X, titanic_y, test_size=test_size, random_state=random_state)

    # Standardize per feature (axis=0) using statistics of the training part
    # only, and apply those same statistics to the validation part. Scaling
    # each part with its own mean/std would put them on different scales, and
    # relying on an implicit axis would reduce over the whole frame on
    # recent pandas versions.
    feat_mean = train_X.mean(axis=0)
    feat_std = np.std(train_X, axis=0)
    train_X = (train_X - feat_mean) / feat_std
    test_X = (test_X - feat_mean) / feat_std

    # Track the best configuration seen so far
    best_score = 0
    best_clf = None

    weights = ['uniform', 'distance']
    ks = range(1, k + 1)
    # itertools.product has no length, so give tqdm the total explicitly.
    candidates = itertools.product(weights, ks)
    for w, n_neighbors in tqdm(candidates, total=len(weights) * len(ks)):
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights=w)
        clf.fit(train_X, train_y)
        score = clf.score(test_X, test_y) * 100

        # Keep this candidate when it matches or beats the best so far
        if score >= best_score:
            best_score = score
            best_clf = clf

    return best_score, best_clf

In [24]:
# Predict on the test data and write the submission file
score, clf = test_knn()

feature_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
test_data = pd.read_csv("data/test.csv")

# Fill missing feature values with each column's median. The result is
# assigned back to the frame instead of calling fillna(inplace=True) on a
# selected column, which is chained assignment and is not guaranteed to
# modify test_data on recent pandas versions.
test_data[feature_cols] = test_data[feature_cols].fillna(test_data[feature_cols].median())

# NOTE(review): test_knn() does not return the statistics it standardized the
# training data with, so the test features are standardized per column with
# their own mean/std here.
test_X = norm(test_data[feature_cols], axis=0)

# Pass the DataFrame itself so the column names and order match the frame
# the classifier was fitted on.
Z = clf.predict(test_X)

# One row per passenger: id and predicted label, with a header row.
submission = pd.DataFrame(
    {'PassengerId': test_data['PassengerId'], 'Survived': Z},
    columns=['PassengerId', 'Survived'])
submission.to_csv('result/knn.csv', index=False)


20it [00:00, 474.15it/s]
