In [107]:
import numpy as np
import pandas
from matplotlib import pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [108]:
# Column groups of the passenger table, named by the kind of value they hold.
real_features = ['Age', 'Fare']                    # real-valued columns
discrete_features = ['SibSp', 'Parch', 'Pclass']   # integer-valued columns
cat_features = ['Sex', 'Embarked']                 # categorical columns
# Column the classifiers are trained to predict.
target = 'Survived'

In [109]:
# Read the training table; the string "NaN" is treated as a missing value.
data = pandas.read_csv("train.csv", na_values="NaN")
# Fill the gaps in Age with the mean of the ages that are present.
data["Age"] = data["Age"].fillna(value=data["Age"].mean())

In [110]:
# Encode the 'female' label as 1. Exact-value matching is used instead of a
# regular expression, so only cells equal to 'female' are rewritten.
data["Sex"] = data["Sex"].replace({'female': 1})

In [111]:
# Encode the 'male' label as 0. Exact-value matching is required here: as a
# regular expression, 'male' also matches inside 'female', which would make
# the result depend on the order in which the encoding cells are run.
data["Sex"] = data["Sex"].replace({'male': 0})

In [112]:
# Preview the first five rows of the table.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [113]:
# Feature matrix: the discrete columns together with Sex and Age.
X = data.loc[:, discrete_features + ["Sex", "Age"]]

# Label vector: the target column.
Y = data.loc[:, target]

In [114]:
# Show the rows at positions 1 and 2 of the feature matrix.
X.iloc[1:3]

Unnamed: 0,SibSp,Parch,Pclass,Sex,Age
1,1,0,1,1,38.0
2,0,0,3,1,26.0


In [115]:
# Train and evaluate a k-nearest-neighbours classifier.
def train_and_estimate_knn(n, train_x, train_y, test_x, test_y):
    """Fit a KNeighborsClassifier on the training split and score it.

    Parameters:
        n: value passed to the classifier as ``n_neighbors``.
        train_x, train_y: features and labels the classifier is fitted on.
        test_x, test_y: features and labels used for evaluation.

    Returns:
        The result of ``accuracy_score`` for the test labels against the
        classifier's predictions, i.e. the fraction of test samples that
        were classified correctly.
    """
    classifier = KNeighborsClassifier(n_neighbors=n)
    classifier.fit(train_x, train_y)
    predictions = classifier.predict(test_x)
    return accuracy_score(test_y, predictions)

In [116]:
from sklearn.model_selection import KFold
# Number of cross-validation folds.
n = 5
# K-fold splitter over consecutive, unshuffled blocks of rows.
kf = KFold(n_splits=n, shuffle=False)

In [117]:
# Mean K-fold accuracy of the k-nearest-neighbours classifier.
knn_n = 5  # value passed to the classifier as n_neighbors

# One accuracy value per fold. The scores are collected in a list rather than
# accumulated in a variable called `sum`, which would shadow the built-in
# function for every cell executed afterwards.
knn_scores = [
    train_and_estimate_knn(knn_n, X.iloc[train_idx], Y.iloc[train_idx],
                           X.iloc[test_idx], Y.iloc[test_idx])
    for train_idx, test_idx in kf.split(X)
]

# Average over the folds that were actually produced, so the result cannot
# get out of step with the splitter configuration.
knn_mean_score = np.mean(knn_scores)

# A single-argument print(...) runs under both Python 2 and Python 3; the
# pieces are joined with single spaces.
print(" ".join(["Для KNN = ", str(knn_n), ', оценка = ', str(knn_mean_score)]))

Для KNN =  5 , оценка =  0.767698198481


In [118]:
# Train and evaluate a decision-tree classifier.
def train_and_estimate_dtc(train_x, train_y, test_x, test_y, random_state=None):
    """Fit a DecisionTreeClassifier on the training split and score it.

    Parameters:
        train_x, train_y: features and labels the tree is fitted on.
        test_x, test_y: features and labels used for evaluation.
        random_state: optional seed forwarded to the classifier so that a run
            can be reproduced. The default ``None`` leaves the classifier
            unseeded.

    Returns:
        The result of ``accuracy_score`` for the test labels against the
        tree's predictions, i.e. the fraction of test samples that were
        classified correctly.
    """
    cart = DecisionTreeClassifier(random_state=random_state)
    cart.fit(train_x, train_y)
    return accuracy_score(test_y, cart.predict(test_x))

In [119]:
# Mean K-fold accuracy of the decision-tree classifier.
# The per-fold scores are collected in a list rather than accumulated in a
# variable called `sum`, which would shadow the built-in function for every
# cell executed afterwards.
cart_scores = [
    train_and_estimate_dtc(X.iloc[train_idx], Y.iloc[train_idx],
                           X.iloc[test_idx], Y.iloc[test_idx])
    for train_idx, test_idx in kf.split(X)
]

# Average over the folds that were actually produced, so the result cannot
# get out of step with the splitter configuration.
cart_mean_score = np.mean(cart_scores)

# A single-argument print(...) runs under both Python 2 and Python 3; the
# pieces are joined with single spaces.
print(" ".join(["Для CART", ', оценка = ', str(cart_mean_score)]))

Для CART , оценка =  0.798022722993


In [120]:
from sklearn.ensemble import RandomForestClassifier

# Train and evaluate a random-forest classifier.
def train_and_estimate_rfc(train_x, train_y, test_x, test_y,
                           n_estimators=70, random_state=None):
    """Fit a RandomForestClassifier on the training split and score it.

    Parameters:
        train_x, train_y: features and labels the forest is fitted on.
        test_x, test_y: features and labels used for evaluation.
        n_estimators: value passed to the classifier as ``n_estimators``;
            defaults to 70.
        random_state: optional seed forwarded to the classifier so that a run
            can be reproduced. The default ``None`` leaves the classifier
            unseeded.

    Returns:
        The result of ``accuracy_score`` for the test labels against the
        forest's predictions, i.e. the fraction of test samples that were
        classified correctly.
    """
    rfc = RandomForestClassifier(n_estimators=n_estimators,
                                 random_state=random_state)
    rfc.fit(train_x, train_y)
    return accuracy_score(test_y, rfc.predict(test_x))

In [121]:
# Mean K-fold accuracy of the random-forest classifier.
# The per-fold scores are collected in a list rather than accumulated in a
# variable called `sum`, which would shadow the built-in function for every
# cell executed afterwards.
rfc_scores = [
    train_and_estimate_rfc(X.iloc[train_idx], Y.iloc[train_idx],
                           X.iloc[test_idx], Y.iloc[test_idx])
    for train_idx, test_idx in kf.split(X)
]

# Average over the folds that were actually produced, so the result cannot
# get out of step with the splitter configuration.
rfc_mean_score = np.mean(rfc_scores)

# A single-argument print(...) runs under both Python 2 and Python 3; the
# pieces are joined with single spaces.
print(" ".join(["Для RandomForest", ', оценка = ', str(rfc_mean_score)]))

Для RandomForest , оценка =  0.803646977591


In [122]:
from sklearn.model_selection import GridSearchCV

# Train and evaluate a grid-searched random-forest classifier.
def train_and_estimate_gs(train_x, train_y, test_x, test_y,
                          param_grid=None, random_state=None):
    """Grid-search a RandomForestClassifier on the training split and score it.

    Parameters:
        train_x, train_y: features and labels the search is fitted on.
        test_x, test_y: features and labels used for evaluation.
        param_grid: optional parameter grid handed to ``GridSearchCV``. When
            omitted, ``n_estimators`` is searched over 70, 72, ..., 88.
        random_state: optional seed forwarded to the forest so that a run can
            be reproduced. The default ``None`` leaves the forest unseeded.

    Returns:
        The result of ``accuracy_score`` for the test labels against the
        predictions of the fitted search object, i.e. the fraction of test
        samples that were classified correctly.
    """
    if param_grid is None:
        param_grid = {'n_estimators': list(range(70, 90, 2))}
    rfc = RandomForestClassifier(random_state=random_state)
    # No cv/scoring arguments are given, so GridSearchCV uses its own defaults.
    gs = GridSearchCV(rfc, param_grid)
    gs.fit(train_x, train_y)
    return accuracy_score(test_y, gs.predict(test_x))

In [123]:
# Mean K-fold accuracy of the grid-searched random-forest classifier.
# The per-fold scores are collected in a list rather than accumulated in a
# variable called `sum`, which would shadow the built-in function for every
# cell executed afterwards.
gs_scores = [
    train_and_estimate_gs(X.iloc[train_idx], Y.iloc[train_idx],
                          X.iloc[test_idx], Y.iloc[test_idx])
    for train_idx, test_idx in kf.split(X)
]

# Average over the folds that were actually produced, so the result cannot
# get out of step with the splitter configuration.
gs_mean_score = np.mean(gs_scores)

# A single-argument print(...) runs under both Python 2 and Python 3; the
# pieces are joined with single spaces.
print(" ".join(["Для GridSearch на основе RandomForest", ', оценка = ', str(gs_mean_score)]))

Для GridSearch на основе RandomForest , оценка =  0.802523382085
