In [150]:
import numpy as np
import pandas
from matplotlib import pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [151]:
real_features = ["Age", "Fare"]
discrete_features = ["SibSp", "Parch", "Pclass"]
cat_features = ["Sex", "Embarked"]
target = "Survived"

In [152]:
data = pandas.read_csv("train.csv", na_values="NaN")
data.Age = data['Age'].fillna(data['Age'].mean())

In [153]:
data.Sex = data.Sex.replace({'female': 1}, regex=True)

In [154]:
data.Sex = data.Sex.replace({'male': 0}, regex=True)

In [155]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [156]:
X = data[discrete_features + ["Sex", "Age"]]

Y = data[target]

In [157]:
X.iloc[[1,2]]

Unnamed: 0,SibSp,Parch,Pclass,Sex,Age
1,1,0,1,1,38.0
2,0,0,3,1,26.0


In [158]:
# Функция для обучения и оценки классификатора
def train_and_estimate_knn(n, train_x, train_y, test_x, test_y):
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(train_x, train_y)
    return accuracy_score(test_y, knn.predict(test_x))  # Возвращаем долю верных классификаций на тестовой выборке

In [159]:
from sklearn.model_selection import KFold
n = 5
kf = KFold(n_splits=n)

In [160]:
sum = 0
knn_n = 5
for train, test in kf.split(X):
    sum+= train_and_estimate_knn(knn_n, X.iloc[train], Y.iloc[train] , X.iloc[test], Y.iloc[test])
print "Титаник"
print "KNN = ",  knn_n, ', оценка = ', sum / float(n)

Титаник
KNN =  5 , оценка =  0.767698198481


In [161]:
from sklearn.datasets import load_digits
digits = load_digits()  # загружаем датасет ирисов Фишера
X = digits.data  # Массив признаков
Y = digits.target  # Массив желаемых откликов

In [162]:
X?

In [163]:
# Функция для обучения и оценки классификатора
def train_and_estimate_dtc(train_x, train_y, test_x, test_y):
    cart = DecisionTreeClassifier()
    cart.fit(train_x, train_y)
    return accuracy_score(test_y, cart.predict(test_x))  # Возвращаем долю верных классификаций на тестовой выборке

In [164]:
sum = 0
for train, test in kf.split(X):
    sum+= train_and_estimate_dtc(X[train], Y[train] , X[test], Y[test])
print "Digits"
print "CART", ', оценка = ', sum / float(n)

Digits
CART , оценка =  0.786915815537


In [165]:
from sklearn.datasets import load_boston
boston = load_boston()  # загружаем датасет ирисов Фишера
X = boston.data  # Массив признаков
Y = boston.target  # Массив желаемых откликов

In [166]:
Y.mean()

22.532806324110677

In [167]:
print boston.DESCR

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [174]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Функция для обучения и оценки классификатора
def train_and_estimate_rfc(train_x, train_y, test_x, test_y):
    rfc = RandomForestRegressor(n_estimators = 90, random_state=42)
    rfc.fit(train_x, train_y)
    return mean_absolute_error(test_y, rfc.predict(test_x))  # Возвращаем долю верных классификаций на тестовой выборке

In [175]:
sum = 0
for train, test in kf.split(X):
    sum+= train_and_estimate_rfc(X[train], Y[train] , X[test], Y[test])
print "Boston house"
print "RandomForest", ', оценка = ', sum / float(n)

Boston house
RandomForest , оценка =  3.07842464246


In [176]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Функция для обучения и оценки классификатора
def train_and_estimate_gs(train_x, train_y, test_x, test_y):
    rfc = RandomForestRegressor(random_state=42)
    parameters = {'n_estimators':range(100, 200, 10)}
    gs = GridSearchCV(rfc, parameters)
    gs.fit(train_x, train_y)
    return mean_absolute_error(test_y, gs.predict(test_x))

In [177]:
sum = 0

for train, test in kf.split(X):
    sum+= train_and_estimate_gs(X[train], Y[train] , X[test], Y[test])

print "Boston house"
print "GridSearch на основе RandomForest", ', оценка = ', sum / float(n)

Boston house
GridSearch на основе RandomForest , оценка =  3.02376236349
