In [139]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [140]:
# загрузка данных
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [141]:
# посмотрим на данные
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [142]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [143]:
df_train.shape

(891, 12)

In [144]:
# посомтрим на пропуски
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [145]:
# выбросим 2 записи, у которых нет Embarked
df_train = df_train[df_train['Embarked'].notnull()]

In [146]:
# объединим тестовую выборку и трейн, чтобы вместе предобработать
cnt = len(df_train)
y_train = df_train['Survived']
df_train.drop('Survived', axis=1, inplace=True)

df_train = pd.concat([df_train, df_test], axis=0)

In [147]:
# есть идея проверить и создать интересную фичу из имени и фамилии - там есть, как я понимаю, девечьи имена женщин, 
# поставим 1 если такое есть имя в скобочках и 0 если нет, а если пол мужской поставим -1
df_train['Married'] = np.where((df_train['Name'].str.contains('\\(')) & (df_train['Name'].str.contains('\\)')), 1, 0)
df_train['Married'] = np.where(df_train['Sex'] == 'male', -1, df_train['Married'])

In [148]:
# есть предположение, что билеты с буквами отличаются от простых числовых
df_train['Ticket_with_digits'] = np.where(df_train['Ticket'].str.isdigit(), 1, 0)

In [149]:
# отметим там, где есть номер кабины
df_train['Cabin_exist'] = np.where(df_train['Cabin'].notnull(), 1, 0)

In [150]:
# удалим из рассмотрения столбец с id пассажира, так как он не отражает связи с какими либо данными - просто порядковый номер
# удалим также обработанные столбцы ранее
df_train.drop('PassengerId', axis=1, inplace = True)
df_train.drop('Ticket', axis=1, inplace = True)
df_train.drop('Cabin', axis=1, inplace = True)
df_train.drop('Name', axis=1, inplace = True)

In [151]:
# разберемся с категориальными признаками
df_train = pd.get_dummies(df_train, columns=['Sex', 'Embarked'])

In [152]:
# проверим возраст на адекватность
df_train[(df_train.Age < 1) | (df_train.Age > 100)]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Married,Ticket_with_digits,Cabin_exist,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
78,2,0.83,0,2,29.0,-1,1,0,0.0,1.0,0.0,0.0,1.0
305,1,0.92,1,2,151.55,-1,1,1,0.0,1.0,0.0,0.0,1.0
469,3,0.75,2,1,19.2583,0,1,0,1.0,0.0,1.0,0.0,0.0
644,3,0.75,2,1,19.2583,0,1,0,1.0,0.0,1.0,0.0,0.0
755,2,0.67,1,1,14.5,-1,1,0,0.0,1.0,0.0,0.0,1.0
803,3,0.42,0,1,8.5167,-1,1,0,0.0,1.0,1.0,0.0,0.0
831,2,0.83,1,1,18.75,-1,1,0,0.0,1.0,0.0,0.0,1.0
201,3,0.33,0,2,14.4,-1,1,0,0.0,1.0,0.0,0.0,1.0
250,2,0.92,1,2,27.75,0,0,0,1.0,0.0,0.0,0.0,1.0
281,3,0.75,1,1,13.775,-1,0,0,0.0,1.0,0.0,0.0,1.0


In [153]:
#  заменим все значения меньше нуля 1
df_train.loc[df_train.Age < 1, 'Age'] = 1

In [154]:
# заменим все значения меньше единицы 1 на медиану
df_train.loc[df_train.Fare < 1, 'Fare'] = df_train.Fare.mean()

In [155]:
# запомним имена колонок
columns_names = df_train.columns

In [156]:
# заменим все NaN средними
from sklearn.preprocessing import Imputer
fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=1)
df_train = fill_NaN.fit_transform(df_train)

In [157]:
def cross_val_predict_proba(estimator, X_train, y_train):
    kfold = KFold(n_splits=4, shuffle=True, random_state=None)
    return cross_val_predict(estimator, X_train, y_train, cv=kfold, method='predict_proba')


In [158]:
X_train = df_train[0:cnt]
X_test= df_train[cnt:]

In [159]:
?GridSearchCV

In [160]:
gb_param_grid =  {'learning_rate': [0.1, 0.05, 0.02],
            'max_depth': [4, 6, 8],
            'min_samples_leaf': range(2, 10),
            'max_features': ['auto', 'sqrt', 'log2'],
            'n_estimators': [10, 20, 50, 100, 200],
            }

gb_gridsearch = GridSearchCV(GradientBoostingClassifier(random_state=10767), gb_param_grid, 
                             scoring='accuracy', cv=5, n_jobs=4)
gb_gridsearch.fit(X_train, y_train)
gb_gridsearch.best_params_

{'learning_rate': 0.1,
 'max_depth': 6,
 'max_features': 'auto',
 'min_samples_leaf': 6,
 'n_estimators': 50}

In [161]:
rf_param_grid = { 
    'n_estimators': [10, 20, 50, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf_gridsearch = GridSearchCV(RandomForestClassifier(random_state=10767), rf_param_grid, 
                             scoring='accuracy', cv=5)
rf_gridsearch.fit(X_train, y_train)
rf_gridsearch.best_params_

{'max_features': 'auto', 'n_estimators': 200}

In [162]:
dt_param_grid =  {'splitter': ['random', 'best'],
              'max_depth': [4, 6, 8, 10, 12],
              'min_samples_leaf': range(2, 10),
              'max_features': ['auto', 'sqrt', 'log2'] 
            
              }

dt_gridsearch = GridSearchCV(DecisionTreeClassifier(random_state=10767), dt_param_grid,
                             scoring='accuracy', cv=5)
dt_gridsearch.fit(X_train, y_train)
dt_gridsearch.best_params_

{'max_depth': 12,
 'max_features': 'auto',
 'min_samples_leaf': 6,
 'splitter': 'best'}

In [176]:
from sklearn.linear_model import LogisticRegression

lr_param_grid = { 
    'C': [0.1, 0.05, 0.02, 0.01]
}

lr_gridsearch = GridSearchCV(LogisticRegression(random_state=10767), lr_param_grid, scoring='accuracy', cv=5)
lr_gridsearch.fit(X_train, y_train)
lr_gridsearch.best_params_

{'C': 0.05}

In [192]:
gb_estimator = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 6, max_features='auto',
                                          min_samples_leaf = 6, n_estimators=50, random_state=10767)

rf_estimator = RandomForestClassifier(max_features='auto', n_estimators=200, random_state=10767)

dt_estimator = DecisionTreeClassifier(max_depth = 12, max_features = 'auto', min_samples_leaf = 6, 
                                      splitter = 'best', random_state=10767)
lr_estimator = LogisticRegression(C=0.05)

In [193]:
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
dt_train_pred = cross_val_predict_proba(dt_estimator, X_train, y_train)
lr_train_pred = cross_val_predict_proba(lr_estimator, X_train, y_train)

In [200]:
X_train_stack = np.stack([rf_train_pred[:,1], gb_train_pred[:,1], 
#                           dt_train_pred[:,1],
                         lr_train_pred[:,1]], axis=1)

In [201]:
# получаем предсказания ансамблей для тестовой выборки
gb_test_pred = gb_estimator.fit(X_train, y_train).predict_proba(X_test)
rf_test_pred = rf_estimator.fit(X_train, y_train).predict_proba(X_test)
dt_test_pred = dt_estimator.fit(X_train, y_train).predict_proba(X_test)
lr_test_pred = lr_estimator.fit(X_train, y_train).predict_proba(X_test)

In [202]:
X_test_stack = np.stack([rf_test_pred[:,1], gb_test_pred[:,1], 
#                          dt_test_pred[:,1],
                         lr_test_pred[:,1]], axis=1)

In [203]:
from sklearn.linear_model import LogisticRegression

# TODO: подобрать гиперпараметры LogisticRegression

logreg = LogisticRegression().fit(X_train_stack, y_train)
predicted = logreg.predict(X_test_stack)

In [204]:
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    for passenger, y in zip(df_test['PassengerId'], predicted):
        out.write('%s,%s\n' % (passenger, y))