## Контест на предсказание выживших пассажиров в Титанике

Это контест на бинарную классификацию со сложными признаками.

https://www.kaggle.com/c/titanic

In [160]:
# стандартные импорты
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# 1) Один раз качаем данные

In [161]:
# Read the labelled and unlabelled Kaggle tables, indexing rows by PassengerId.
train_data, test_data = (
    pd.read_csv(csv_name, index_col='PassengerId')
    for csv_name in ('train.csv', 'test.csv')
)

In [162]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [163]:
test_data.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [167]:
# Separate the target column (y) from the feature columns (X).
y = train_data['Survived']
X = train_data.drop('Survived', axis=1)

X.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [168]:
y.head()

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

# 2) Один раз поделим train_data на train и test части

Теперь мы будем работать с X, y чтобы подобрать хорошую модель, а потом применим ее на test_data и отправим на Kaggle.

Дальше есть выбор:

* Можно один раз поделить train_data на две части
* Можно использовать кросс-валидацию, то есть делить выборку на 10 частей, каждую использовать как тестовую и усреднить результат.

Для второго варианта вначале делать ничего не надо, для первого - надо поделить на тестовую и тренировочную.

Второй вариант лучше, но работает дольше (в 10 раз). Давайте на всякий случай все-таки поделим, если вы пользоваться не будете, ну и ладно.

In [169]:
from sklearn.model_selection import train_test_split

# Hold out a third of the labelled rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Show the shape of every part of the split.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(596, 10)
(295, 10)
(596,)
(295,)


После этих двух частей у нас готовы X, y, X_train, y_train, X_test, y_test, test_data.

# 3) Выбираем лучшую модель
Это самая сложная и самая важная часть.

Какие вещи тут можно делать:

1) выбирать разные модели (лог регрессия, деревья, бустинги и так далее)

2) инжинирить фичи: добавлять новые, удалять плохие

# Попытка 1
Сделаем самое простое - используем лог регрессию и только пол. Это можно будет использовать как бейзлайн.

In [170]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

Для изменения признаков удобно написать функцию transform. Почему? Потому что потом такое же изменение признаков нам придется применить и к test_data, и можно будет просто вызвать функцию.

In [171]:
# Используем LabelEncoder, чтобы male/female превратить в 0/1

from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()

# пример, как это работает
label_enc.fit_transform(['male', 'female', 'male', 'male', 'unknown'])

array([1, 0, 1, 1, 2])

In [172]:
# просто вернем только пол
def transform(X):
    """Build the baseline feature table: only the passenger's sex, label-encoded.

    The notebook-level ``label_enc`` is re-fitted on every call.  The result
    keeps the index of ``X`` and has a single column named ``'Sex'``.
    """
    encoded_sex = label_enc.fit_transform(X['Sex'])
    return pd.DataFrame({'Sex': encoded_sex}, index=X.index)

transform(X_train).head()

Unnamed: 0_level_0,Sex
PassengerId,Unnamed: 1_level_1
7,1
719,1
686,1
74,1
883,0


In [173]:
# обучаем модель на преобразованных признаках

model.fit(transform(X_train), y_train)
y_pred = model.predict(transform(X_test))

In [174]:
# It turns out the model simply predicts that every woman survived and every man died.
# zip() returns an iterator on Python 3, so materialise it before slicing.
list(zip(X_test['Sex'], y_pred))[:10]

[('male', 0),
 ('male', 0),
 ('male', 0),
 ('female', 1),
 ('female', 1),
 ('female', 1),
 ('female', 1),
 ('male', 0),
 ('female', 1),
 ('female', 1)]

In [175]:
from sklearn.metrics import accuracy_score

# смотрим точность
accuracy_score(y_pred, y_test)

0.79661016949152541

Итак, самая примитивная модель набирает почти 79.7% accuracy. Довольно круто.

Давайте ее отправим на Kaggle и убедимся, что там тоже примерно столько же. См. последний раздел 4) Отправляем на Kaggle.

# Попытка 2
Давайте немного улучшим бейзлайн: добавим возраст и добьемся увеличения результата. 

In [304]:
# Используем Imputer, чтобы избавиться от NaN

from sklearn.preprocessing import Imputer
mean_imputer = Imputer(strategy="mean") # заменяем на среднее

# пример, как это работает
mean_imputer.fit_transform(np.array([2, 4, 2, np.nan, 6, 8, np.nan]).reshape(-1, 1))
# reshape нужен, потому что imputer работает только с двумерными массивами :(

array([[ 2. ],
       [ 4. ],
       [ 2. ],
       [ 4.4],
       [ 6. ],
       [ 8. ],
       [ 4.4]])

In [305]:
# пол + возраст
def transform(X):
    """Build a two-column feature table: label-encoded sex and mean-imputed age.

    The notebook-level ``label_enc`` and ``mean_imputer`` are re-fitted on
    every call, so the mean used to fill missing ages is computed from the
    ``X`` that is passed in.  The result keeps the index of ``X``.
    """
    new_X = pd.DataFrame(index=X.index)
    new_X['Sex'] = label_enc.fit_transform(X['Sex'])
    # The imputer needs a 2-D input.  Series.reshape is not available in
    # current pandas, so reshape the underlying ndarray and flatten the
    # result back to 1-D before storing it as a column.
    age_column = X['Age'].values.reshape(-1, 1)
    new_X['Age'] = mean_imputer.fit_transform(age_column).ravel()
    return new_X

transform(X_train).head()

Unnamed: 0_level_0,Sex,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
7,1,54.0
719,1,29.525983
686,1,25.0
74,1,26.0
883,0,22.0


In [306]:
# обучаем все ту же лог регрессию и проверяем точность

model = LogisticRegression()
model.fit(transform(X_train), y_train)
y_pred = model.predict(transform(X_test))
accuracy_score(y_pred, y_test)

0.79661016949152541

Ничего не изменилось, давайте попробуем поменять коэффициент регуляризации логрегрессии.

In [307]:
# Sweep the inverse regularisation strength C over a log-spaced grid.
# The features do not depend on C, so build them once instead of per iteration.
train_features = transform(X_train)
test_features = transform(X_test)
for c in np.exp(np.linspace(-10, 15, 20)):
    model = LogisticRegression(C=c)
    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    print('C = {:13.5f}, accuracy = {}'.format(c, accuracy_score(y_pred, y_test)))

C =       0.00005, accuracy = 0.593220338983
C =       0.00017, accuracy = 0.593220338983
C =       0.00063, accuracy = 0.593220338983
C =       0.00235, accuracy = 0.593220338983
C =       0.00877, accuracy = 0.593220338983
C =       0.03268, accuracy = 0.796610169492
C =       0.12181, accuracy = 0.796610169492
C =       0.45408, accuracy = 0.796610169492
C =       1.69268, accuracy = 0.796610169492
C =       6.30981, accuracy = 0.796610169492
C =      23.52103, accuracy = 0.796610169492
C =      87.67916, accuracy = 0.796610169492
C =     326.84096, accuracy = 0.796610169492
C =    1218.36266, accuracy = 0.796610169492
C =    4541.68166, accuracy = 0.796610169492
C =   16929.99374, accuracy = 0.796610169492
C =   63109.81473, accuracy = 0.796610169492
C =  235253.99815, accuracy = 0.796610169492
C =  876954.62078, accuracy = 0.796610169492
C = 3269017.37247, accuracy = 0.796610169492


Не получилось, давайте попробуем поменять логрегрессию на RandomForest

In [308]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(transform(X_train), y_train)
y_pred = model.predict(transform(X_test))
accuracy_score(y_pred, y_test)

0.79322033898305089

In [309]:
# Try random forests of increasing size on the hold-out split.
# The features do not depend on the number of trees, so build them once.
train_features = transform(X_train)
test_features = transform(X_test)
for n in [1, 5, 10, 20, 50, 100, 200]:
    model = RandomForestClassifier(n_estimators=n)
    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    print('Trees = {:13.5f}, accuracy = {}'.format(n, accuracy_score(y_pred, y_test)))

Trees =       1.00000, accuracy = 0.759322033898
Trees =       5.00000, accuracy = 0.786440677966
Trees =      10.00000, accuracy = 0.779661016949
Trees =      20.00000, accuracy = 0.789830508475
Trees =      50.00000, accuracy = 0.789830508475
Trees =     100.00000, accuracy = 0.783050847458
Trees =     200.00000, accuracy = 0.783050847458


In [310]:
from sklearn.ensemble import GradientBoostingClassifier
# Try gradient boosting with an increasing number of trees on the hold-out split.
# The features do not depend on the number of trees, so build them once.
train_features = transform(X_train)
test_features = transform(X_test)
for n in [1, 5, 10, 20, 50, 100, 200]:
    model = GradientBoostingClassifier(n_estimators=n)
    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    print('Trees = {:13.5f}, accuracy = {}'.format(n, accuracy_score(y_pred, y_test)))

Trees =       1.00000, accuracy = 0.593220338983
Trees =       5.00000, accuracy = 0.708474576271
Trees =      10.00000, accuracy = 0.8
Trees =      20.00000, accuracy = 0.8
Trees =      50.00000, accuracy = 0.813559322034
Trees =     100.00000, accuracy = 0.813559322034
Trees =     200.00000, accuracy = 0.786440677966


Увеличивается незначительно, но давайте попробуем GradientBoostingClassifier с 100 деревьями на этих двух признаках.

# Попытка 3

Видим, что возраст как число помогает плохо. Можно попытаться заменить его на четыре категории: дети, взрослые, пожилые, неизвестно.

In [311]:
# Используем OneHotEncoder, чтобы превратить категориальные признаки в несколько числовых 0/1 признаков

from sklearn.preprocessing import OneHotEncoder
oh_enc = OneHotEncoder(sparse=False)

# пример, как это работает
oh_enc.fit_transform(label_enc.fit_transform(['child', 'adult', 'child', 'child', np.nan]).reshape(-1, 1))

array([[ 0.,  1.,  0.],
       [ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [312]:
from copy import copy

# пол + категории возраста
def transform(X):
    """Build features from sex plus a four-way age category.

    Age is bucketed into 'Age: NaN' (missing), 'Age: child' (< 16),
    'Age: old' (> 60) and 'Age: adult' (16..60 inclusive); the bucket is then
    one-hot encoded.  The returned frame holds the one-hot age columns
    followed by the label-encoded sex column, all with positional integer
    column labels, and keeps the index of ``X``.

    The notebook-level ``label_enc`` and ``oh_enc`` are re-fitted on every
    call, so the number of age columns depends on which buckets occur in
    the ``X`` that is passed in.
    """
    sex = pd.DataFrame(label_enc.fit_transform(X['Sex']), index=X.index)

    # Start from an object-dtype Series so the string labels are never
    # written into a float column (newer pandas deprecates that upcast).
    raw_age = X['Age']
    age_categories = pd.Series('Age: adult', index=X.index, dtype=object)
    age_categories[raw_age.isnull()] = 'Age: NaN'
    age_categories[raw_age < 16.0] = 'Age: child'
    age_categories[raw_age > 60.0] = 'Age: old'
    age_codes = label_enc.fit_transform(age_categories.values).reshape(-1, 1)
    age = pd.DataFrame(oh_enc.fit_transform(age_codes), index=X.index)

    new_X = pd.concat([age, sex], axis=1)
    return new_X

transform(X_train).head(10)

Unnamed: 0_level_0,0,1,2,3,0
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,0.0,1.0,0.0,0.0,1
719,1.0,0.0,0.0,0.0,1
686,0.0,1.0,0.0,0.0,1
74,0.0,1.0,0.0,0.0,1
883,0.0,1.0,0.0,0.0,0
329,0.0,1.0,0.0,0.0,0
454,0.0,1.0,0.0,0.0,1
146,0.0,1.0,0.0,0.0,1
235,0.0,1.0,0.0,0.0,1
221,0.0,1.0,0.0,0.0,1


И давайте вместо проверки на X_test будем делать кросс-валидацию - это дольше, но точнее дает понять, мы улучшаем алгоритм, или нам везет.

In [313]:
from sklearn.model_selection import cross_val_score

cross_val_score(GradientBoostingClassifier(n_estimators=5), transform(X), y, cv=10)

array([ 0.71111111,  0.73333333,  0.76404494,  0.79775281,  0.79775281,
        0.71910112,  0.76404494,  0.68539326,  0.76404494,  0.76136364])

In [314]:
cross_val_score(GradientBoostingClassifier(n_estimators=5), transform(X), y, cv=10).mean()

0.74979429122687558

In [315]:
# 10-fold cross-validated accuracy of gradient boosting for several ensemble sizes.
# The feature table is the same for every n, so compute it once.
features = transform(X)
for n in [1, 5, 10, 20, 50, 100, 200, 500]:
    score = cross_val_score(GradientBoostingClassifier(n_estimators=n), features, y, cv=10).mean()
    print('Trees = {:13.5f}, accuracy = {}'.format(n, score))

Trees =       1.00000, accuracy = 0.616170128249
Trees =       5.00000, accuracy = 0.749794291227
Trees =      10.00000, accuracy = 0.782253716945
Trees =      20.00000, accuracy = 0.778882930428
Trees =      50.00000, accuracy = 0.78337731245
Trees =     100.00000, accuracy = 0.78337731245
Trees =     200.00000, accuracy = 0.78337731245
Trees =     500.00000, accuracy = 0.78337731245


Улучшений не видно, топчемся на месте. Но давайте попробуем заслать.

# Попытка 4

Давайте придумаем как аккуратно загрузить все признаки.

In [316]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 10 columns):
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 116.6+ KB


In [317]:
X.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


* Pclass - класс пассажира возьмем как OneHot Encoding
* Name - оттуда интересно извлечь, есть ли титул, но мы пока просто добавим длину имени по количеству слов
* пол - уже учли категориальную бинарную фичу
* возраст - уже учли как 4 категории
* номер билета - выкидываем
* стоимость билета - берем как фичу
* номер каюты - здесь интересно посмотреть подробнее, но пока просто извлечем оттуда число и букву
* порт отправки - берем как OneHotEncoding

In [318]:
import re

def extract_cabin_features(cabin):
    """Split a cabin label into its leading text and trailing integer.

    The label is cut at the first position from which the rest of the string
    parses as an integer, e.g. ``'C136'`` -> ``('C', 136)``.

    Args:
        cabin: the raw cabin value; may be missing (``None`` or NaN).

    Returns:
        ``(prefix, number)``, or ``(None, None)`` when the value is missing
        or no suffix of it is an integer.
    """
    # Missing values can be None or any NaN float, not just the np.nan
    # singleton, so test for missingness rather than identity.
    if pd.isnull(cabin):
        return None, None
    for split_at in range(len(cabin)):
        try:
            number = int(cabin[split_at:])
        except ValueError:
            # This suffix is not an integer; try a shorter one.
            continue
        return cabin[:split_at], number
    return None, None
        
extract_cabin_features('C136')

('C', 136)

In [339]:
# все фичи
def transform(X):
    """Assemble the Attempt 4 feature table from the raw passenger columns.

    The pieces are concatenated in this order: one-hot age category,
    label-encoded sex, one-hot passenger class, number of words in the name,
    mean-imputed fare, two cabin-derived columns, one-hot port of
    embarkation.  Ticket, SibSp and Parch are not used.  Each piece keeps its
    own positional integer column labels, so labels repeat in the result.

    The notebook-level ``label_enc``, ``oh_enc`` and ``mean_imputer`` are
    re-fitted on every call.
    """
    sex = pd.DataFrame(label_enc.fit_transform(X['Sex']), index=X.index)

    # Bucket age into four string labels.  Start from an object-dtype Series
    # so the labels are never written into a float column (newer pandas
    # deprecates that upcast).
    raw_age = X['Age']
    age_categories = pd.Series('Age: adult', index=X.index, dtype=object)
    age_categories[raw_age.isnull()] = 'Age: NaN'
    age_categories[raw_age < 16.0] = 'Age: child'
    age_categories[raw_age > 60.0] = 'Age: old'
    age = pd.DataFrame(oh_enc.fit_transform(label_enc.fit_transform(age_categories.values).reshape(-1, 1)), index=X.index)

    pclass = pd.DataFrame(oh_enc.fit_transform(label_enc.fit_transform(X['Pclass']).reshape(-1, 1)), index=X.index)

    # Crude name feature: how many whitespace-separated words the name has.
    name = pd.DataFrame([len(full_name.split()) for full_name in X['Name']], index=X.index)

    # The imputer needs a 2-D input; Series.reshape is not available in
    # current pandas, so reshape the underlying ndarray.
    fare = pd.DataFrame(mean_imputer.fit_transform(X['Fare'].values.reshape(-1, 1)), index=X.index)

    # Column 0 is the cabin prefix, column 1 the cabin number (both None when absent).
    cabin = pd.DataFrame([extract_cabin_features(raw_cabin) for raw_cabin in X['Cabin']], index=X.index)
    # NOTE(review): the encoder yields one column per distinct prefix, yet the
    # result is assigned to a single column; the recorded output shows only one
    # 0/1 indicator surviving.  Confirm whether all one-hot columns were intended.
    cabin[0] = oh_enc.fit_transform(label_enc.fit_transform(cabin[0]).reshape(-1, 1))
    cabin[1] = cabin[1].replace(np.nan, 0)

    embarked = pd.DataFrame(oh_enc.fit_transform(label_enc.fit_transform(X['Embarked']).reshape(-1, 1)), index=X.index)

    new_X = pd.concat([age, sex, pclass, name, fare, cabin, embarked], axis=1)
    return new_X

transform(X_train).head(10)

Unnamed: 0_level_0,0,1,2,3,0,0,1,2,0,0,0,1,0,1,2,3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7,0.0,1.0,0.0,0.0,1,1.0,0.0,0.0,4,51.8625,0.0,46.0,0.0,0.0,0.0,1.0
719,1.0,0.0,0.0,0.0,1,0.0,0.0,1.0,3,15.5,1.0,0.0,0.0,0.0,1.0,0.0
686,0.0,1.0,0.0,0.0,1,0.0,1.0,0.0,5,41.5792,1.0,0.0,0.0,1.0,0.0,0.0
74,0.0,1.0,0.0,0.0,1,0.0,0.0,1.0,3,14.4542,1.0,0.0,0.0,1.0,0.0,0.0
883,0.0,1.0,0.0,0.0,0,0.0,0.0,1.0,4,10.5167,1.0,0.0,0.0,0.0,0.0,1.0
329,0.0,1.0,0.0,0.0,0,0.0,0.0,1.0,7,20.525,1.0,0.0,0.0,0.0,0.0,1.0
454,0.0,1.0,0.0,0.0,1,1.0,0.0,0.0,4,89.1042,0.0,92.0,0.0,1.0,0.0,0.0
146,0.0,1.0,0.0,0.0,1,0.0,1.0,0.0,4,36.75,1.0,0.0,0.0,0.0,0.0,1.0
235,0.0,1.0,0.0,0.0,1,0.0,1.0,0.0,5,10.5,1.0,0.0,0.0,0.0,0.0,1.0
221,0.0,1.0,0.0,0.0,1,0.0,0.0,1.0,4,8.05,1.0,0.0,0.0,0.0,0.0,1.0


In [340]:
# 10-fold cross-validated accuracy of gradient boosting on the full feature set.
# The feature table is the same for every n, so compute it once.
features = transform(X)
for n in [1, 5, 10, 20, 50, 100, 200, 500]:
    score = cross_val_score(GradientBoostingClassifier(n_estimators=n), features, y, cv=10).mean()
    print('Trees = {:13.5f}, accuracy = {}'.format(n, score))

Trees =       1.00000, accuracy = 0.616170128249
Trees =       5.00000, accuracy = 0.791343491091
Trees =      10.00000, accuracy = 0.81148081943
Trees =      20.00000, accuracy = 0.804764498922
Trees =      50.00000, accuracy = 0.81042021337
Trees =     100.00000, accuracy = 0.829459198729
Trees =     200.00000, accuracy = 0.831656168426
Trees =     500.00000, accuracy = 0.82269237317


In [378]:
# 10-fold cross-validated accuracy of a random forest on the full feature set.
# The feature table is the same for every n, so compute it once.
features = transform(X)
for n in [1, 5, 10, 20, 50, 100, 200, 500]:
    score = cross_val_score(RandomForestClassifier(n_estimators=n), features, y, cv=10).mean()
    print('Trees = {:13.5f}, accuracy = {}'.format(n, score))

Trees =       1.00000, accuracy = 0.765425036886
Trees =       5.00000, accuracy = 0.792366643968
Trees =      10.00000, accuracy = 0.814889910339
Trees =      20.00000, accuracy = 0.81369112473
Trees =      50.00000, accuracy = 0.810358075133
Trees =     100.00000, accuracy = 0.815950800136
Trees =     200.00000, accuracy = 0.819334354784
Trees =     500.00000, accuracy = 0.811494155033


Вроде с таким числом данных градиентный бустинг на 200 деревьях должен улучшить результат.

# Попытка 5

# Попытка 6

# Попытка 7

## 4) Отправляем на Kaggle

Этот раздел надо выполнять перед каждой посылкой.

Нам нужно выбрать модель и функцию transform.

Запустите одну из следующих ячеек:

In [341]:
# из Попытки 1

model = LogisticRegression()
def transform(X):
    """Attempt 1 features: a single ``'Sex'`` column, label-encoded.

    Re-fits the notebook-level ``label_enc`` on each call and keeps the
    index of ``X``.
    """
    sex_codes = label_enc.fit_transform(X['Sex'])
    return pd.DataFrame({'Sex': sex_codes}, index=X.index)

In [342]:
# из Попытки 2

model = GradientBoostingClassifier(n_estimators=100)
def transform(X):
    """Attempt 2 features: label-encoded sex and mean-imputed age.

    The notebook-level ``label_enc`` and ``mean_imputer`` are re-fitted on
    every call.  The result keeps the index of ``X``.
    """
    new_X = pd.DataFrame(index=X.index)
    new_X['Sex'] = label_enc.fit_transform(X['Sex'])
    # The imputer object in this notebook is named mean_imputer; a bare
    # ``imputer`` is never defined.  The imputer also needs a 2-D input, and
    # Series.reshape is not available in current pandas, so reshape the
    # underlying ndarray and flatten the result back to 1-D.
    age_column = X['Age'].values.reshape(-1, 1)
    new_X['Age'] = mean_imputer.fit_transform(age_column).ravel()
    return new_X

In [343]:
# из Попытки 3

model = GradientBoostingClassifier(n_estimators=100)
def transform(X):
    """Attempt 3 features: one-hot age category followed by label-encoded sex.

    Age is bucketed into 'Age: NaN' (missing), 'Age: child' (< 16),
    'Age: old' (> 60) and 'Age: adult' (16..60 inclusive).  Columns carry
    positional integer labels and the result keeps the index of ``X``.

    The notebook-level ``label_enc`` and ``oh_enc`` are re-fitted on every call.
    """
    sex = pd.DataFrame(label_enc.fit_transform(X['Sex']), index=X.index)

    # Start from an object-dtype Series so the string labels are never
    # written into a float column (newer pandas deprecates that upcast).
    raw_age = X['Age']
    age_categories = pd.Series('Age: adult', index=X.index, dtype=object)
    age_categories[raw_age.isnull()] = 'Age: NaN'
    age_categories[raw_age < 16.0] = 'Age: child'
    age_categories[raw_age > 60.0] = 'Age: old'
    age_codes = label_enc.fit_transform(age_categories.values).reshape(-1, 1)
    age = pd.DataFrame(oh_enc.fit_transform(age_codes), index=X.index)

    new_X = pd.concat([age, sex], axis=1)
    return new_X

In [379]:
# из Попытки 4

model = GradientBoostingClassifier(n_estimators=200)
def transform(X):
    """Attempt 4 features: every column except Ticket, SibSp and Parch.

    The pieces are concatenated in this order: one-hot age category,
    label-encoded sex, one-hot passenger class, number of words in the name,
    mean-imputed fare, two cabin-derived columns, one-hot port of
    embarkation.  Each piece keeps its own positional integer column labels,
    so labels repeat in the result.

    The notebook-level ``label_enc``, ``oh_enc`` and ``mean_imputer`` are
    re-fitted on every call.
    """
    sex = pd.DataFrame(label_enc.fit_transform(X['Sex']), index=X.index)

    # Bucket age into four string labels.  Start from an object-dtype Series
    # so the labels are never written into a float column (newer pandas
    # deprecates that upcast).
    raw_age = X['Age']
    age_categories = pd.Series('Age: adult', index=X.index, dtype=object)
    age_categories[raw_age.isnull()] = 'Age: NaN'
    age_categories[raw_age < 16.0] = 'Age: child'
    age_categories[raw_age > 60.0] = 'Age: old'
    age = pd.DataFrame(oh_enc.fit_transform(label_enc.fit_transform(age_categories.values).reshape(-1, 1)), index=X.index)

    pclass = pd.DataFrame(oh_enc.fit_transform(label_enc.fit_transform(X['Pclass']).reshape(-1, 1)), index=X.index)

    # Crude name feature: how many whitespace-separated words the name has.
    name = pd.DataFrame([len(full_name.split()) for full_name in X['Name']], index=X.index)

    # The imputer needs a 2-D input; Series.reshape is not available in
    # current pandas, so reshape the underlying ndarray.
    fare = pd.DataFrame(mean_imputer.fit_transform(X['Fare'].values.reshape(-1, 1)), index=X.index)

    # Column 0 is the cabin prefix, column 1 the cabin number (both None when absent).
    cabin = pd.DataFrame([extract_cabin_features(raw_cabin) for raw_cabin in X['Cabin']], index=X.index)
    # NOTE(review): the encoder yields one column per distinct prefix, yet the
    # result is assigned to a single column.  Confirm whether all one-hot
    # columns were intended to be kept.
    cabin[0] = oh_enc.fit_transform(label_enc.fit_transform(cabin[0]).reshape(-1, 1))
    cabin[1] = cabin[1].replace(np.nan, 0)

    embarked = pd.DataFrame(oh_enc.fit_transform(label_enc.fit_transform(X['Embarked']).reshape(-1, 1)), index=X.index)

    new_X = pd.concat([age, sex, pclass, name, fare, cabin, embarked], axis=1)
    return new_X

Сюда можно и нужно дописывать новые модели и преобразования данных:

In [380]:
# преобразовываем все данные
transformed_data = transform(pd.concat([X, test_data]))

In [381]:
# Split the jointly transformed table back into labelled and unlabelled rows.
# Select by the original indexes rather than assuming PassengerId runs 1..len(X).
transformed_X = transformed_data.loc[X.index]
transformed_test_data = transformed_data.loc[test_data.index]

In [382]:
# обучаем модель на всех данных
model.fit(transformed_X, y)
y_pred = model.predict(transformed_test_data)

In [383]:
# получаем итоговую таблицу
results = pd.DataFrame({'PassengerId': test_data.index, 'Survived': y_pred})

results.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0


In [384]:
results.to_csv('results.csv',  index=False) 

Идеи для новых попыток:

* Нагенерить полиномиальные и другие нелинейные признаки
* Отобрать ненужные фичи
* Попробовать побольше разных алгоритмов
* Научиться извлекать титул из имени
* Разобраться с зависимостью смертности от номера каюты, может быть там можно понять, на каком борту каюта? Можно поискать карту Титаника
* Добавить признаки SibSp и Parch