In [356]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [357]:
df = pd.read_csv("titanic.csv")

# Полундра! Космический корабль Титаник (какая ирония), столкнулся с пространственно-временной аномалией. Часть пассажиров была телепортирована в другое измерение.

Ваша задача - построить модель (любую, можете посоревноваться промеж себя кто круче), которая бы предсказывала судьбу пассажира, основываясь на данных, извлечённых с бортового компьютера космического корабля. И проверить адекватность модели при помощи k-fold кросс-валидации.


**PassengerId** - ID пассажира в формате gggg_pp; Где gggg - номер группы, pp - индекс пассажира.
**HomePlanet** - Родная планета пассажира
**CryoSleep** - Показывает, находился ли пассажир в криокамере
**Cabin** - Кабина пассажира в формате палуба / номер / сторона
**Destination** - Планета, на которую пассажир направлялся
**Age** - Возраст
**VIP** - Показывает, был ли пассажир блатным
**RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - количество кредитов, потраченных на соответствующий сервис на корабле
**Name** - Имя пассажира
**Transported** - Целевой признак.

In [358]:
# Посмотрим на перемешанный df
df.sample(frac=1, ignore_index=True)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,2411_01,Earth,False,F/493/P,TRAPPIST-1e,29.0,False,19.0,2115.0,17.0,,0.0,Brinez Howenters,True
1,1584_01,Earth,False,F/314/P,TRAPPIST-1e,57.0,False,433.0,0.0,0.0,371.0,0.0,Paulia Thony,False
2,3854_01,Earth,True,G/632/P,TRAPPIST-1e,33.0,False,0.0,0.0,0.0,0.0,0.0,Racey Carsoning,True
3,1000_01,Mars,False,D/39/P,,18.0,False,885.0,0.0,32.0,0.0,0.0,Alus Harte,False
4,3149_01,Earth,False,G/494/S,TRAPPIST-1e,18.0,False,0.0,730.0,79.0,0.0,,Natha Portananney,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,3444_04,Europa,False,B/131/S,TRAPPIST-1e,7.0,False,0.0,0.0,0.0,0.0,0.0,Wezena Scerodbox,True
8689,8264_01,Mars,False,F/1584/S,TRAPPIST-1e,21.0,False,32.0,0.0,1008.0,0.0,0.0,Lifes Ste,True
8690,4791_01,Earth,False,G/781/S,TRAPPIST-1e,43.0,False,63.0,1339.0,9.0,253.0,42.0,Louisy Morrodgers,True
8691,0504_04,Europa,False,B/19/S,TRAPPIST-1e,64.0,False,0.0,1737.0,11.0,401.0,752.0,Tauria Unpasine,False


In [359]:
# Посмотрим на пустые значения
df.isna().sum(axis=0)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [360]:
# Посмотрим на все планеты
df.HomePlanet.value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [361]:
# А теперь на пути назначения
df.Destination.value_counts()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

In [362]:
# С кабинами все немного сложнее. Так как они в одной строке, нужно сначала провести с ними хитрую вещь: мы получаем массив массивов сплитнутых строк,
# А затем переводим его в numpy array с shape=2 и транспонируем, а потом просто создаем новые столбцы, которые можно анализировать

df_split_cabin = df.dropna(subset=["Cabin"])

fn_split_room = lambda row: np.array(row['Cabin'].split("/"))

cabin_data = np.stack(df_split_cabin.apply(fn_split_room, axis=1))
decks, numbers, sides = cabin_data.T
df_split_cabin["deck"] = decks
df_split_cabin["room_number"] = numbers # этот столбец нам не нужен
df_split_cabin["room_side"] = sides
df = df_split_cabin


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_split_cabin["deck"] = decks
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_split_cabin["room_number"] = numbers # этот столбец нам не нужен
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_split_cabin["room_side"] = sides


In [363]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,deck,room_number,room_side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,98,P
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,1499,S
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,1500,S
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,608,S


In [364]:
# Количество людей по палубам
df.deck.value_counts()

F    2794
G    2559
E     876
B     779
C     747
D     478
A     256
T       5
Name: deck, dtype: int64

In [365]:
# Количество людей по сторонам
df.room_side.value_counts()

S    4288
P    4206
Name: room_side, dtype: int64

In [366]:
df.deck.astype('str')

0       B
1       F
2       A
3       A
4       F
       ..
8688    A
8689    G
8690    G
8691    E
8692    E
Name: deck, Length: 8494, dtype: object

In [367]:
# Начинаем готовить данные для модели. NaN не обрабатываем, тк уже дропнули их
# Делаем соответствие палуб номерам. Палуба T - командирская, а остальные уже по классам
map_dict = {'T': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
df.deck = df.deck.astype('str').map(map_dict)


In [368]:
df.deck.value_counts()

6    2794
7    2559
5     876
2     779
3     747
4     478
1     256
0       5
Name: deck, dtype: int64

In [369]:
# Меняем бинарные признаки на 0 и 1.
df.room_side = df.room_side.map({"P": 0, "S":1})
df.CryoSleep = df.CryoSleep.map({False: 0, True: 1})
df.VIP = df.VIP.map({False: 0, True: 1})
df.Transported = df.Transported.map({False: 0, True: 1})

In [370]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,deck,room_number,room_side
0,0001_01,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,2,0,0
1,0002_01,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,6,0,1
2,0003_01,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,1,0,1
3,0003_02,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,1,0,1
4,0004_01,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,6,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0.0,A/98/P,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,1,98,0
8689,9278_01,Earth,1.0,G/1499/S,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,7,1499,1
8690,9279_01,Earth,0.0,G/1500/S,TRAPPIST-1e,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,7,1500,1
8691,9280_01,Europa,0.0,E/608/S,55 Cancri e,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,5,608,1


In [371]:
# Теперь необходимо разобраться с категориальными признаками. Привет, OneHotEncoder
# Закодируем в OneHotEncode столбец HomePlanet
dummies = pd.get_dummies(df.HomePlanet, prefix="hp")
dummies

Unnamed: 0,hp_Earth,hp_Europa,hp_Mars
0,0,1,0
1,1,0,0
2,0,1,0
3,0,1,0
4,1,0,0
...,...,...,...
8688,0,1,0
8689,1,0,0
8690,1,0,0
8691,0,1,0


In [372]:
# Пример увидели, можно присоединять к основному датафрейму
df = df.join(dummies, how='right')
# Второй раз запускать не нужно, так как колонки будут накладываться и будет вылетать Эксепшн

In [373]:
# Делаем то же самое, но уже с Destination
dummies_dest = pd.get_dummies(df.Destination, prefix="dest")
df = df.join(dummies_dest, how='right')
# Здесь тоже второй раз запускать не нужно, так как колонки будут накладываться и будет вылетать Эксепшн

In [374]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Transported,deck,room_number,room_side,hp_Earth,hp_Europa,hp_Mars,dest_55 Cancri e,dest_PSO J318.5-22,dest_TRAPPIST-1e
0,0001_01,Europa,0.0,B/0/P,TRAPPIST-1e,39.0,0.0,0.0,0.0,0.0,...,0,2,0,0,0,1,0,0,0,1
1,0002_01,Earth,0.0,F/0/S,TRAPPIST-1e,24.0,0.0,109.0,9.0,25.0,...,1,6,0,1,1,0,0,0,0,1
2,0003_01,Europa,0.0,A/0/S,TRAPPIST-1e,58.0,1.0,43.0,3576.0,0.0,...,0,1,0,1,0,1,0,0,0,1
3,0003_02,Europa,0.0,A/0/S,TRAPPIST-1e,33.0,0.0,0.0,1283.0,371.0,...,0,1,0,1,0,1,0,0,0,1
4,0004_01,Earth,0.0,F/1/S,TRAPPIST-1e,16.0,0.0,303.0,70.0,151.0,...,1,6,1,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0.0,A/98/P,55 Cancri e,41.0,1.0,0.0,6819.0,0.0,...,0,1,98,0,0,1,0,1,0,0
8689,9278_01,Earth,1.0,G/1499/S,PSO J318.5-22,18.0,0.0,0.0,0.0,0.0,...,0,7,1499,1,1,0,0,0,1,0
8690,9279_01,Earth,0.0,G/1500/S,TRAPPIST-1e,26.0,0.0,0.0,0.0,1872.0,...,1,7,1500,1,1,0,0,0,0,1
8691,9280_01,Europa,0.0,E/608/S,55 Cancri e,32.0,0.0,0.0,1049.0,0.0,...,0,5,608,1,0,1,0,1,0,0


In [375]:
# Методом пристального взгляда мы понимаем, что следующие колонки модели погоду не сделают
dropcols = "PassengerId Name Cabin Destination HomePlanet room_number".split()
df = df.drop(dropcols, axis=1)

In [376]:
df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,deck,room_side,hp_Earth,hp_Europa,hp_Mars,dest_55 Cancri e,dest_PSO J318.5-22,dest_TRAPPIST-1e
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2,0,0,1,0,0,0,1
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1,6,1,1,0,0,0,0,1
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0,1,1,0,1,0,0,0,1
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0,1,1,0,1,0,0,0,1
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1,6,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,0,1,0,0,1,0,1,0,0
8689,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0,7,1,1,0,0,0,1,0
8690,0.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,1,7,1,1,0,0,0,0,1
8691,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,0,5,1,0,1,0,1,0,0


In [377]:
df.columns.values

array(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'Transported', 'deck',
       'room_side', 'hp_Earth', 'hp_Europa', 'hp_Mars',
       'dest_55 Cancri e', 'dest_PSO J318.5-22', 'dest_TRAPPIST-1e'],
      dtype=object)

In [378]:
# Теперь перенесём целевой признак в самый конец датафрейма, чтобы дальше было проще с ним работать
cols = list(df.columns.values)
cols.pop(cols.index("Transported"))
df = df[cols+["Transported"]]
df.head(3)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,room_side,hp_Earth,hp_Europa,hp_Mars,dest_55 Cancri e,dest_PSO J318.5-22,dest_TRAPPIST-1e,Transported
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,0,1,0,0,0,1,0
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,6,1,1,0,0,0,0,1,1
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,1,1,0,1,0,0,0,1,0


In [379]:
# Данных много, дропнем все наны
df = df.dropna().reset_index(drop=True)

In [380]:
from sklearn.tree import DecisionTreeClassifier
from random import randint

model_tree = DecisionTreeClassifier(max_depth=8)
accuracies_tree = []

kfold = KFold(n_splits=10, shuffle=True, random_state=randint(1, 100))
for index_train, index_test in kfold.split(df):
    x_train, y_train = df.iloc[index_train].drop(["Transported"], axis=1), df.iloc[index_train].Transported
    x_test, y_test = df.iloc[index_test].drop(["Transported"], axis=1), df.iloc[index_test].Transported
    model_tree.fit(x_train, y_train)
    accuracies_tree.append(np.mean(model_tree.predict(x_test) == y_test))

"Доля правильных ответов", np.mean(accuracies_tree)

('Доля правильных ответов', 0.7936211581522475)

In [381]:
# Попробуем то же самое только на CatBoost
from catboost import CatBoostClassifier
from random import randint

model_catboost = CatBoostClassifier()
accuracies_catboost = []


kfold = KFold(n_splits=10, shuffle=True, random_state=randint(1, 100))
for index_train, index_test in kfold.split(df):
    x_train, y_train = df.iloc[index_train].drop(["Transported"], axis=1), df.iloc[index_train].Transported
    x_test, y_test = df.iloc[index_test].drop(["Transported"], axis=1), df.iloc[index_test].Transported
    model_catboost.fit(x_train, y_train)
    accuracies_catboost.append(np.mean(model_catboost.predict(x_test) == y_test))
"Доля правильных ответов", np.mean(accuracies_catboost)

Learning rate set to 0.022722
0:	learn: 0.6826675	total: 2.41ms	remaining: 2.4s
1:	learn: 0.6721412	total: 4.82ms	remaining: 2.4s
2:	learn: 0.6611522	total: 7.02ms	remaining: 2.33s
3:	learn: 0.6510462	total: 9.25ms	remaining: 2.3s
4:	learn: 0.6418877	total: 11.6ms	remaining: 2.31s
5:	learn: 0.6326620	total: 13.9ms	remaining: 2.3s
6:	learn: 0.6253958	total: 16.4ms	remaining: 2.33s
7:	learn: 0.6183327	total: 18.9ms	remaining: 2.35s
8:	learn: 0.6110140	total: 21.5ms	remaining: 2.37s
9:	learn: 0.6034430	total: 23.9ms	remaining: 2.36s
10:	learn: 0.5970502	total: 26.3ms	remaining: 2.37s
11:	learn: 0.5907226	total: 29ms	remaining: 2.39s
12:	learn: 0.5844087	total: 31.8ms	remaining: 2.42s
13:	learn: 0.5781527	total: 34.5ms	remaining: 2.43s
14:	learn: 0.5731679	total: 36.9ms	remaining: 2.42s
15:	learn: 0.5681403	total: 39.3ms	remaining: 2.42s
16:	learn: 0.5626644	total: 41.7ms	remaining: 2.41s
17:	learn: 0.5581095	total: 44.1ms	remaining: 2.4s
18:	learn: 0.5539742	total: 46.4ms	remaining: 2.4s


('Доля правильных ответов', 0.8099979680141521)