In [1]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV

In [32]:
spaceship_train = pd.read_csv('data/train.csv')
spaceship_test = pd.read_csv('data/test.csv')

In [33]:
spaceship_train.drop(['Name'],axis=1,inplace=True)
spaceship_test.drop(['Name'],axis=1,inplace=True)

In [34]:
# separate Cabin to deck/num/side
spaceship_train['Deck'] = spaceship_train.Cabin.str.split('/').str[0] 
spaceship_train['Side'] = spaceship_train.Cabin.str.split('/').str[2] # Port or Starboard
spaceship_train.drop('Cabin', axis=1, inplace=True)

spaceship_test['Deck'] = spaceship_test.Cabin.str.split('/').str[0]
spaceship_test['Side'] = spaceship_test.Cabin.str.split('/').str[2]
spaceship_test.drop('Cabin', axis=1, inplace=True)

In [35]:
spaceship_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S


In [36]:
spaceship_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,0013_01,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,G,S
1,0018_01,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,F,S
2,0019_01,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,C,S
3,0021_01,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,C,S
4,0023_01,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,F,S


In [37]:
spaceship_train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Deck            199
Side            199
dtype: int64

In [38]:
spaceship_test.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Deck            100
Side            100
dtype: int64

In [39]:
spaceship_train.fillna({
    "CryoSleep": 0, "Age": 0, "VIP": 0, "RoomService": 0, 
    "FoodCourt": 0, "ShoppingMall": 0, "Spa": 0, "VRDeck": 0, 
    "Deck" : "Unknown", "Side" : "Unknown",
    "Destination": "Unknown Destination", "HomePlanet": "Unknown HomePlanet",
}, inplace=True)

spaceship_test.fillna({
    "CryoSleep": 0, "Age": 0, "VIP": 0, "RoomService": 0, 
    "FoodCourt": 0, "ShoppingMall": 0, "Spa": 0, "VRDeck": 0, 
    "Deck" : "Unknown", "Side" : "Unknown",
    "Destination": "Unknown Destination", "HomePlanet": "Unknown HomePlanet",
}, inplace=True)

In [40]:
# check for missing values
spaceship_train.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Deck            0
Side            0
dtype: int64

In [41]:
spaceship_test.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Deck            0
Side            0
dtype: int64

In [42]:
spaceship_train_categorial = pd.get_dummies(spaceship_train[['Destination', 'HomePlanet', 'Deck', 'Side']])
spaceship_test_categorial = pd.get_dummies(spaceship_test[['Destination', 'HomePlanet', 'Deck', 'Side']])

In [43]:
spaceship_train.drop(['Destination', 'HomePlanet', 'Deck', 'Side'], axis=1, inplace=True)
spaceship_train = pd.concat([spaceship_train, spaceship_train_categorial], axis=1)

spaceship_test.drop(['Destination', 'HomePlanet', 'Deck', 'Side'], axis=1, inplace=True)
spaceship_test = pd.concat([spaceship_test, spaceship_test_categorial], axis=1)

In [44]:
spaceship_train.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Side_P,Side_S,Side_Unknown
0,0001_01,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,...,0,0,0,0,0,0,0,1,0,0
1,0002_01,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,...,0,0,0,1,0,0,0,0,1,0
2,0003_01,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,...,0,0,0,0,0,0,0,0,1,0
3,0003_02,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,...,0,0,0,0,0,0,0,0,1,0
4,0004_01,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,...,0,0,0,1,0,0,0,0,1,0


In [45]:
spaceship_test.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Destination_55 Cancri e,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_Unknown,Side_P,Side_S,Side_Unknown
0,0013_01,True,27.0,False,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,1,0,0,0,1,0
1,0018_01,False,19.0,False,0.0,9.0,0.0,2823.0,0.0,0,...,0,0,0,1,0,0,0,0,1,0
2,0019_01,True,31.0,False,0.0,0.0,0.0,0.0,0.0,1,...,1,0,0,0,0,0,0,0,1,0
3,0021_01,False,38.0,False,0.0,6652.0,0.0,181.0,585.0,0,...,1,0,0,0,0,0,0,0,1,0
4,0023_01,False,20.0,False,10.0,0.0,635.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,1,0


In [48]:
print(f"Train shape: {spaceship_train.shape}\nTest shape: {spaceship_test.shape}")

Train shape: (8693, 30)
Test shape: (4277, 29)


In [49]:
# separate data to features and labels
X_train = spaceship_train.drop(['PassengerId', 'Transported'], axis=1)
y_train = spaceship_train['Transported']
X_test = spaceship_test.drop('PassengerId', axis=1)

In [50]:
# normalize the data
transformer = Normalizer().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

In [24]:
# finding best parameters for CatBoost model
params = {
    'depth': [4, 7, 10],
    'learning_rate' : [0.03, 0.1, 0.15],
    'l2_leaf_reg': [1, 4, 9],
    'iterations': [300]
}

cb = CatBoostClassifier(task_type="GPU", devices='0:1')
grid_cb = GridSearchCV(cb, params, scoring="roc_auc", cv=3)
grid_cb.fit(X_train, y_train)
grid_cb.best_params_

0:	learn: 0.6794738	total: 8.15ms	remaining: 2.44s
1:	learn: 0.6681597	total: 14.9ms	remaining: 2.22s
2:	learn: 0.6574573	total: 22.1ms	remaining: 2.19s
3:	learn: 0.6450218	total: 28.3ms	remaining: 2.09s
4:	learn: 0.6343836	total: 34.5ms	remaining: 2.03s
5:	learn: 0.6243331	total: 41.8ms	remaining: 2.05s
6:	learn: 0.6146994	total: 48ms	remaining: 2.01s
7:	learn: 0.6064118	total: 55.6ms	remaining: 2.03s
8:	learn: 0.5982613	total: 61.7ms	remaining: 2s
9:	learn: 0.5904950	total: 68.2ms	remaining: 1.98s
10:	learn: 0.5829762	total: 75ms	remaining: 1.97s
11:	learn: 0.5749870	total: 81.9ms	remaining: 1.97s
12:	learn: 0.5681802	total: 88.3ms	remaining: 1.95s
13:	learn: 0.5617080	total: 94.9ms	remaining: 1.94s
14:	learn: 0.5562642	total: 101ms	remaining: 1.93s
15:	learn: 0.5507152	total: 108ms	remaining: 1.92s
16:	learn: 0.5453445	total: 115ms	remaining: 1.91s
17:	learn: 0.5397843	total: 122ms	remaining: 1.91s
18:	learn: 0.5355326	total: 129ms	remaining: 1.9s
19:	learn: 0.5315726	total: 136ms	r

{'depth': 7, 'iterations': 300, 'l2_leaf_reg': 4, 'learning_rate': 0.03}

In [28]:
# evaluating model with best parameters
model = CatBoostClassifier(iterations=300, task_type="GPU", devices='0:1', loss_function="Logloss", depth=7, learning_rate=0.03, l2_leaf_reg=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
model.eval_metrics(Pool(X_train, y_train), metrics=['Logloss', 'AUC', 'F1'], plot=True)

0:	learn: 0.6744701	total: 12.1ms	remaining: 3.62s
1:	learn: 0.6577658	total: 23ms	remaining: 3.42s
2:	learn: 0.6429110	total: 31.8ms	remaining: 3.15s
3:	learn: 0.6267789	total: 42.3ms	remaining: 3.13s
4:	learn: 0.6144781	total: 51.3ms	remaining: 3.02s
5:	learn: 0.6017981	total: 61.6ms	remaining: 3.02s
6:	learn: 0.5901883	total: 72.1ms	remaining: 3.02s
7:	learn: 0.5807542	total: 81.3ms	remaining: 2.97s
8:	learn: 0.5715498	total: 91.7ms	remaining: 2.97s
9:	learn: 0.5616466	total: 101ms	remaining: 2.94s
10:	learn: 0.5540487	total: 113ms	remaining: 2.97s
11:	learn: 0.5468898	total: 124ms	remaining: 2.97s
12:	learn: 0.5390275	total: 134ms	remaining: 2.96s
13:	learn: 0.5316420	total: 144ms	remaining: 2.94s
14:	learn: 0.5248135	total: 155ms	remaining: 2.94s
15:	learn: 0.5181813	total: 164ms	remaining: 2.91s
16:	learn: 0.5109093	total: 174ms	remaining: 2.9s
17:	learn: 0.5062234	total: 183ms	remaining: 2.87s
18:	learn: 0.5014083	total: 193ms	remaining: 2.86s
19:	learn: 0.4963508	total: 204ms	r

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

{'Logloss': [0.6744701478862767,
  0.6577657857260891,
  0.6429109762532796,
  0.6267788844407296,
  0.6144781327882913,
  0.6017981495248764,
  0.590188313404911,
  0.5807541120832481,
  0.5715497582085006,
  0.5616465075310565,
  0.5540487551747594,
  0.5468897945206505,
  0.5390274972853408,
  0.531641973582535,
  0.5248135821232579,
  0.518181241781232,
  0.5109093355402045,
  0.5062234333323484,
  0.5014082831413882,
  0.4963507977132316,
  0.4914047899528135,
  0.48709016191038884,
  0.4832164271260603,
  0.4795768182000059,
  0.47651551732631875,
  0.47343592614698354,
  0.4703990008386341,
  0.4672571083314746,
  0.46368496663932535,
  0.46097642761408836,
  0.45783562209207146,
  0.4558237115357827,
  0.45380917682160465,
  0.45161019352756,
  0.4497137769494775,
  0.4477708542802555,
  0.4454413967455964,
  0.44330191992286044,
  0.4411098005381044,
  0.43969865365599875,
  0.4378514890418156,
  0.4361407922439794,
  0.4348625480983818,
  0.433170178897321,
  0.43134647731104

In [None]:
# predict on test data
test_y_pred = model.predict(X_test)

In [None]:
#put prediction to submission file
submission = pd.DataFrame({
        "PassengerId": spaceship_test["PassengerId"],
        "Transported": test_y_pred
    })
submission.to_csv('catboost_submission.csv', index=False)