In [264]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [265]:
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
sub = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

In [266]:
def preprocess(df):
    df[["Deck", "Cabin_num", "Side"]] = df["Cabin"].str.split("/", expand=True)
    
    df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']] = df[['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num']].fillna(value=0)
    
    df['CryoSleep'] = df['CryoSleep'].astype('int')
    df['VIP'] = df['VIP'].astype('int')
    df['Cabin_num'] = df['Cabin_num'].astype('int')
    
    df[['HomePlanet', 'Destination', 'Deck', 'Side']] = df[['HomePlanet', 'Destination', 'Deck', 'Side']].fillna(value='')
        
    df['HomePlanet'] = df['HomePlanet'].astype('category')
    df['Destination'] = df['Destination'].astype('category')
    df['Deck'] = df['Deck'].astype('category')
    df['Side'] = df['Side'].astype('category')
    
    df = df.drop(['Cabin', 'PassengerId', 'Name'], axis=1)
    
    return df

In [267]:
df_train = preprocess(df)
df_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Cabin_num,Side
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [268]:
df_train.dtypes

HomePlanet      category
CryoSleep          int64
Destination     category
Age              float64
VIP                int64
RoomService      float64
FoodCourt        float64
ShoppingMall     float64
Spa              float64
VRDeck           float64
Transported         bool
Deck            category
Cabin_num          int64
Side            category
dtype: object

In [269]:
X_train, X_val, y_train, y_val = train_test_split(df_train.drop('Transported',axis=1), df_train['Transported'],test_size=0.2, random_state=42)

In [270]:
cat_features = np.where(X_train.dtypes == 'category')[0]

In [271]:
cat_features

array([ 0,  2, 10, 12])

In [272]:
X_train.dtypes

HomePlanet      category
CryoSleep          int64
Destination     category
Age              float64
VIP                int64
RoomService      float64
FoodCourt        float64
ShoppingMall     float64
Spa              float64
VRDeck           float64
Deck            category
Cabin_num          int64
Side            category
dtype: object

In [292]:
y_train.astype(float)

2333    0.0
2589    0.0
8302    1.0
8177    1.0
500     1.0
       ... 
5734    1.0
5191    0.0
5390    0.0
860     0.0
7270    0.0
Name: Transported, Length: 6954, dtype: float64

In [295]:
# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

In [296]:
best_params = {'iterations': 475,
 'learning_rate': 0.027583475549166746,
 'depth': 4,
 'l2_leaf_reg': 1.0551779964424746e-05,
 'bootstrap_type': 'Bayesian',
 'random_strength': 2.0931628460945333e-07,
 'bagging_temperature': 0.923385947687978,
 'od_type': 'Iter',
 'od_wait': 26}

In [297]:
# clf = CatBoostClassifier(iterations = study.best_params.get('iterations'),
#                         learning_rate = study.best_params.get('learning_rate'),
#                          depth = study.best_params.get('depth'),
#                          l2_leaf_reg = study.best_params.get('l2_leaf_reg'),
#                          bootstrap_type = study.best_params.get('bootstrap_type'),
#                          random_strength = study.best_params.get('random_strength'),
#                          bagging_temperature = study.best_params.get('bagging_temperature'),
#                          od_type = study.best_params.get('od_type'),
#                          od_wait = study.best_params.get('od_wait')
#                         )

clf = CatBoostClassifier(iterations = best_params.get('iterations'),
                        learning_rate = best_params.get('learning_rate'),
                         depth = best_params.get('depth'),
                         l2_leaf_reg = best_params.get('l2_leaf_reg'),
                         bootstrap_type = best_params.get('bootstrap_type'),
                         random_strength = best_params.get('random_strength'),
                         bagging_temperature = best_params.get('bagging_temperature'),
                         od_type = best_params.get('od_type'),
                         od_wait = best_params.get('od_wait')
                        )

clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val))

0:	learn: 0.6825575	test: 0.6826892	best: 0.6826892 (0)	total: 8.91ms	remaining: 4.22s
1:	learn: 0.6727851	test: 0.6729738	best: 0.6729738 (1)	total: 17.5ms	remaining: 4.14s
2:	learn: 0.6635616	test: 0.6639341	best: 0.6639341 (2)	total: 25.5ms	remaining: 4.02s
3:	learn: 0.6549345	test: 0.6552253	best: 0.6552253 (3)	total: 33.7ms	remaining: 3.97s
4:	learn: 0.6467422	test: 0.6472178	best: 0.6472178 (4)	total: 40.8ms	remaining: 3.83s
5:	learn: 0.6390790	test: 0.6399905	best: 0.6399905 (5)	total: 47.9ms	remaining: 3.74s
6:	learn: 0.6317135	test: 0.6327528	best: 0.6327528 (6)	total: 54.5ms	remaining: 3.65s
7:	learn: 0.6248274	test: 0.6261433	best: 0.6261433 (7)	total: 61.6ms	remaining: 3.6s
8:	learn: 0.6183642	test: 0.6198018	best: 0.6198018 (8)	total: 68.3ms	remaining: 3.54s
9:	learn: 0.6125638	test: 0.6146019	best: 0.6146019 (9)	total: 75.7ms	remaining: 3.52s
10:	learn: 0.6057870	test: 0.6083682	best: 0.6083682 (10)	total: 84.4ms	remaining: 3.56s
11:	learn: 0.6007411	test: 0.6035201	best:

<catboost.core.CatBoostClassifier at 0x7995be47e5f0>

In [300]:
ids = test['PassengerId']
X_test = preprocess(test)

In [301]:
preds = clf.predict(X_test)

In [302]:
type(ids)

pandas.core.series.Series

In [303]:
res_df = pd.concat([ids, pd.Series(preds)], axis=1)
res_df.columns = ['PassengerId', 'Transported']
res_df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [304]:
res_df.to_csv('submission.csv', index=False)