In [30]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from pycaret.classification import *

# Load the three input CSVs (paths are relative to the working directory):
# training rows, test rows and the submission template.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

def preprocess(df, is_train=True):
    """Engineer features on a raw passenger frame and return the result.

    The input frame is left untouched; all work happens on a copy, so the
    cell that calls this function can be re-run on the same raw frame.

    Steps:
      * ``TotalExp``: row-wise sum of the five spending columns. The sum is
        NaN whenever any of the five values is missing.
      * ``CabinDeck`` / ``CabinNum`` / ``CabinSide``: the three "/"-separated
        parts of ``Cabin``; ``CabinNum`` is converted to a numeric dtype.
      * ``PassengerGroup``: the part of ``PassengerId`` before "_", as int.
      * ``CryoSleep`` / ``VIP``: 1 where the value equals True, otherwise 0
        (missing values therefore become 0).
      * ``Transported`` is cast to int when ``is_train`` is true.
      * ``PassengerId``, ``Name``, ``Cabin`` and ``PassengerNum`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns referenced above.
    is_train : bool, default True
        Whether ``df`` contains the ``Transported`` column that must be cast.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns.
    """
    spend_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

    def flag_to_int(flags):
        # Comparing a missing value with True yields False (or NA for nullable
        # dtypes, which fillna turns into False), so missing maps to 0.
        return (flags == True).fillna(False).astype(int)

    # Copy first so new columns are never written into the caller's frame.
    df = df.copy()

    # skipna=False keeps the semantics of chained "+": any missing part -> NaN.
    df["TotalExp"] = df[spend_cols].sum(axis=1, skipna=False)
    df[["CabinDeck", "CabinNum", "CabinSide"]] = df["Cabin"].str.split("/", expand=True)
    # The split yields strings; make the cabin number genuinely numeric so it
    # can be imputed and scaled without relying on implicit string conversion.
    df["CabinNum"] = pd.to_numeric(df["CabinNum"])
    df[["PassengerGroup", "PassengerNum"]] = df["PassengerId"].str.split("_", expand=True)
    df["PassengerGroup"] = df["PassengerGroup"].astype(int)

    df["CryoSleep"] = flag_to_int(df["CryoSleep"])
    df["VIP"] = flag_to_int(df["VIP"])

    if is_train:
        df["Transported"] = df["Transported"].astype(int)

    return df.drop(["PassengerId", "Name", "Cabin", "PassengerNum"], axis=1)

# Apply the same feature engineering to both frames; is_train=False skips the
# Transported cast for the test frame.
train_df = preprocess(df=train_df, is_train=True)
test_df = preprocess(df=test_df, is_train=False)

# Preview the first engineered training rows.
train_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,TotalExp,CabinDeck,CabinNum,CabinSide,PassengerGroup
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,B,0,P,1
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,736.0,F,0,S,2
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,10383.0,A,0,S,3
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,5176.0,A,0,S,3
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,1091.0,F,1,S,4


In [31]:
# Columns filled with the training median.
numeric_cols = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "TotalExp",
    "CabinNum",
]
# Columns filled with the most frequent training value.
categorical_cols = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "VIP",
    "CabinDeck",
    "CabinSide",
]

# Every column gets its own imputer, fitted on the training frame only and then
# applied unchanged to the test frame.
for cols, strategy in ((numeric_cols, "median"), (categorical_cols, "most_frequent")):
    for col in cols:
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        train_df[col] = imputer.fit_transform(train_df[[col]].to_numpy())
        test_df[col] = imputer.transform(test_df[[col]].to_numpy())

In [32]:
# Standardise each numeric column using statistics fitted on the training frame only.
for numeric_col in numeric_cols:
    scaler = StandardScaler()
    # StandardScaler works on 2D input; ravel the (n, 1) result so a plain 1D
    # array is assigned back to the column.
    train_df[numeric_col] = scaler.fit_transform(train_df[[numeric_col]].to_numpy()).ravel()
    test_df[numeric_col] = scaler.transform(test_df[[numeric_col]].to_numpy()).ravel()

# Replace each category with an integer code learned from the training frame.
for categorical_col in categorical_cols:
    encoder = LabelEncoder()
    # LabelEncoder expects 1D input; an (n, 1) column vector is only accepted
    # with a DataConversionWarning, so pass the column as a 1D array.
    train_df[categorical_col] = encoder.fit_transform(train_df[categorical_col].to_numpy())
    # NOTE: transform raises ValueError if the test frame contains a category
    # that never occurs in the training frame.
    test_df[categorical_col] = encoder.transform(test_df[categorical_col].to_numpy())

train_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,TotalExp,CabinDeck,CabinNum,CabinSide,PassengerGroup
0,1,0,2,0.711945,0,-0.333105,-0.281027,-0.283579,-0.270626,-0.263003,0,-0.52048,1,-1.177238,0,1
1,0,0,2,-0.334037,0,-0.168073,-0.275387,-0.241771,0.217158,-0.224205,1,-0.248103,5,-1.177238,1,2
2,1,0,2,2.036857,1,-0.268001,1.959998,-0.283579,5.695623,-0.219796,0,3.322033,0,-1.177238,1,3
3,1,0,2,0.293552,0,-0.333105,0.52301,0.336851,2.687176,-0.092818,0,1.39504,0,-1.177238,1,3
4,0,0,2,-0.891895,0,0.125652,-0.237159,-0.031059,0.231374,-0.26124,1,-0.116726,5,-1.175264,1,4


In [33]:
from pycaret.classification import *

# Initialise the PyCaret experiment on the preprocessed training frame.
# session_id fixes the experiment's random seed so results are repeatable after
# a kernel restart; 2717 is the id reported in this cell's recorded output.
setup_clf = setup(
    data=train_df,
    target="Transported",
    feature_selection=True,
    fix_imbalance=True,
    session_id=2717,
)

Unnamed: 0,Description,Value
0,session_id,2717
1,Target,Transported
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(8693, 16)"
5,Missing Values,False
6,Numeric Features,9
7,Categorical Features,6
8,Ordinal Features,False
9,High Cardinality Features,False


In [34]:
# Evaluate PyCaret's candidate classifiers on the experiment initialised by
# setup(); the cell displays the ranked leaderboard and the top model's repr.
# The returned model is not stored in a variable here.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8128,0.907,0.8114,0.8123,0.8117,0.6256,0.6257,3.081
lightgbm,Light Gradient Boosting Machine,0.8061,0.9001,0.8008,0.8081,0.8043,0.6121,0.6123,0.028
gbc,Gradient Boosting Classifier,0.8036,0.8963,0.8232,0.7907,0.8065,0.6073,0.608,0.103
rf,Random Forest Classifier,0.7992,0.8876,0.7595,0.8235,0.7901,0.5982,0.6001,0.1
xgboost,Extreme Gradient Boosting,0.7988,0.8938,0.7862,0.805,0.7954,0.5976,0.598,0.101
et,Extra Trees Classifier,0.797,0.8738,0.7499,0.8265,0.7861,0.5939,0.5967,0.082
ada,Ada Boost Classifier,0.7944,0.8828,0.819,0.7793,0.7985,0.5889,0.5899,0.034
lr,Logistic Regression,0.7911,0.8771,0.8061,0.7812,0.7934,0.5823,0.5828,0.186
lda,Linear Discriminant Analysis,0.7712,0.8576,0.7106,0.8068,0.7555,0.5422,0.5462,0.01
ridge,Ridge Classifier,0.7711,0.0,0.7106,0.8065,0.7554,0.5419,0.5458,0.008


<catboost.core.CatBoostClassifier at 0x7f2f727181f0>

In [36]:
# Fit a CatBoost classifier through PyCaret; the cell output lists the
# per-fold cross-validation metrics.
catboost = create_model(estimator="catboost")

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7931,0.8895,0.802,0.7864,0.7941,0.5862,0.5864
1,0.8144,0.9125,0.8218,0.8084,0.8151,0.6289,0.629
2,0.8374,0.9212,0.8416,0.8333,0.8374,0.6749,0.6749
3,0.8227,0.9225,0.8185,0.8239,0.8212,0.6453,0.6453
4,0.8128,0.9018,0.8251,0.8039,0.8143,0.6257,0.6259
5,0.8191,0.9021,0.8079,0.8243,0.8161,0.6381,0.6382
6,0.8092,0.9013,0.8113,0.8059,0.8086,0.6184,0.6184
7,0.8043,0.9005,0.7881,0.8123,0.8,0.6085,0.6087
8,0.8026,0.9122,0.7954,0.806,0.8007,0.6052,0.6053
9,0.8125,0.9064,0.802,0.8182,0.81,0.625,0.6251


In [37]:
# Hyper-parameter search for the CatBoost model, with the search's n_iter
# raised to 300.
tuned_model = tune_model(estimator=catboost, n_iter=300)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.803,0.886,0.8053,0.8,0.8026,0.6059,0.6059
1,0.8095,0.9088,0.8152,0.8046,0.8098,0.6191,0.6191
2,0.8391,0.9171,0.8251,0.8475,0.8361,0.6781,0.6783
3,0.8325,0.9206,0.8218,0.8384,0.83,0.665,0.6651
4,0.8128,0.8981,0.8185,0.8078,0.8131,0.6256,0.6257
5,0.8207,0.9031,0.8046,0.8294,0.8168,0.6414,0.6416
6,0.8092,0.9029,0.8079,0.8079,0.8079,0.6184,0.6184
7,0.8158,0.9039,0.7881,0.8322,0.8095,0.6314,0.6323
8,0.8076,0.9109,0.7822,0.8229,0.802,0.6151,0.6158
9,0.8092,0.9048,0.7921,0.8191,0.8054,0.6184,0.6187


In [38]:
# Wrap the tuned model in a PyCaret ensemble of 500 estimators. No method is
# passed, so PyCaret's default ensembling method applies (bagging, per the
# PyCaret documentation).
bagging_model = ensemble_model(estimator=tuned_model, n_estimators=500)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.798,0.8851,0.7987,0.7961,0.7974,0.5961,0.5961
1,0.8062,0.9104,0.8119,0.8013,0.8066,0.6125,0.6125
2,0.8358,0.9184,0.8185,0.8464,0.8322,0.6715,0.6719
3,0.8342,0.9205,0.8251,0.8389,0.8319,0.6683,0.6684
4,0.8161,0.9011,0.8185,0.8131,0.8158,0.6322,0.6322
5,0.8158,0.9012,0.798,0.8253,0.8114,0.6315,0.6318
6,0.8109,0.8994,0.8113,0.8086,0.8099,0.6217,0.6217
7,0.8125,0.9046,0.7781,0.8333,0.8048,0.6248,0.6262
8,0.8043,0.9117,0.7921,0.8108,0.8013,0.6085,0.6087
9,0.8109,0.9063,0.7888,0.8241,0.8061,0.6217,0.6222


In [39]:
# No data argument is given, so predict_model scores the experiment's own
# hold-out rows; the cell shows the metrics plus the per-row Label and Score.
predict_model(estimator=bagging_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.8056,0.8968,0.792,0.8256,0.8085,0.6113,0.6118


Unnamed: 0,CabinDeck_0,TotalExp,ShoppingMall,Destination_2,PassengerGroup,FoodCourt,CryoSleep_1,CabinDeck_1,Spa,HomePlanet_2,...,Destination_1,CabinNum,HomePlanet_1,CabinSide_1,RoomService,VIP_1,Destination_0,Transported,Label,Score
0,0.0,-0.520480,-0.283579,1.0,3292.0,-0.281027,0.0,1.0,-0.270626,0.0,...,0.0,-0.938395,1.0,1.0,-0.333105,1.0,0.0,1,1,0.7556
1,0.0,-0.207025,1.132874,0.0,4938.0,-0.281027,0.0,0.0,-0.270626,0.0,...,0.0,0.409786,0.0,1.0,-0.333105,0.0,1.0,1,1,0.9298
2,0.0,-0.520480,-0.283579,0.0,3605.0,-0.281027,1.0,0.0,-0.270626,0.0,...,0.0,-0.008683,0.0,0.0,-0.333105,0.0,1.0,1,1,0.7342
3,0.0,0.221525,-0.056144,1.0,444.0,-0.281027,0.0,0.0,-0.270626,0.0,...,0.0,-0.981821,0.0,0.0,2.496651,1.0,0.0,0,0,0.9325
4,0.0,1.126734,6.335452,1.0,7904.0,-0.276013,0.0,0.0,-0.270626,0.0,...,0.0,1.343445,0.0,0.0,0.401209,0.0,0.0,1,0,0.5473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2603,0.0,-0.520480,-0.283579,0.0,2881.0,-0.281027,0.0,0.0,-0.270626,0.0,...,0.0,-0.275161,0.0,1.0,-0.333105,0.0,1.0,0,0,0.6429
2604,1.0,0.267415,-0.283579,1.0,3781.0,0.320591,0.0,0.0,-0.246637,0.0,...,0.0,-1.090386,1.0,1.0,0.909929,0.0,0.0,0,0,0.7759
2605,1.0,4.497029,-0.206652,0.0,2506.0,1.289445,0.0,0.0,9.503716,0.0,...,0.0,-1.139734,0.0,0.0,-0.333105,0.0,1.0,0,0,0.9431
2606,0.0,-0.227379,-0.283579,0.0,5027.0,0.180840,0.0,0.0,-0.225313,0.0,...,0.0,-0.543613,0.0,1.0,-0.327048,0.0,1.0,0,1,0.7064


In [40]:
# Produce the final model from the ensemble before predicting on the test
# frame (per the PyCaret documentation, finalize_model refits on the full
# dataset, hold-out rows included).
final_model = finalize_model(estimator=bagging_model)

In [41]:
# Score the preprocessed test frame with the finalized model and keep only the
# predicted labels as an array.
pred = predict_model(estimator=final_model, data=test_df)["Label"].values

In [42]:
# Map each predicted label to a boolean (1 -> True, anything else -> False),
# store it in the submission template and write the template to disk.
submission_df.loc[:, "Transported"] = [bool(label == 1) for label in pred]
submission_df.to_csv("catboost_bagging_submission.csv", index=False)