In [7]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.preprocessing import *

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import *

pd.set_option("display.max_columns", None)

import joblib

SEED = 42
n_splits = 8

load and preprocess data

In [8]:
sample_sub = pd.read_csv("../../../data/ml_01_1.csv")

test = pd.read_csv("../../../data/ml_01_2.csv")
train = pd.read_csv("../../../data/ml_01_3.csv")

train.drop(columns=["PassengerId"], inplace=True)
test.drop(columns=["PassengerId"], inplace=True)


def fe(df: pd.DataFrame):
    df["TotalServiceSpend"] = (
        df["RoomService"]
        + df["FoodCourt"]
        + df["ShoppingMall"]
        + df["Spa"]
        + df["VRDeck"]
    )
    df["ServiceSpendPerAge"] = df["TotalServiceSpend"] / (df["Age"] + 1)
    df[["CabinDeck", "CabinNum", "CabinSide"]] = df["Cabin"].str.split(
        "/", expand=True
    )
    df["LastName"] = df["Name"].str.split().str[-1]
    df["IsFamily"] = df.groupby("LastName")["Name"].transform("size") > 1
    df["AgeGroup"] = pd.cut(
        df["Age"],
        bins=[0, 18, 30, 50, 80],
        labels=["Child", "YoungAdult", "Adult", "Senior"],
    )
    df["TravelAlone"] = (
        df["Cabin"].duplicated(keep=False) | df["IsFamily"]
    ).astype(int)
    df["CryoSleepAndSpent"] = (
        df["CryoSleep"] & (df["TotalServiceSpend"] > 0)
    ).astype(int)
    df["VIPSpendMultiplier"] = df["VIP"] * df["TotalServiceSpend"]

    return df


train = fe(train)
test = fe(test)


def impute(df: pd.DataFrame):
    categorical_cols = [
        "HomePlanet",
        "CryoSleep",
        "Cabin",
        "Destination",
        "AgeGroup",
        "CabinDeck",
    ]
    for col in categorical_cols:
        if col in df.columns and df[col].dtype.name == "category":
            if "NoInformation" not in df[col].cat.categories:
                df[col] = df[col].cat.add_categories("NoInformation")
            df[col].fillna("NoInformation", inplace=True)
        else:
            df[col].fillna("NoInformation", inplace=True)

    vip_threshold = df["VIPSpendMultiplier"].quantile(0.99)
    df["VIP"].fillna(df["VIPSpendMultiplier"] > vip_threshold, inplace=True)
    df["Name"].fillna("noName", inplace=True)
    df["LastName"].fillna("noName", inplace=True)
    df["CabinSide"].fillna(df["CabinSide"].mode()[0], inplace=True)

    return df


train = impute(train)
test = impute(test)

num_c = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "TotalServiceSpend",
    "ServiceSpendPerAge",
    "CabinNum",
    "VIPSpendMultiplier",
]

I = IterativeImputer(
    random_state=0,
    n_nearest_features=None,
    initial_strategy="mean",
)
train[num_c] = I.fit_transform(train[num_c])
test[num_c] = I.transform(test[num_c])


def update(df: pd.DataFrame):
    cat_c = [
        "HomePlanet",
        "CryoSleep",
        "Cabin",
        "Destination",
        "Name",
        "CabinDeck",
        "CabinSide",
        "LastName",
        "VIP",
        "IsFamily",
        "AgeGroup",
    ]

    for col in cat_c:
        df[col] = df[col].astype("category")

    return df


train = update(train)
test = update(test)


train.drop(["Name", "Cabin", "LastName"], axis=1, inplace=True)
test.drop(["Name", "Cabin", "LastName"], axis=1, inplace=True)

cols_encode = [
    "HomePlanet",
    "CryoSleep",
    "Destination",
    "CabinDeck",
    "CabinSide",
    "VIP",
    "IsFamily",
    "AgeGroup",
]

train = pd.get_dummies(train, columns=cols_encode).astype(int)
test = pd.get_dummies(test, columns=cols_encode).astype(int)

all_cols = train.columns
cols_Scale = all_cols.drop("Transported")

Sc = StandardScaler()
train[cols_Scale] = Sc.fit_transform(train[cols_Scale])
test[cols_Scale] = Sc.transform(test[cols_Scale])

final_cols = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "TotalServiceSpend",
    "ServiceSpendPerAge",
    "CabinNum",
    "TravelAlone",
    "CryoSleepAndSpent",
    "VIPSpendMultiplier",
    "HomePlanet_Earth",
    "HomePlanet_Europa",
    "HomePlanet_Mars",
    "HomePlanet_NoInformation",
    "CryoSleep_False",
    "CryoSleep_True",
    "CryoSleep_NoInformation",
    "Destination_55 Cancri e",
    "Destination_NoInformation",
    "Destination_PSO J318.5-22",
    "Destination_TRAPPIST-1e",
    "CabinDeck_A",
    "CabinDeck_B",
    "CabinDeck_C",
    "CabinDeck_D",
    "CabinDeck_E",
    "CabinDeck_F",
    "CabinDeck_G",
    "CabinDeck_NoInformation",
    "CabinDeck_T",
    "CabinSide_P",
    "CabinSide_S",
    "VIP_False",
    "VIP_True",
    "IsFamily_False",
    "IsFamily_True",
    "AgeGroup_Child",
    "AgeGroup_YoungAdult",
    "AgeGroup_Adult",
    "AgeGroup_Senior",
    "AgeGroup_NoInformation",
    "Transported",
]

train = train[final_cols]

train_N = train.copy()
train_N.drop_duplicates(inplace=True)

In [9]:
train.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalServiceSpend,ServiceSpendPerAge,CabinNum,TravelAlone,CryoSleepAndSpent,VIPSpendMultiplier,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_NoInformation,CryoSleep_False,CryoSleep_True,CryoSleep_NoInformation,Destination_55 Cancri e,Destination_NoInformation,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_NoInformation,CabinDeck_T,CabinSide_P,CabinSide_S,VIP_False,VIP_True,IsFamily_False,IsFamily_True,AgeGroup_Child,AgeGroup_YoungAdult,AgeGroup_Adult,AgeGroup_Senior,AgeGroup_NoInformation,Transported
0,0.709887,-0.31443,-0.286279,-0.255314,-0.269932,-0.272732,-0.548617,-0.556791,-1.185363,-4.225735,0.0,-0.107888,-1.060617,1.754795,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,-0.174191,3.187347,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.153063,-0.02399,1.032865,-1.032865,0.153063,-0.153063,3.733152,-3.733152,-0.490707,-0.770444,1.500312,-0.304359,-0.206945,0
1,-0.335758,-0.161325,-0.280676,-0.21895,0.205522,-0.234305,-0.276378,-0.217726,-1.185363,0.236645,0.0,-0.107888,0.942847,-0.569867,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,-0.26787,0.26787,-0.490707,1.297953,-0.666528,-0.304359,-0.206945,1
2,2.034371,-0.254031,1.939939,-0.255314,5.5455,-0.229938,3.291955,1.489295,-1.185363,0.236645,0.0,10.260987,-1.060617,1.754795,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,5.740821,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,-6.533255,6.533255,-0.26787,0.26787,-0.490707,-0.770444,-0.666528,3.285592,-0.206945,0
3,0.291629,-0.31443,0.512445,0.284327,2.613102,-0.104178,1.365936,1.220381,-1.185363,0.236645,0.0,-0.107888,-1.060617,1.754795,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,5.740821,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,-0.26787,0.26787,-0.490707,-0.770444,1.500312,-0.304359,-0.206945,0
4,-0.893435,0.111174,-0.242701,-0.035676,0.219378,-0.270985,-0.145067,0.191492,-1.183388,0.236645,0.0,-0.107888,0.942847,-0.569867,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,-0.26787,0.26787,2.037875,-0.770444,-0.666528,-0.304359,-0.206945,1


In [10]:
test.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalServiceSpend,ServiceSpendPerAge,CabinNum,TravelAlone,CryoSleepAndSpent,VIPSpendMultiplier,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_NoInformation,CryoSleep_False,CryoSleep_True,CryoSleep_NoInformation,Destination_55 Cancri e,Destination_NoInformation,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_NoInformation,CabinDeck_T,CabinSide_P,CabinSide_S,VIP_False,VIP_True,IsFamily_False,IsFamily_True,AgeGroup_Child,AgeGroup_YoungAdult,AgeGroup_Adult,AgeGroup_Senior,AgeGroup_NoInformation
0,-0.126629,-0.31443,-0.286279,-0.255314,-0.269932,-0.272732,-0.548617,-0.556791,-1.179438,0.236645,0.0,-0.107888,0.942847,-0.569867,-0.503664,-0.153848,-1.292858,1.364685,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,-0.688215,1.548235,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,-0.26787,0.26787,-0.490707,1.297953,-0.666528,-0.304359,-0.206945
1,-0.684306,-0.31443,-0.280676,-0.255314,2.174888,-0.272732,0.498913,1.09177,-1.177463,-4.225735,0.0,-0.107888,0.942847,-0.569867,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,3.733152,-3.733152,-0.490707,1.297953,-0.666528,-0.304359,-0.206945
2,0.15221,-0.31443,-0.286279,-0.255314,-0.269932,-0.272732,-0.548617,-0.556791,-1.185363,-4.225735,0.0,-0.107888,-1.060617,1.754795,-0.503664,-0.153848,-1.292858,1.364685,-0.160005,1.956897,-0.146233,-0.317487,-1.459188,-0.174191,-0.313741,3.261474,-0.241218,-0.334759,-0.688215,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,3.733152,-3.733152,-0.490707,-0.770444,1.500312,-0.304359,-0.206945
3,0.640177,-0.31443,3.854885,-0.255314,-0.11318,0.23817,2.19523,1.664674,-1.183388,-4.225735,0.0,-0.107888,-1.060617,1.754795,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,-0.174191,-0.313741,3.261474,-0.241218,-0.334759,-0.688215,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,3.733152,-3.733152,-0.490707,-0.770444,1.500312,-0.304359,-0.206945
4,-0.614597,-0.300384,-0.286279,0.668331,-0.269932,-0.272732,-0.310038,-0.206034,-1.175488,0.236645,0.0,-0.107888,0.942847,-0.569867,-0.503664,-0.153848,0.77348,-0.73277,-0.160005,-0.511013,-0.146233,-0.317487,0.685313,-0.174191,-0.313741,-0.30661,-0.241218,-0.334759,1.453035,-0.645897,-0.153063,-0.02399,-0.968181,0.968181,0.153063,-0.153063,-0.26787,0.26787,-0.490707,1.297953,-0.666528,-0.304359,-0.206945


modeling

In [11]:
X = train.drop("Transported", axis=1)
y = train["Transported"]

xn = train_N.drop("Transported", axis=1)
yn = train_N["Transported"]


def Train_ML(X=None, y=None, test=None, model=None, xn=None, yn=None):

    if X is None or y is None:
        if xn is not None and yn is not None:
            X = xn
            y = yn
        else:
            raise ValueError("Either (X, y) or (xn, yn) must be provided")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_accuracies = []
    test_accuracies = []
    test_probs = np.zeros(len(test))

    for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        train_proba = model.predict_proba(X_train)[:, 1]
        test_proba = model.predict_proba(X_test)[:, 1]

        train_pred = (train_proba >= 0.5).astype(int)
        test_pred = (test_proba >= 0.5).astype(int)

        train_acc = accuracy_score(y_train, train_pred)
        test_acc = accuracy_score(y_test, test_pred)

        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        test_probs += model.predict_proba(test)[:, 1] / n_splits

        print(
            f"Fold {fold}: Train Accuracy: {train_acc:.4f}, Test Accuracy: {test_acc:.4f}"
        )

    print("\n")
    print(f"__Mean Train Accuracy: {np.mean(train_accuracies):.4f}")
    print(f"__Mean Test Accuracy: {np.mean(test_accuracies):.4f}")

    return test_probs

In [12]:
"""Lb : 0.80196"""

Lp = {
    "learning_rate": 0.05255662080483515,
    "max_depth": 34,
    "reg_alpha": 2.226949211863629,
    "reg_lambda": 0.10506605571368621,
    "num_leaves": 39,
    "subsample": 0.22249538144765668,
    "colsample_bytree": 0.8736761308217752,
    "objective": "binary",
    "n_iter": 200,
    "boosting_type": "gbdt",
}


lm = LGBMClassifier(**Lp, verbose=-1, random_state=SEED)

print("---> LGB__0")
lpreds = Train_ML(X, y, test, lm)

---> LGB__0
Fold 1: Train Accuracy: 0.8851, Test Accuracy: 0.8123
Fold 2: Train Accuracy: 0.8885, Test Accuracy: 0.8123
Fold 3: Train Accuracy: 0.8901, Test Accuracy: 0.8280
Fold 4: Train Accuracy: 0.8893, Test Accuracy: 0.7994
Fold 5: Train Accuracy: 0.8873, Test Accuracy: 0.8142
Fold 6: Train Accuracy: 0.8898, Test Accuracy: 0.7974
Fold 7: Train Accuracy: 0.8892, Test Accuracy: 0.8177
Fold 8: Train Accuracy: 0.8908, Test Accuracy: 0.7956


__Mean Train Accuracy: 0.8888
__Mean Test Accuracy: 0.8096


In [13]:
"""LB : 0.80149"""

xp = {
    "learning_rate": 0.06516652353739706,
    "max_depth": 38,
    "min_child_weight": 1,
    "gamma": 0.008447929795037001,
    "subsample": 0.8844407112693309,
    "colsample_bytree": 0.36615109453559186,
    "lambda": 1.3602165377656108,
    "alpha": 8.743278468315916,
}

xm = XGBClassifier(
    **xp,
    verbose=0,
    random_state=SEED,
    objective="binary:logistic",
    eval_metric="logloss",
    n_estimators=200,
)
x_preds = Train_ML(X, y, test, xm)

Fold 1: Train Accuracy: 0.8512, Test Accuracy: 0.8151
Fold 2: Train Accuracy: 0.8513, Test Accuracy: 0.8215
Fold 3: Train Accuracy: 0.8518, Test Accuracy: 0.8206
Fold 4: Train Accuracy: 0.8566, Test Accuracy: 0.7994
Fold 5: Train Accuracy: 0.8524, Test Accuracy: 0.8086
Fold 6: Train Accuracy: 0.8515, Test Accuracy: 0.8066
Fold 7: Train Accuracy: 0.8504, Test Accuracy: 0.8214
Fold 8: Train Accuracy: 0.8538, Test Accuracy: 0.7947


__Mean Train Accuracy: 0.8524
__Mean Test Accuracy: 0.8110


In [14]:
"""LB : 0.80266"""

cp = {
    "n_estimators": 400,
    "learning_rate": 0.03467456378123331,
    "depth": 6,
    "random_strength": 2.625413605618935,
    "min_data_in_leaf": 88,
}


cm = CatBoostClassifier(**cp, verbose=0, random_state=SEED)

print("\n")
print("---> Cat__0")
c_preds = Train_ML(X, y, test, cm)



---> Cat__0
Fold 1: Train Accuracy: 0.8467, Test Accuracy: 0.8151
Fold 2: Train Accuracy: 0.8480, Test Accuracy: 0.8289
Fold 3: Train Accuracy: 0.8495, Test Accuracy: 0.8252
Fold 4: Train Accuracy: 0.8501, Test Accuracy: 0.8031
Fold 5: Train Accuracy: 0.8474, Test Accuracy: 0.8142
Fold 6: Train Accuracy: 0.8497, Test Accuracy: 0.8158
Fold 7: Train Accuracy: 0.8488, Test Accuracy: 0.8250
Fold 8: Train Accuracy: 0.8497, Test Accuracy: 0.8002


__Mean Train Accuracy: 0.8487
__Mean Test Accuracy: 0.8159
