In [3]:
# Import libraries and load data
import pandas as pd
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
# Drop missing values
df_train.dropna(axis=0, how='any', inplace=True)
print('Train missing values:', (df_train.isnull().mean() * 100).sort_values(ascending=False))

Train missing values: PassengerId     0.0
HomePlanet      0.0
CryoSleep       0.0
Cabin           0.0
Destination     0.0
Age             0.0
VIP             0.0
RoomService     0.0
FoodCourt       0.0
ShoppingMall    0.0
Spa             0.0
VRDeck          0.0
Name            0.0
Transported     0.0
dtype: float64


In [5]:
# Define train_number and train_text before imputation
train_number = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_text = [col for col in df_train.columns if col not in train_number and col != 'Transported']

# Impute missing values instead of dropping (optional, for robustness)
from sklearn.impute import SimpleImputer
import numpy as np

# Impute numerical columns
num_imputer = SimpleImputer(strategy='median')
for col in train_number:
    df_train[col] = num_imputer.fit_transform(df_train[[col]]).ravel()
    df_test[col] = num_imputer.transform(df_test[[col]]).ravel()

# Impute categorical columns
cat_imputer = SimpleImputer(strategy='most_frequent')
for col in train_text:
    if col in df_train.columns:
        df_train[col] = cat_imputer.fit_transform(df_train[[col]]).ravel()
    if col in df_test.columns:
        df_test[col] = cat_imputer.transform(df_test[[col]]).ravel()

In [6]:
# Identify numerical and text columns
train_number = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_text = df_train.drop(columns=train_number, axis=1).columns.tolist()
print('Text columns:', train_text)

Text columns: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'Name', 'Transported']


In [7]:
# Confirm columns and fix encoding/scaling for test set
print('Train columns:', df_train.columns.tolist())
print('Test columns:', df_test.columns.tolist())

# Remove target and PassengerId from train_text if present
for col_to_remove in ['Transported', 'PassengerId']:
    if col_to_remove in train_text:
        train_text.remove(col_to_remove)

# Encode categorical columns using LabelEncoder (fit on train, transform on test)
from sklearn.preprocessing import LabelEncoder
for col in train_text:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    if col in df_test.columns:
        # For unseen labels in test, set them to -1
        df_test[col] = df_test[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)

Train columns: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported']
Test columns: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']


In [8]:
# Clean text columns in both dataframes
pattern = r'["\'!?\\*,;<>/()]+'
df_train[train_text] = df_train[train_text].astype(str).apply(lambda x: x.str.replace(pattern, '', regex=True))
df_test[train_text] = df_test[train_text].astype(str).apply(lambda x: x.str.replace(pattern, '', regex=True))

In [9]:
# Split 'Cabin' into 'Deck', 'Num', 'City' in both dataframes
cabin_split_train = df_train['Cabin'].astype(str).str.split('/', n=2, expand=True)
while cabin_split_train.shape[1] < 3:
    cabin_split_train[cabin_split_train.shape[1]] = None
df_train[['Deck', 'Num', 'City']] = cabin_split_train.iloc[:, 0:3]
df_train.drop(columns=['Cabin'], inplace=True)

cabin_split_test = df_test['Cabin'].astype(str).str.split('/', n=2, expand=True)
while cabin_split_test.shape[1] < 3:
    cabin_split_test[cabin_split_test.shape[1]] = None
df_test[['Deck', 'Num', 'City']] = cabin_split_test.iloc[:, 0:3]
df_test.drop(columns=['Cabin'], inplace=True)

if 'Cabin' in train_text:
    train_text.remove('Cabin')
train_text.extend(['Deck', 'Num', 'City'])

In [10]:
# Create Age_Group in both dataframes
bins = [0, 20, 40, 60, 100]
labels = ['0-20', '21-40', '41-60', '61+']
df_train['Age'] = pd.to_numeric(df_train['Age'], errors='coerce')
df_test['Age'] = pd.to_numeric(df_test['Age'], errors='coerce')
df_train['Age_Group'] = pd.cut(df_train['Age'], bins=bins, labels=labels, right=False)
df_test['Age_Group'] = pd.cut(df_test['Age'], bins=bins, labels=labels, right=False)
train_text.extend(['Age_Group'])

In [11]:
# Drop 'Age' and 'Name' from both dataframes
df_train.drop(columns=['Age', 'Name'], inplace=True)
df_test.drop(columns=['Age', 'Name'], inplace=True)
train_text = [item for item in train_text if item not in ['Age', 'Name']]

In [12]:
# Encode categorical columns using LabelEncoder
from sklearn.preprocessing import LabelEncoder
for col in train_text:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    # Map test labels to train labels, unseen labels get -1
    df_test[col] = df_test[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)

In [13]:
# Scale numerical columns using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for col in train_number:
    df_train[col] = scaler.fit_transform(df_train[[col]])
    df_test[col] = scaler.transform(df_test[[col]])

In [14]:
# Split train dataframe into X and y, then train/test split
from sklearn.model_selection import train_test_split

X = df_train.drop('Transported', axis=1)
y = df_train['Transported']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [15]:
import sys
print(sys.executable)

C:\Users\Savero Madajaya\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe


In [18]:
from lazypredict.Supervised import LazyClassifier

# Drop PassengerId, cast everything to float
X_train_num = X_train.drop(columns=["PassengerId"]).astype(float)
X_val_num = X_val.drop(columns=["PassengerId"]).astype(float)

# Make sure target is int
y_train = y_train.astype(int)
y_val = y_val.astype(int)

# Run LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train_num, X_val_num, y_train, y_val)

# Show results
import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(models)


  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 2661, number of negative: 2623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 5284, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503596 -> initscore=0.014383
[LightGBM] [Info] Start training from score 0.014383
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.79               0.79     0.79      0.79   
CalibratedClassifierCV             0.79               0.79     0.79      0.79   
LinearSVC                          0.79               0.79     0.79      0.79   
LogisticRegression                 0.79   

In [19]:
from lightgbm import LGBMClassifier
# Train LightGBM classifier
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_num, y_train)

# Evaluate on validation set
val_pred = lgbm.predict(X_val_num)
val_acc = (val_pred == y_val).mean()
print(f"Validation Accuracy: {val_acc:.4f}")

[LightGBM] [Info] Number of positive: 2661, number of negative: 2623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 5284, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503596 -> initscore=0.014383
[LightGBM] [Info] Start training from score 0.014383
Validation Accuracy: 0.7920


In [20]:
#Retrain on full training data (X, y) before submission
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

X_full_num = X_full.drop(columns=["PassengerId"]).astype(float)
y_full = y_full.astype(int)

lgbm.fit(X_full_num, y_full)


[LightGBM] [Info] Number of positive: 3327, number of negative: 3279
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 6606, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503633 -> initscore=0.014532
[LightGBM] [Info] Start training from score 0.014532


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [21]:
# Predict on test set
X_test_num = df_test.drop(columns=["PassengerId"]).astype(float)
y_test_pred = lgbm.predict(X_test_num)

In [22]:
# Submission format
submission = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Transported": y_test_pred.astype(bool) 
})
submission.to_csv("submission.csv", index=False)