## This notebook is meant to be run in Google Colab. To check the output, download the .ipynb file and run it in Google Colab. Test accuracy: 79.7%

In [None]:
# Build LightGBM from source with GPU support, then install its Python package.
# NOTE(review): the clone is not pinned to a tag/commit; the final step assumes the
# checked-out revision still provides python-package/setup.py — confirm.
!git clone --recursive https://github.com/Microsoft/LightGBM
%cd LightGBM
!mkdir build
%cd build
# From LightGBM/build, ../../LightGBM points back at the cloned repository root.
!cmake ../../LightGBM -DUSE_GPU=1
!make -j4
%cd ../python-package
!python3 setup.py install --gpu

In [None]:
cd ../..

In [None]:
# Install the Kaggle CLI (used later to download the data and submit) and Optuna.
!pip install kaggle --upgrade
# The two PyPI installs of LightGBM below are intentionally left disabled.
# !pip install lightgbm
# !pip install lightgbm --install-option=--gpu
!pip install optuna

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition data with the Kaggle CLI (requires the kaggle.json token installed above).
!kaggle competitions download -c spaceship-titanic

In [None]:
!unzip spaceship-titanic.zip

In [None]:
import numpy as np
import pandas as pd

Load datasets

In [None]:
# Read the train/test CSVs from the Colab working directory.
basepath = "/content/"
train_path = basepath + "train.csv"
test_path = basepath + "test.csv"
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Preview the first rows of the training data.
df_train.head()

# EDA and Feature Engineering

In [None]:
# Column dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Summary statistics, rounded to two decimals.
df_train.describe().round(2)

In [None]:
# (rows, columns) of the training data.
df_train.shape

In [None]:
import seaborn as sns

In [None]:
# Numerical features are listed by hand; every remaining column except the
# target 'Transported' is treated as categorical.
num_vars = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
excluded = set(num_vars) | {'Transported'}
cat_vars = [name for name in df_train.columns if name not in excluded]

## Categorical features

In [None]:
# Check unique values for each categorical feature
# (columns with very many distinct values need extraction or dropping before encoding)
df_train[cat_vars].nunique() 

From above we can see that three columns, 'PassengerId', 'Cabin' and 'Name', each contain thousands of unique values.

The simplest way is to drop all of them. However, from the description of the data we know there are patterns inside 'PassengerId' and 'Cabin', and we can try to extract useful information from these two columns.

*   PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

*   Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

From the definition above, **we can try to group passengers by the leading 'gggg' of 'PassengerId' and split 'Cabin' into three parts: 'deck', 'number' and 'side'.**

The 'Name' column will be dropped, as it does not provide any information other than the name itself.



In [None]:
# Extract group info from 'PassengerId' (the caller decides whether to drop 'PassengerId')
def group_extract(df):
  """Add an integer 'Group' column taken from the first four characters of 'PassengerId'.

  The column is added to `df` in place and the same frame is returned.
  """
  # Ids have the form 'gggg_pp'; the 'gggg' prefix is a four-digit number, hence the int cast.
  group_prefix = df['PassengerId'].str.slice(0, 4)
  df['Group'] = group_prefix.astype('int')
  return df

# Add 'Group' to both frames; only the training frame drops 'PassengerId' here.
df_train = group_extract(df_train).drop(columns='PassengerId')
df_test = group_extract(df_test)

In [None]:
# Extract deck/number/port from 'Cabin' and drop 'Cabin'
def dnp_extract(df):
  """Split 'Cabin' ('deck/num/side') into 'Deck', 'Deck_num' and 'Port' columns.

  The new columns are added to `df`; a new frame without 'Cabin' is returned.
  Rows with a missing 'Cabin' get NaN in all three new columns.
  """
  # Split on '/' so the whole cabin number is captured, not only its first digit.
  cabin_parts = df['Cabin'].str.split('/')
  df['Deck'] = cabin_parts.str[0]
  df['Deck_num'] = cabin_parts.str[1].astype('float') # Here use float type as there are NaNs in 'Cabin' column
  df['Port'] = cabin_parts.str[-1]
  df = df.drop('Cabin', axis=1)
  return df

# Replace 'Cabin' with its extracted parts in both frames.
df_train, df_test = (dnp_extract(frame) for frame in (df_train, df_test))

In [None]:
# Drop 'Name' column from both frames
df_train = df_train.drop(columns='Name')
df_test = df_test.drop(columns='Name')

In [None]:
# Update categorical/numerical features: remove the columns that were dropped or
# replaced, then register the newly extracted ones.
replaced_cols = ('PassengerId', 'Cabin', 'Name')
cat_vars = [name for name in cat_vars if name not in replaced_cols]

cat_vars.extend(['Deck', 'Port'])
num_vars.extend(['Group', 'Deck_num'])

In [None]:
# Categorical feature names after the update above.
cat_vars

In [None]:
# HomePlanet --> Transported relationship
# Passengers from Europa are more likely transported
sns.catplot(data=df_train, x='HomePlanet', hue='Transported', kind='count')

In [None]:
# CryoSleep --> Transported
# Plot shows passengers who were in cryosleep had higher probability of being transported.
sns.catplot(data=df_train, x='CryoSleep', hue='Transported', kind='count')

In [None]:
# Destination --> Transported
# No obvious trend
sns.catplot(data=df_train, x='Destination', hue='Transported', kind='count')

In [None]:
# VIP --> Transported
# No obvious trend
sns.catplot(data=df_train, x='VIP', hue='Transported', kind='count')

## Numerical features

The data description says "RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities." This indicates that 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa' and 'VRDeck' are all amenity expenses. Therefore, summing them up into a new column 'Amenities' might be a good idea.

In [None]:
# Numerical feature names at this point.
num_vars

In [None]:
# Create 'Amenities' as the row-wise total of the five amenity spend columns.
# skipna=False keeps the total missing whenever any component is missing,
# matching plain column addition.
amenity_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (df_train, df_test):
  frame['Amenities'] = frame[amenity_cols].sum(axis=1, skipna=False)

In [None]:
# Register the new 'Amenities' column as a numerical feature
num_vars += ['Amenities']

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Check numerical features' distributions (one panel per feature on a 3x3 grid)
fig, axes = plt.subplots(3, 3, figsize=(12,8))

for panel, feature in zip(axes.ravel(), num_vars):
  # Zeros are left out because their sheer volume would dominate the plot
  nonzero_values = df_train.loc[df_train[feature] != 0, feature]
  sns.histplot(x=nonzero_values, ax=panel, kde=True)

fig.tight_layout()

We notice that there might be outliers existing in 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' and 'Amenities'.

In [None]:
# Check outliers with one box plot per numerical feature
fig, axes = plt.subplots(3, 3, figsize=(12,8))

for panel, feature in zip(axes.ravel(), num_vars):
  sns.boxplot(x=df_train[feature], ax=panel)

fig.tight_layout()

In [None]:
# Handle outliers
def outlier_handle(col, boundary):
  """Replace values of `col` above `boundary` with the mean of the in-range training values.

  Works in place on the module-level `df_train` and `df_test`.

  Args:
    col: name of the column to cap.
    boundary: values strictly greater than this are treated as outliers.
  """
  # Compute the replacement once, from training rows only, before anything is modified.
  replacement = df_train.loc[df_train[col] <= boundary, col].mean()
  df_train.loc[df_train[col] > boundary, col] = replacement
  # Select test outliers with the test frame's own values, not a mask built from df_train.
  df_test.loc[df_test[col] > boundary, col] = replacement

# Upper bound per column; values above it are handled as outliers.
outlier_bounds = {
  'RoomService': 8000,
  'FoodCourt': 20000,
  'ShoppingMall': 10000,
  'Spa': 18000,
  'VRDeck': 14000,
  'Amenities': 35000,
}
outlier_cols = list(outlier_bounds)
boundaries = list(outlier_bounds.values())

for col, boundary in outlier_bounds.items():
  outlier_handle(col, boundary)

## Others

In [None]:
# Check target distribution
# Nearly 50-50 which indicates no obvious imbalance
df_train.Transported.value_counts()

In [None]:
# Check duplicated rows (True if at least one row fully repeats an earlier one)
df_train.duplicated().any()

In [None]:
# Drop duplicates; the index is not reset, so the surviving rows keep their original labels
df_train = df_train.drop_duplicates()

In [None]:
# Check missing values (count of NaNs per column)
df_train.isna().sum()

# Data pre-processing

In [None]:
# Treat 'Deck_num' as a categorical feature from here on: it is imputed with the
# most frequent value and one-hot encoded together with the other cat_vars.
# NOTE(review): one-hot encoding only makes sense for few distinct values —
# check df_train['Deck_num'].nunique() before relying on this.
num_vars.remove('Deck_num')
cat_vars.append('Deck_num')

In [None]:
# Impute missing values
from sklearn.impute import SimpleImputer

# Numerical gaps are filled with the training mean, categorical gaps with the
# most frequent training value. Each imputer is fitted on the training frame
# only and then reused on the test frame.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

for imputer, cols in ((num_imputer, num_vars), (cat_imputer, cat_vars)):
  df_train[cols] = imputer.fit_transform(df_train[cols])
  df_test[cols] = imputer.transform(df_test[cols])

In [None]:
# Encode categorical variables
from sklearn.preprocessing import OneHotEncoder

# `sparse` was renamed to `sparse_output` in newer scikit-learn releases and the
# old keyword was later removed; try the new name first, fall back for old versions.
try:
  encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
  encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training frame only; categories unseen in training encode as all zeros.
ohe_data = encoder.fit_transform(df_train[cat_vars])
ohe_data_test = encoder.transform(df_test[cat_vars])

# `get_feature_names` is gone from newer scikit-learn; `get_feature_names_out` replaces it.
if hasattr(encoder, 'get_feature_names_out'):
  ohe_colnames = encoder.get_feature_names_out(cat_vars)
else:
  ohe_colnames = encoder.get_feature_names(cat_vars)

# Reuse each frame's own index so the encoded columns line up row by row in concat.
df_train = pd.concat([df_train.drop(cat_vars, axis=1), pd.DataFrame(ohe_data, columns=ohe_colnames, index=df_train.index)], axis=1)
df_test = pd.concat([df_test.drop(cat_vars, axis=1), pd.DataFrame(ohe_data_test, columns=ohe_colnames, index=df_test.index)], axis=1)

In [None]:
# Encode the boolean target as 0/1. The result is assigned back to the frame:
# calling replace(..., inplace=True) on a selected column is chained assignment
# and is not guaranteed to modify df_train.
df_train['Transported'] = df_train['Transported'].astype('int64')

## Check correlations among features

In [None]:
# Correlation heatmap of the numerical features together with the target.
plt.figure(figsize=(15, 8))
num_cols = [*num_vars, 'Transported']
corr_matrix = df_train[num_cols].corr()
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1)

As shown above, 'ShoppingMall', 'FoodCourt' and 'Group' have relatively low correlations with the target, and 'ShoppingMall' has a high correlation with 'Amenities'. Therefore, drop 'ShoppingMall', 'FoodCourt' and 'Group'.

In [None]:
# Remove the features judged uninformative from both frames.
low_signal_cols = ['ShoppingMall', 'FoodCourt', 'Group']
df_train = df_train.drop(columns=low_signal_cols)
df_test = df_test.drop(columns=low_signal_cols)

## Split train/test dataset

In [None]:
# Create train, validation datasets
from sklearn.model_selection import train_test_split
# Separate target and features, then hold out 30% of the rows (fixed seed).
# X_test / y_test here are a validation split of the training data, not the Kaggle test set.
y = df_train['Transported'].copy()
X = df_train.drop(columns='Transported').copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Modeling

### Light Gradient Boosting Machine (LGBMClassifier)

In [None]:
# NOTE(review): the first cell already builds and installs a GPU-enabled LightGBM from
# source — confirm whether this PyPI install is still needed or interferes with that build.
!pip install lightgbm

In [None]:
import lightgbm as lgb

In [None]:
# Baseline: LightGBM with default hyperparameters, scored on the hold-out split
from sklearn import metrics

clf = lgb.LGBMClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
# Hyperparameter tuning using optuna
import optuna
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from optuna.integration import LightGBMPruningCallback


def objective(trial, X, y):
    """Optuna objective: mean 5-fold cross-validated log loss of an LGBMClassifier.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: feature frame; rows are selected positionally with .iloc.
        y: target series; rows are selected positionally with .iloc.

    Returns:
        Mean binary log loss over the stratified folds (lower is better).
    """
    n_folds = 5

    # Values are sampled in the key order below. Entries with a single choice
    # are fixed settings that are still routed through the trial.
    param_grid = {
        "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "boosting_type": trial.suggest_categorical("boosting_type", ['gbdt']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 20, 25000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 10000, step=100),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100, step=5),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100, step=5),
        "min_split_gain": trial.suggest_float("min_split_gain", 0, 50),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 1, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.1, 1, step=0.1
        ),
    }

    folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=123)

    fold_losses = []
    for fit_idx, holdout_idx in folds.split(X, y):
        X_fit, X_holdout = X.iloc[fit_idx], X.iloc[holdout_idx]
        y_fit, y_holdout = y.iloc[fit_idx], y.iloc[holdout_idx]

        model = lgb.LGBMClassifier(objective="binary", verbosity=-1, **param_grid)
        model.fit(
            X_fit,
            y_fit,
            eval_set=[(X_holdout, y_holdout)],
            eval_metric="binary_logloss",
            callbacks=[
                # Lets Optuna prune the trial based on the hold-out log loss.
                LightGBMPruningCallback(trial, "binary_logloss"),
                # Stops boosting after 100 rounds without improvement.
                lgb.early_stopping(100),
            ],
        )
        holdout_proba = model.predict_proba(X_holdout)
        fold_losses.append(log_loss(y_holdout, holdout_proba))

    return np.mean(fold_losses)

In [None]:
# Adjust Optuna's log verbosity and ignore all Python warnings for the search below.
# NOTE(review): 0 is a raw numeric level rather than one of the named
# optuna.logging constants — confirm it yields the intended verbosity.
optuna.logging.set_verbosity(0)
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Minimise the cross-validated log loss over 300 trials on the full training data.
def func(trial):
    return objective(trial, X, y)

study = optuna.create_study(study_name="LGBM Classifier", direction="minimize")
study.optimize(func, n_trials=300)

In [None]:
# Lowest objective value (mean cross-validated log loss) found by the study.
study.best_value

In [None]:
# Check hyperparameter importance: this plots the study's search parameters,
# not the model's input features.
from optuna.visualization.matplotlib import plot_param_importances

plot_param_importances(study)

In [None]:
# Hold out 30% for validation; seeded so the split and the reported accuracy are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

best_params = study.best_params
gbm = lgb.LGBMClassifier(objective="binary", verbosity=-1, **best_params)
# best_params carries n_estimators=10000 and was tuned with early stopping, so the
# final fit stops the same way instead of always growing every tree.
# NOTE: the hold-out split drives early stopping here, so accuracy measured on that
# same split is slightly optimistic.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="binary_logloss",
    callbacks=[lgb.early_stopping(100)],
)

In [None]:
# Score the tuned model on the hold-out split.
y_pred = gbm.predict(X_test)
report = metrics.classification_report(y_test, y_pred)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(report)
print(accuracy)

## Make predictions and produce submit file

In [None]:
# Preview the processed test frame (only the first rows, not the whole frame).
df_test.head()

In [None]:
# Keep the passenger ids for the submission file, then remove them from the features
idx = df_test['PassengerId']
df_test = df_test.drop(columns='PassengerId')

In [None]:
def submit(model):
  y_test = model.predict(df_test)
  y_test = pd.Series(y_test, name='Transported')
  y_test = y_test.astype('bool')
  df_submit = pd.concat([idx, y_test], axis=1)
  df_submit.to_csv('/content/submission.csv', index=False)

  !kaggle competitions submit -c spaceship-titanic -f submission.csv -m "Message"
  
  return

In [None]:
# Write and upload the submission produced by the tuned model.
submit(gbm)