![](https://hdwallpaperim.com/wp-content/uploads/2017/08/22/435641-ultra-wide-space.jpg)

# Quick Decisions : Space Titanic Competition

In this notebook the main focus is to make quick, direct decisions in order to prepare a starter submission.

In [None]:
import warnings
# Suppress every warning for the rest of the session; note this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

## Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Setting colour palette for visualizations

In [None]:
# Make the reversed viridis palette the seaborn default for the plots that follow.
sns.set_palette("viridis_r")

## Datapaths

In [None]:
# All three competition files live in the same input directory.
_INPUT_DIR = "../input/spaceship-titanic"
train_base_path = f"{_INPUT_DIR}/train.csv"
test_base_path = f"{_INPUT_DIR}/test.csv"
samp_base_path = f"{_INPUT_DIR}/sample_submission.csv"

## Loading the data

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_df = pd.read_csv(train_base_path)
train_df.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_df = pd.read_csv(test_base_path)
test_df.head()

In [None]:
# Read the sample submission CSV and preview its first rows.
samp_df = pd.read_csv(samp_base_path)
samp_df.head()

## Exploratory Data Analysis & feature Processing / Engineering

### Train Data Overview

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Number of missing values in each training column.
train_df.isnull().sum()

In [None]:
# Summary statistics of the training frame.
train_df.describe()

#### Data sparsity in train and test set on Age column

In [None]:
# Overlay the Age histograms of both splits on the same axes so their
# distributions can be compared directly.
for split_frame, split_label in ((train_df, "Training Set"), (test_df, "Testing Set")):
    sns.histplot(split_frame["Age"], label = split_label)
plt.title("Age Distribution", size=20)
plt.legend()
plt.show()

### Splitting **cabin** column

In [None]:
# Cabin is stored as "deck/num/side". Rows without a cabin get the placeholder
# "none/0/none" so that all three derived columns are always populated.
cabin_parts = (
    train_df["Cabin"]
    .fillna("none/0/none")
    .astype(str)
    .str.split("/", expand=True)
)
train_df["deck"] = cabin_parts[0]
train_df["num"] = cabin_parts[1]
train_df["side"] = cabin_parts[2]
# `columns=` replaces the positional axis argument (`drop("Cabin", 1)`),
# which pandas 2.0 no longer accepts.
train_df = train_df.drop(columns="Cabin")
train_df.head()

In [None]:
# Cabin is stored as "deck/num/side". Rows without a cabin get the placeholder
# "none/0/none" so that all three derived columns are always populated.
cabin_parts = (
    test_df["Cabin"]
    .fillna("none/0/none")
    .astype(str)
    .str.split("/", expand=True)
)
test_df["deck"] = cabin_parts[0]
test_df["num"] = cabin_parts[1]
test_df["side"] = cabin_parts[2]
# `columns=` replaces the positional axis argument (`drop("Cabin", 1)`),
# which pandas 2.0 no longer accepts.
test_df = test_df.drop(columns="Cabin")
test_df.head()

### Fetching passenger groups

In [None]:
# The part of PassengerId before the first "_" is the travelling-group number.
train_df["group"] = train_df["PassengerId"].map(lambda pid: int(pid.partition("_")[0]))
train_df.head()

In [None]:
# The part of PassengerId before the first "_" is the travelling-group number.
test_df["group"] = test_df["PassengerId"].map(lambda pid: int(pid.partition("_")[0]))
test_df.head()

### Correlation of the purchases of the passengers

In [None]:
# Columns holding the on-board spending amounts.
purchase_features = [
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]

In [None]:
# Print the value range of every purchase column for both splits.
# Series.max()/min() skip missing values; the Python built-ins max()/min()
# compare element by element and give order-dependent results once a NaN
# is present in the column.
for fet in purchase_features:
    print(f"[{fet} : Train-Set] Max Value : {train_df[fet].max()} || Min Value : {train_df[fet].min()}")
    print(f"[{fet} : Test-Set] Max Value : {test_df[fet].max()} || Min Value : {test_df[fet].min()}")

### Correlation heatmaps of the purchase features :

In [None]:
# Pairwise correlation between the purchase columns of the training set,
# drawn as a square heatmap.
plt.figure(figsize = (6, 6))
tr_corr = train_df.loc[:, purchase_features].corr()
sns.heatmap(data = tr_corr)
plt.show()

In [None]:
# Pairwise correlation between the purchase columns of the test set,
# drawn as a square heatmap.
plt.figure(figsize = (6, 6))
ts_corr = test_df.loc[:, purchase_features].corr()
sns.heatmap(data = ts_corr)
plt.show()

## Data distribution of the onboard passenger purchases by their target feature

#### SPA

In [None]:
# Spa spending along the row index, one line per Transported value.
sns.lineplot(data = train_df, x = train_df.index, y = "Spa", hue = "Transported")
plt.show()

#### RoomService

In [None]:
# RoomService spending along the row index, one line per Transported value.
sns.lineplot(data = train_df, x = train_df.index, y = "RoomService", hue = "Transported")
plt.show()

#### FoodCourt

In [None]:
# FoodCourt spending along the row index, one line per Transported value.
sns.lineplot(data = train_df, x = train_df.index, y = "FoodCourt", hue = "Transported")
plt.show()

#### ShoppingMall

In [None]:
# ShoppingMall spending along the row index, one line per Transported value.
sns.lineplot(data = train_df, x = train_df.index, y = "ShoppingMall", hue = "Transported")
plt.show()

#### VRDeck

In [None]:
# VRDeck spending along the row index, one line per Transported value.
sns.lineplot(data = train_df, x = train_df.index, y = "VRDeck", hue = "Transported")
plt.show()

In [None]:
# Row-wise total over all five purchase columns. skipna=False keeps the total
# missing whenever any single amount is missing, exactly like chained `+`.
all_purchase_cols = ["FoodCourt", "RoomService", "ShoppingMall", "Spa", "VRDeck"]
fet_y = train_df[all_purchase_cols].sum(axis = 1, skipna = False)
sns.lineplot(x = train_df.index, y = fet_y, hue = train_df["Transported"])
plt.show()

#### Checking the same on the sum of total purchases (ShoppingMall is left out because it gave almost the same distribution for both negative and positive targets)

In [None]:
# Row-wise total over the purchase columns except ShoppingMall. skipna=False
# keeps the total missing whenever any single amount is missing, like chained `+`.
kept_purchase_cols = ["FoodCourt", "RoomService", "Spa", "VRDeck"]
fet_y = train_df[kept_purchase_cols].sum(axis = 1, skipna = False)
sns.lineplot(x = train_df.index, y = fet_y, hue = train_df["Transported"])
plt.show()

### As ShoppingMall purchases were more or less similar for both cases, the column is likely to contribute little to training.

In [None]:
# Remove ShoppingMall from both splits. `columns=` replaces the positional
# axis argument (`drop("ShoppingMall", 1)`), which pandas 2.0 no longer accepts.
train_df = train_df.drop(columns="ShoppingMall")
test_df = test_df.drop(columns="ShoppingMall")

### Checking the rate of positive targets (Transported) across different features

#### HomePlanet

In [None]:
# Share of transported passengers per home planet. Selecting "Transported"
# before aggregating avoids summing/counting every other column (including
# text columns) just to discard the result; the mean equals sum / count.
train_df.groupby("HomePlanet")["Transported"].mean()

#### Cryosleep

In [None]:
# Share of transported passengers per CryoSleep value. Selecting "Transported"
# before aggregating avoids summing/counting every other column (including
# text columns) just to discard the result; the mean equals sum / count.
train_df.groupby("CryoSleep")["Transported"].mean()

#### Age

In [None]:
# Share of transported passengers per age. Selecting "Transported" before
# aggregating avoids summing/counting every other column (including text
# columns) just to discard the result; the mean equals sum / count.
data = train_df.groupby("Age")["Transported"].mean()
data

In [None]:
# Plot the per-age transported rate. x and y are passed by keyword because
# seaborn 0.12+ no longer accepts them positionally.
sns.lineplot(x = data.index, y = data.values)
plt.ylabel("True Positive")
plt.title("True positive rate on Age")
plt.show()

#### Deck

In [None]:
# Share of transported passengers per deck. Selecting "Transported" before
# aggregating avoids summing/counting every other column (including text
# columns) just to discard the result; the mean equals sum / count.
train_df.groupby("deck")["Transported"].mean()

#### Side

In [None]:
# Share of transported passengers per ship side. Selecting "Transported"
# before aggregating avoids summing/counting every other column (including
# text columns) just to discard the result; the mean equals sum / count.
train_df.groupby("side")["Transported"].mean()

#### Checking the same combining deck and side

In [None]:
# Share of transported passengers per (deck, side) pair. Selecting
# "Transported" before aggregating avoids summing/counting every other column
# (including text columns) just to discard the result; the mean equals sum / count.
train_df.groupby(["deck", "side"])["Transported"].mean()

#### VIP

In [None]:
# Share of transported passengers per VIP value. Selecting "Transported"
# before aggregating avoids summing/counting every other column (including
# text columns) just to discard the result; the mean equals sum / count.
train_df.groupby("VIP")["Transported"].mean()

#### Num

In [None]:
# Share of transported passengers per cabin number. Selecting "Transported"
# before aggregating avoids summing/counting every other column (including
# text columns) just to discard the result; the mean equals sum / count.
train_df.groupby("num")["Transported"].mean()

#### Destination

In [None]:
# Share of transported passengers per destination. Selecting "Transported"
# before aggregating avoids summing/counting every other column (including
# text columns) just to discard the result; the mean equals sum / count.
train_df.groupby("Destination")["Transported"].mean()

#### Checking the processed training set

In [None]:
# Preview the training frame after the feature processing above.
train_df.head()

### One-hot encoding for categorical features with a small number of unique items

In [None]:
# Expand HomePlanet and Destination into indicator columns, append them to
# the training frame and remove the two source columns.
planet, dest = (pd.get_dummies(train_df[source_col]) for source_col in ("HomePlanet", "Destination"))
train_df = pd.concat([train_df, planet, dest], axis = 1).drop(columns = ["HomePlanet", "Destination"])
train_df.head()

In [None]:
# Expand HomePlanet and Destination into indicator columns, append them to
# the test frame and remove the two source columns.
planet, dest = (pd.get_dummies(test_df[source_col]) for source_col in ("HomePlanet", "Destination"))
test_df = pd.concat([test_df, planet, dest], axis = 1).drop(columns = ["HomePlanet", "Destination"])
test_df.head()

### Processing on boolean features

In [None]:
# Convert the boolean columns to 0/1 integers, treating a missing value as False.
for col in ["CryoSleep", "VIP", "Transported"]:
    train_df[col] = train_df[col].fillna(False).astype("int64")
    # The test frame has no "Transported" column. Checking for the column
    # explicitly (instead of a bare `except: pass`) skips only that case and
    # lets any genuine conversion error surface.
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(False).astype("int64")
train_df.head()

### Performing label encoding for some categorical features

In [None]:
# Cabin numbers were extracted as strings; make them integers.
train_df["num"] = train_df["num"].astype("int64")
test_df["num"] = test_df["num"].astype("int64")
for col in ["deck", "side"]:
    # Build ONE category -> code table from the training frame (least frequent
    # category gets 0, most frequent gets the highest code) and apply it to
    # both frames. Deriving the table separately per frame would let the same
    # category receive different codes in train and test whenever their
    # frequency rankings differ.
    categories_by_rarity = train_df[col].value_counts().index[::-1]
    codes = {item: index for index, item in enumerate(categories_by_rarity)}
    for frame in (train_df, test_df):
        # A category that never occurs in training has no code; mark it -1.
        frame[col] = frame[col].map(codes).fillna(-1).astype("int64")
train_df.head()

### Filling null values in numeric features with their mean

In [None]:
# Replace missing values in every non-object column with that column's mean.
for col in train_df.columns:
    if train_df[col].dtype != "object":
        train_df[col] = train_df[col].fillna(train_df[col].mean())
        # Fill the SAME column of the test frame with its own mean. Calling
        # `test_df.fillna(...)` on the whole frame would fill every column
        # (text columns included) with the mean of the first numeric column.
        # The explicit checks replace a bare `except: pass`: "Transported"
        # does not exist in the test frame, and a non-numeric column has no mean.
        if col in test_df.columns and pd.api.types.is_numeric_dtype(test_df[col]):
            test_df[col] = test_df[col].fillna(test_df[col].mean())
train_df.head()

### Choosing training set features

In [None]:
# Every training column is a model input except the identifier, the target
# and the passenger name.
x_features = list(train_df.columns)
for non_feature in ("PassengerId", "Transported", "Name"):
    x_features.remove(non_feature)
x_features

### Preparing training set and target features

In [None]:
# Shuffle the training rows. The seed makes the shuffle (and therefore the
# later train/validation split and all reported scores) reproducible across runs.
train_df = train_df.sample(frac=1, random_state=42)
X = train_df[x_features]
y = train_df["Transported"]

# The test frame must expose exactly the same feature columns as X.
test = test_df[x_features]

X.shape, y.shape, test.shape

#### Validating whether any null value is present

In [None]:
# Remaining missing values per training feature.
X.isna().sum()

In [None]:
# Remaining missing values per test feature.
test.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split

#### Performing train val split on 80-20 ratio

In [None]:
# Hold out 20% of the rows for validation; random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

### Importing model and tuning modules

In [None]:
from sklearn.model_selection import GridSearchCV as gcv

In [None]:
from xgboost import XGBClassifier as xgb
from sklearn.ensemble import RandomForestClassifier as rfc
from lightgbm import LGBMClassifier as lgb

### dry run

In [None]:
%%time
samp_xgb_model = xgb(verbosity=2)
samp_xgb_model.fit(X_train, y_train)
print(f" Training accuracy score : {samp_xgb_model.score(X_train, y_train)}")
print(f" Training accuracy score : {samp_xgb_model.score(X_val, y_val)}")

In [None]:
%%time
samp_rfc_model = rfc(random_state=42)
samp_rfc_model.fit(X_train, y_train)
print(f" Training accuracy score : {samp_rfc_model.score(X_train, y_train)}")
print(f" Training accuracy score : {samp_rfc_model.score(X_val, y_val)}")

In [None]:
%%time
samp_lgb_model = lgb(verbosity=2)
samp_lgb_model.fit(X_train, y_train)
print(f" Training accuracy score : {samp_lgb_model.score(X_train, y_train)}")
print(f" Training accuracy score : {samp_lgb_model.score(X_val, y_val)}")

### Fixing final models

## XGBoost Classifier

In [None]:
# Base XGBoost estimator (library logging silenced) that the grid search will tune.
xgb_model = xgb(verbosity=0)
xgb_model

### Tuning final model with parameters

In [None]:
# Hyper-parameter grid explored for the XGBoost classifier.
xgb_params = {
    "n_estimators" : [60, 80, 100],
    "learning_rate" : [0.1, 0.15, 0.17, 0.2],
    "reg_alpha" : [0, 0.1],
    "reg_lambda" : [0, 0.1]
}

# Wrap a fresh base estimator rather than the current value of `xgb_model`:
# re-running this cell would otherwise wrap the previous GridSearchCV in
# another GridSearchCV.
xgb_model = gcv(xgb(verbosity=0), xgb_params)
xgb_model

## RandomForestClassifier

In [None]:
# Base random-forest estimator (seeded) that the grid search will tune.
rfc_model = rfc(random_state=42)
rfc_model

In [None]:
# Hyper-parameter grid explored for the random forest.
rfc_params = {
    "n_estimators" : [50, 60, 70, 80, 90, 100],
    "min_samples_leaf" : [1, 3, 5],
    # "auto" was removed from scikit-learn in 1.3; for classifiers it was an
    # alias of "sqrt", so "sqrt" keeps the same search space.
    "max_features" : ["sqrt", "log2"]
}
# Wrap a fresh base estimator rather than the current value of `rfc_model`:
# re-running this cell would otherwise wrap the previous GridSearchCV in
# another GridSearchCV.
rfc_model = gcv(rfc(random_state=42), rfc_params)
rfc_model

## LightGBM Classifier

In [None]:
# Base LightGBM estimator with default settings that the grid search will tune.
lgb_model = lgb()
lgb_model

In [None]:
# Hyper-parameter grid explored for the LightGBM classifier.
lgb_params = {
    "n_estimators": [60, 80, 100],
    # LightGBM requires learning_rate > 0; a value of 0 makes every fit with
    # that candidate fail, so the smallest candidate is 0.05 instead.
    "learning_rate" : [0.05, 0.1, 0.2],
    "min_split_gain" : [0, 0.01, 0.02]
}
# Wrap a fresh base estimator rather than the current value of `lgb_model`:
# re-running this cell would otherwise wrap the previous GridSearchCV in
# another GridSearchCV.
lgb_model = gcv(lgb(), lgb_params)
lgb_model

## Model Training & Metric Validation

In [None]:
def metric_validation(model, filename):
    """Save a fitted search's CV results and print its best parameters and scores.

    Args:
        model: fitted search object exposing ``cv_results_``, ``best_params_``
            and ``score(X, y)``.
        filename: output path without extension; ``.csv`` is appended.

    Returns:
        DataFrame built from ``model.cv_results_`` (the same table written to disk).

    Reads the notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val``.
    """
    cv_table = pd.DataFrame(model.cv_results_)
    csv_path = f"{filename}.csv"
    cv_table.to_csv(csv_path, index = False)

    print(f"Best params : {model.best_params_}")
    train_score = model.score(X_train, y_train)
    print(f"Training Accuracy Score : {train_score}")
    val_score = model.score(X_val, y_val)
    print(f"Validation Accuracy Score : {val_score}")
    return cv_table

### XGB Classifier : 

In [None]:
# Fit the XGBoost search on the training split, report its scores, and
# show the first two rows of the CV table transposed for readability.
xgb_model.fit(X_train, y_train)
xgb_training_results = metric_validation(xgb_model, "xgb_training")
xgb_training_results.head(2).T

### Random Forest Classifier :

In [None]:
# Fit the random-forest search on the training split, report its scores, and
# show the first two rows of the CV table transposed for readability.
rfc_model.fit(X_train, y_train)
rfc_training_results = metric_validation(rfc_model, "rfc_training")
rfc_training_results.head(2).T

### LightGBM Classifier :

In [None]:
# Fit the LightGBM search on the training split, report its scores, and
# show the first two rows of the CV table transposed for readability.
lgb_model.fit(X_train, y_train)
lgb_training_results = metric_validation(lgb_model, "lgb_training")
lgb_training_results.head(2).T

## Submission Preparation

#### Model Prediction

In [None]:
def predict_fn(model, filename):
    """Predict on the notebook-level ``test`` features and write a submission CSV.

    Args:
        model: fitted estimator exposing ``predict``.
        filename: output path without extension; ``.csv`` is appended.

    Returns:
        DataFrame with ``PassengerId`` (taken from ``test_df``) and the
        predictions converted to booleans in ``Transported``.
    """
    predictions = model.predict(test)
    op_df = pd.DataFrame({"PassengerId" : test_df.PassengerId, "Transported" : predictions})
    # The target is cast to True/False before the frame is written out.
    op_df["Transported"] = op_df["Transported"].map(bool)
    csv_path = f"{filename}.csv"
    op_df.to_csv(csv_path, index = False)
    return op_df

In [None]:
# Write the tuned XGBoost predictions to xgb.csv and preview them.
xgb_pred = predict_fn(xgb_model, "xgb")
xgb_pred.head().T

In [None]:
# Write the tuned random-forest predictions to rfc.csv and preview them.
rfc_pred = predict_fn(rfc_model, "rfc")
rfc_pred.head().T

In [None]:
# Write the tuned LightGBM predictions to lgb.csv and preview them.
lgb_pred = predict_fn(lgb_model, "lgb")
lgb_pred.head().T

### Sample Model Predictions

In [None]:
# Write the dry-run (untuned) XGBoost predictions to samp_xgb.csv and preview them.
samp_xgb_pred = predict_fn(samp_xgb_model, "samp_xgb")
samp_xgb_pred.head().T

In [None]:
# Write the dry-run (untuned) random-forest predictions to samp_rfc.csv and preview them.
samp_rfc_pred = predict_fn(samp_rfc_model, "samp_rfc")
samp_rfc_pred.head().T

In [None]:
# Write the dry-run (untuned) LightGBM predictions to samp_lgb.csv and preview them.
samp_lgb_pred = predict_fn(samp_lgb_model, "samp_lgb")
samp_lgb_pred.head().T