In [None]:
# installing necessary libraries
from IPython.display import clear_output
!pip3 install -U lazypredict
!pip3 install -U pandas #Upgrading pandas

clear_output()

In [None]:
!pip install scikit-learn -U

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import optuna 
from optuna import integration
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
from sklearn.metrics import classification_report
from lazypredict.Supervised import LazyClassifier
from scipy import stats

import os
import warnings
# silence all library warnings so they do not clutter the cell outputs
# NOTE(review): this also hides deprecation and convergence warnings
warnings.filterwarnings("ignore")

# setting plot themes and colour scheme
# (the palette is set after the style so that it is the one that stays active)
plt.style.use('ggplot')
sns.set_palette('tab10')

In [None]:
# loading the data
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')
# sample_sub = pd.read_csv('./data/sample_submission.csv')

In [None]:
# printing dimensions of the train, test and sample submission
print(f'Train: {train.shape}')
print(f'Test: {test.shape}')
# print(f'Sample Submission: {sample_sub.shape}')

In [None]:
# looking at the train dataset (first three rows)
train.head(3)

In [None]:
# looking at the test dataset (head() without an argument)
test.head()

In [None]:
# info about the train data: dtypes and non-null counts
train.info()

In [None]:
# info about the test data: dtypes and non-null counts
test.info()

In [None]:
# basic statistics of train data
train.describe()

In [None]:
# basic statistics of test data
test.describe()

#### Null Values

In [None]:
# null values in train data
train.isnull().sum().sort_values(ascending=False)

In [None]:
# null values in test data
test.isnull().sum().sort_values(ascending=False)

### Duplicates

In [None]:
# Count fully duplicated rows and report them as a share of all rows.
train_duplicates = train.duplicated().sum()
test_duplicates = test.duplicated().sum()

# shape[0] is the number of rows; dividing by shape[1] (the number of columns)
# would print a meaningless percentage
print(f'Duplicates in Training Set: {train_duplicates} ({train_duplicates / train.shape[0]:.1%})')
print(f'Duplicates in Test Set: {test_duplicates} ({test_duplicates / test.shape[0]:.1%})')

### Cardinality of Features

In [None]:
# number of distinct values per column, for train and then for test
print('Unique values of Features in Training Set:', train.nunique(), sep='\n')
print('\nUnique values of Features in Test Set:', test.nunique(), sep='\n')

## EDA and Feature Engineering

### Checking the Distribution of the Features

In [None]:
# column names of the train data as a plain list
train.columns.tolist()

***HomePlanet***

In [None]:
# checking for null values
train['HomePlanet'].isna().sum(), test['HomePlanet'].isna().sum()

In [None]:
# checking the distribution of the feature
train['HomePlanet'].value_counts()

In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(x='HomePlanet', data=train.fillna('Missing'))
plt.title('Home Planet of Passengers')
plt.show();

- We can see that Earth has by far the most passengers, roughly twice as many as each of the other planets.
- We also have 201 missing values which need to be taken care of.

***CryoSleep***

In [None]:
# checking null values
train['CryoSleep'].isna().sum(), test['CryoSleep'].isna().sum()

In [None]:
# checking sitribution
train['CryoSleep'].value_counts()

In [None]:
plt.figure()
sns.countplot(x='CryoSleep', data=train.fillna('Missing'))
plt.title('CryoSleep of Passengers')
plt.show();

***Cabin***

In [None]:
# checking null values for cabin
train['Cabin'].isna().sum()

In [None]:
train['Cabin'].value_counts()

- Can be divided into:
    - deck, the first letter in the cabin feature
    - side, the last letter in the cabin feature


***Destination***

In [None]:
train['Destination'].isna().sum(), test['Destination'].isna().sum()

In [None]:
train['Destination'].value_counts()

In [None]:
plt.figure()
sns.countplot(x='Destination', data=train.fillna('Missing'))
plt.title('Destination of Passengers')
plt.show();

***Age***

In [None]:
# statistics
train['Age'].describe()

In [None]:
# checking the distribution
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(x='Age', data=train, kde=True, hue="Transported", ax=ax[0])
sns.boxplot(x='Age', data=train, ax=ax[1])
fig.suptitle('Age Distribution', fontsize=16)
plt.show();

***VIP***

In [None]:
# checking null values
train['VIP'].isna().sum()

In [None]:
# checking distribution
train['VIP'].value_counts()

In [None]:
plt.figure()
sns.countplot(x='VIP', data=train)
plt.title('Whether passenger is a VIP or not?')
plt.show();

- There are only a very few passengers who are VIP.

***RoomService - Amount Spent***

In [None]:
# summary statistics of the RoomService amounts in the train data
train['RoomService'].describe()

***Expenditure Features:***

In [None]:
# KDE of each expenditure column, one panel per column on a shared-y 2x3 grid
fig, axs = plt.subplots(nrows=2, ncols=3, sharey=True, figsize=(10, 7))
spend_columns = ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
for panel, column in zip(axs.flat, spend_columns):
    sns.kdeplot(x=column, data=train, ax=panel)

# the grid has six slots but only five columns are drawn:
# remove every panel that stayed empty
for ax in axs.flat:
    if ax.has_data():
        continue
    fig.delaxes(ax)

fig.suptitle('Distribution of all Expenditure Features')
plt.tight_layout()
plt.show();

In [None]:
# checking to see expenditures are made by VIPs or Non-VIPs:
fig, axs = plt.subplots(nrows=2, ncols=3, sharey=True, figsize=(10, 7))
spend_columns = ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')

# draw every Non-VIP curve first, then overlay the VIP curves on the same panels
for vip_value, legend_label in ((0, 'Non-VIP'), (1, 'VIP')):
    passengers = train[train['VIP'] == vip_value]
    for panel, column in zip(axs.flat, spend_columns):
        sns.kdeplot(x=column, data=passengers, ax=panel, label=legend_label, fill=True, alpha=0.5)

# remove every panel that stayed empty
for ax in axs.flat:
    if ax.has_data():
        continue
    fig.delaxes(ax)

fig.suptitle('Distribution Expenditures of VIPs and Non-VIPS')
# one figure-level legend, built from the handles of a single panel
lines, labels = axs[0][1].get_legend_handles_labels()
fig.legend(lines, labels)
plt.tight_layout()
plt.show();

***Splitting Cabin into two features: Deck and Side.***

In [None]:
train[['Deck', 'Side']] = train['Cabin'].str.split('/', expand=True).loc[:][[0,2]]
test[['Deck', 'Side']] = test['Cabin'].str.split('/', expand=True).loc[:][[0, 2]]

In [None]:
# plotting the count of Deck
plt.figure()
sns.countplot(x='Deck', hue='Transported', data=train)
plt.title('Number of Passengers in each Deck')
plt.show();

In [None]:
# plotting the count of Sides
plt.figure()
sns.countplot(x='Side', hue='Transported', data=train)
plt.title('Number of Passengers on each Side')
plt.show();

***Splitting the PassengerID into: Groups and No. of Passengers:***

In [None]:
# feature: Group - the part of PassengerId before the first '_', as an integer
train['Group'] = train['PassengerId'].str.split('_').str[0].astype(int)
test['Group'] = test['PassengerId'].str.split('_').str[0].astype(int)

In [None]:
# feature GroupSize: number of passengers (train and test together) sharing a Group.
# The counts are computed once and looked up per row; the previous lambda rebuilt
# the concatenation and value_counts() for every single row.
group_sizes = pd.concat([train['Group'], test['Group']]).value_counts()
train['GroupSize'] = train['Group'].map(group_sizes)
test['GroupSize'] = test['Group'].map(group_sizes)

In [None]:
# passengers per group size, split by the target
fig, ax = plt.subplots()
sns.countplot(x='GroupSize', hue='Transported', data=train, ax=ax)
plt.show();

***We can create another feature - Travelling Solo***

In [None]:
# 1 - Solo, 0 - not travelling solo
train['Solo'] = np.where(train['GroupSize'] == 1, 1, 0)
test['Solo'] = np.where(test['GroupSize'] == 1, 1, 0)

In [None]:
# plotting distribution of passengers travelling solo
plt.figure(figsize=(10, 7))
sns.countplot(x='Solo', data=train, hue='Transported')
plt.xticks([0, 1], ['Not Solo', 'Solo'])
plt.title('Passengers Travelling Solo or Not')
plt.xlabel('')
plt.show();

***Binning the Age Feature***

In [None]:
# Age brackets for the EDA plots. pd.cut builds right-closed intervals by default,
# so the lowest edge must lie below 0: with an edge of 0 every passenger aged 0
# falls outside all bins and gets NaN. The edges now match the ones used again
# after imputation.
labels = ['0-18', '19-30', '30-50', '>50']
bins = [-1, 18, 30, 50, 79]

train['Age_bins'] = pd.cut(train['Age'], bins=bins, labels=labels)
test['Age_bins'] = pd.cut(test['Age'], bins=bins, labels=labels)

In [None]:
# passengers per age bracket, split by the target
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(x='Age_bins', data=train, hue='Transported', ax=ax)
ax.set_title('Age Distribution - People Transported')
ax.set_xlabel('Age Groups')
ax.set_ylabel('Count')
plt.show();

***Expenditure - Whether Person spent money or not.***

In [None]:
# calculating the total expenditure

train['TotalExpenditure'] = train['RoomService'] + train['FoodCourt'] + train['ShoppingMall'] + train['Spa'] + train['VRDeck']
test['TotalExpenditure'] = test['RoomService'] + test['FoodCourt'] + test['ShoppingMall'] + test['Spa'] + test['VRDeck']

In [None]:
# now creating the feature

train['DidSpend'] = train['TotalExpenditure'].apply(lambda x: 0 if x == 0 else 1)
test['DidSpend'] = test['TotalExpenditure'].apply(lambda x: 0 if x == 0 else 1)

In [None]:
# spenders vs. non-spenders, split by the target
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(x='DidSpend', data=train, hue='Transported', ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Did Not Spend', 'Did Spend'])
ax.set_xlabel('')
ax.set_title('Whether Person Spent any Money or Not')
plt.show();

- The majority of the passengers who did not spend any money were transported.
- In contrast, the majority of the passengers who spent money were not transported.

In [None]:
# density of the total expenditure among passengers flagged as spenders, split by the target
fig, ax = plt.subplots(figsize=(10, 7))
spenders = train[train['DidSpend'] == 1]
sns.kdeplot(x='TotalExpenditure', data=spenders, hue='Transported', ax=ax)
ax.set_title('Distribution of Expenditure (Excluding 0)')
ax.set_xlabel('Total Expenditure')
plt.show();

In [None]:
# Correlation heatmap of the numeric and boolean columns.
# DataFrame.corr() in pandas >= 2.0 raises on string/categorical columns
# instead of silently skipping them, so they are filtered out explicitly.
plt.figure(figsize=(10, 7))
numeric_train = train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), annot=True)
plt.title('Correlation Matrix')
plt.show();

In [None]:
# columns of the train data after the feature engineering above
train.columns

## Missing Values

In [None]:
# null values per column in the train data
train.isna().sum()

In [None]:
# null values per column in the test data
test.isna().sum()

**Home Planet:**
- We can either treat the NULL values as a category of their own,
- We can check for patterns within missing data - Joint Distribution of Features.

In [None]:
# treating NULL values as a new category.
# fillna() recognises every missing-value marker, whereas the identity test
# `x is np.nan` only matches that one specific NaN object.
train['HomePlanet'] = train['HomePlanet'].fillna('NA')
test['HomePlanet'] = test['HomePlanet'].fillna('NA')

In [None]:
# remaining missing HomePlanet values in (train, test)
tuple(frame['HomePlanet'].isna().sum() for frame in (train, test))

***Categorical Features: CryoSleep, Destination, VIP, Deck, Side***
- We will start by filling the missing values in these features with the most frequent value, i.e. the MODE.

In [None]:
# mode imputation for the categorical columns
cat_features = ['CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']
imputer = SimpleImputer(strategy='most_frequent')

# learn the modes on train and fill train in one step,
# then fill test with those same train modes
train[cat_features] = imputer.fit_transform(train[cat_features])
test[cat_features] = imputer.transform(test[cat_features])

***Numerical Features: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck***

In [None]:
# median imputation for the numerical columns
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
imputer = SimpleImputer(strategy='median')

# learn the medians on train and fill train in one step,
# then fill test with those same train medians
train[numerical_features] = imputer.fit_transform(train[numerical_features])
test[numerical_features] = imputer.transform(test[numerical_features])

***Filling the Age Bins after imputing missing values in AGE feature:***

In [None]:
# re-derive the age brackets from the imputed Age column;
# the lowest edge is -1 so that an age of 0 lands in the first bracket
labels = ['0-18', '19-30', '30-50', '>50']
bins = [-1, 18, 30, 50, 79]

train['Age_bins'] = pd.cut(x=train['Age'], bins=bins, labels=labels)
test['Age_bins'] = pd.cut(x=test['Age'], bins=bins, labels=labels)

***Filling Total Expenditure and DidSpend after filling the Expenditure Features:***

In [None]:
# recompute the total spend from the imputed amenity columns
train['TotalExpenditure'] = (
    train['RoomService']
    + train['FoodCourt']
    + train['ShoppingMall']
    + train['Spa']
    + train['VRDeck']
)
test['TotalExpenditure'] = (
    test['RoomService']
    + test['FoodCourt']
    + test['ShoppingMall']
    + test['Spa']
    + test['VRDeck']
)

# DidSpend: 0 where the total is exactly 0, otherwise 1
train['DidSpend'] = train['TotalExpenditure'].ne(0).astype(int)
test['DidSpend'] = test['TotalExpenditure'].ne(0).astype(int)

***Dropping Features:***
- PassengerId (train only; the test set keeps it for the submission file)
- Name
- Cabin
- Group

In [None]:
# drop the identifier-like columns; test keeps PassengerId, train does not
train.drop(columns=['Name', 'Cabin', 'PassengerId', 'Group'], inplace=True)
test.drop(columns=['Name', 'Cabin', 'Group'], inplace=True)

## Preprocessing

In [None]:
# columns that remain in the train data after dropping features
train.columns

In [None]:
# number of distinct values per remaining column
train.nunique()

In [None]:
# Compare the raw distribution of every expenditure feature (left column)
# with its Box-Cox transformed distribution (right column).
num_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalExpenditure']

plt.figure(figsize=(15, 25))
for i, col in enumerate(num_cols):
    # one grid row per feature; the row count follows the length of num_cols
    plt.subplot(len(num_cols), 2, 2*i + 1)
    sns.kdeplot(x=col, data=train, color='yellow')
    plt.title(f'{col}')

    plt.subplot(len(num_cols), 2, 2*i + 2)
    # Box-Cox needs strictly positive input, hence the shift by 1
    sns.kdeplot(stats.boxcox(1 + train[col])[0], color='green')
    plt.title(f'{col} (Box-Cox transform)')

plt.tight_layout()
plt.show();

***Applying the Boxcox transformation to the numerical features:***

In [None]:
# Box-Cox transform of the expenditure features.
# The lambda is estimated on train only and reused for test, so both sets go
# through the same transformation; fitting a second lambda on test would put
# the two sets on different scales.
# NOTE: this cell overwrites the columns, so run it only once per kernel session.
for col in num_cols:
    # shift by 1 because Box-Cox needs strictly positive input
    train[col], fitted_lambda = stats.boxcox(1 + train[col])
    test[col] = stats.boxcox(1 + test[col], lmbda=fitted_lambda)

***Scaling the numerical features:***

In [None]:
# columns that are standardised
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalExpenditure']

# standardscaler
sc = StandardScaler()

# learn the scaling statistics on train and scale train in one step,
# then scale test with those same train statistics
train[numerical_features] = sc.fit_transform(train[numerical_features])
test[numerical_features] = sc.transform(test[numerical_features])

***Applying Encoding to binary features:***

In [None]:
# ordinal encoding to for binary features:
binary_cols = ['CryoSleep', 'VIP', 'Side', 'Solo', 'DidSpend']

In [None]:
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(train[binary_cols])

# transforming the features
train[binary_cols] = ordinal_encoder.transform(train[binary_cols])
test[binary_cols] = ordinal_encoder.transform(test[binary_cols])

***Applying OneHotEncoding to categorical features:***

In [None]:
# categorical features that are one-hot encoded in the following cells
cat_cols = ['HomePlanet', 'Destination', 'Deck', 'Age_bins']

In [None]:
# One-hot encode train and test together so that both get the same dummy
# columns and the same dropped reference category. Encoding them separately
# yields mismatched columns as soon as a category is missing from one set.
combined_onehot = pd.get_dummies(
    pd.concat([train[cat_cols], test[cat_cols]], keys=['train', 'test']),
    drop_first=True,
)
# split back; xs() removes the outer key and restores each frame's own index
train_onehot = combined_onehot.xs('train')
test_onehot = combined_onehot.xs('test')

In [None]:
# attaching the onehotencoded features
train = pd.concat([train, train_onehot], axis=1)
test = pd.concat([test, test_onehot], axis=1)

In [None]:
# dropping the original columns
train.drop(cat_cols, axis=1, inplace=True)
test.drop(cat_cols, axis=1, inplace=True)

***Applying Label Encoding to the Target Variable***

In [None]:
# encode the target as integer class labels; the fitted encoder is kept
label_encoder = LabelEncoder()
train['Transported'] = label_encoder.fit_transform(train['Transported'])

## Modeling

### Splitting the Train data - Train and Validation Sets

In [None]:
train.columns, train.shape

In [None]:
X_train = train.drop(['Transported'], axis=1)
y_train = train[['Transported']]

In [None]:
X_train.shape, y_train.shape

In [None]:
# now splitting into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train,
                                                     random_state=42)

In [None]:
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

### Modeling using LazyPredict - To get an Overview

In [None]:
# using lazypredict classifier
clf = LazyClassifier(
    verbose=0,
    classifiers='all',
    ignore_warnings=True,
    custom_metric=None, 
    predictions=False,
    random_state=42
)

In [None]:
# fitting the training data to the models
models, predictions = clf.fit(X_train, X_valid, y_train, y_valid)

In [None]:
# printing the models
models

- We see that the Ensemble Techniques like - Gradient Boosting, Extreme Gradient Boosting, RandomForest, AdaBoost have higher accuracy scores than others.

### Baseline Model - Logistic Regression

In [None]:
# logistic regression model
lg_clf = LogisticRegression(random_state=42, solver='liblinear')
lg_clf.fit(X_train, y_train)

In [None]:
# making predictions
y_pred = lg_clf.predict(X_valid)

In [None]:
# scoring the accuracy, roc_auc score
acc_score_lg_clf = accuracy_score(y_valid, y_pred)
roc_score_lg_clf = roc_auc_score(y_valid, lg_clf.predict_proba(X_valid)[:, 1])

In [None]:
# classification report
report_lg = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_lg)

- Now we have a baseline classifier with an accuracy of 75.8481% on our Validation Set.
- Let's see if we can improve this further using Hyperparameter Tuning on Logistic Regression itself, and then move on to Ensemble Techniques.

***Hyperparameter Tuning - Logistic Regression***

In [None]:
# Search space for the logistic-regression tuning.
# - the solver is spelled 'liblinear' ('linlinear' is not a valid solver)
# - "no penalty" is expressed as None; current scikit-learn releases no longer
#   accept the string 'none'
# NOTE(review): not every penalty/solver pair is valid; presumably the search
# scores such draws as failed fits - confirm against error_score.
param_distributions = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': np.logspace(-4, 4, 20),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 1000, 1500, 2000]
}

In [None]:
# setting the RandomizedSearchCV
# the estimator is seeded like the baseline model so that the stochastic
# solvers ('sag', 'saga') give reproducible results
lr_model = LogisticRegression(random_state=42)
randomizedsearch_cv = RandomizedSearchCV(
    lr_model,
    param_distributions=param_distributions,
    cv=3,
    n_iter=150,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

In [None]:
randomized_results = randomizedsearch_cv.fit(X_train, y_train)

In [None]:
randomized_results

In [None]:
# getting the best estimator
randomizedsearch_cv.best_estimator_, randomizedsearch_cv.best_score_

In [None]:
# getting the predictions from best estimator
best_lr = randomizedsearch_cv.best_estimator_

y_pred = best_lr.predict(X_valid)
report_best_lr = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_best_lr)

### RandomForest

In [None]:
# randomforest model
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

In [None]:
# making predictions
y_pred = rf_clf.predict(X_valid)

In [None]:
# classification report
report_rf = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_rf)

***Hyperparameter Tuning - Random Forest Classifier***

In [None]:
# Search space for the random-forest tuning.
# 'auto' is no longer a valid max_features value in current scikit-learn;
# for classifiers it was an alias of 'sqrt', so dropping it loses nothing.
param_distributions = {
    'n_estimators': [int(x) for x in np.linspace(200, 2000, 10)],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [int(x) for x in np.linspace(100, 1000, 10)],
    'min_samples_split': [2, 3, 4, 7, 9],
    'min_samples_leaf': [1, 2, 4, 6, 8]
}

In [None]:
# setting the RandomizedSearchCV
# the forest is seeded like the baseline model; an unseeded forest makes the
# cross-validation scores differ from run to run
rf_model = RandomForestClassifier(random_state=42)
randomizedsearch_cv = RandomizedSearchCV(
    rf_model,
    param_distributions=param_distributions,
    cv=5,
    n_iter=200,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

In [None]:
# fitting on the training set
randomizedsearch_cv.fit(X_train, y_train)

In [None]:
randomizedsearch_cv.best_estimator_, randomizedsearch_cv.best_score_

In [None]:
# getting the best estimator
best_rf = randomizedsearch_cv.best_estimator_

y_pred = best_rf.predict(X_valid)
report_best_rf = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_best_rf)

### XGBoost Classifier

In [None]:
# xgboost model
xgb_clf = XGBClassifier(random_state=42)
xgb_clf.fit(X_train, y_train)

In [None]:
# making predictions
y_pred = xgb_clf.predict(X_valid)

In [None]:
# classification report
report_xgb = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_xgb)

***Hyperparameter Tuning - XGBClassifier***

In [None]:
# search space for the XGBoost tuning
param_distributions = dict(
    max_depth=[3, 5, 6, 10, 15, 20],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    # fractions from the start value up to (but excluding) 1.0, in steps of 0.1
    subsample=np.arange(0.5, 1.0, 0.1),
    colsample_bytree=np.arange(0.4, 1.0, 0.1),
    colsample_bylevel=np.arange(0.4, 1.0, 0.1),
    n_estimators=[100, 250, 500, 1000],
)

In [None]:
# setting the RandomizedSearchCV
xgb_model = XGBClassifier(random_state=42)
randomizedsearch_cv = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    cv=3,
    n_iter=200,
    verbose=1,
    n_jobs=-1,
    # seed the sampling of candidates, as in the logistic-regression and
    # random-forest searches, so that the search is reproducible
    random_state=42
)

In [None]:
# fitting the training data 
randomizedsearch_cv.fit(X_train, y_train)

In [None]:
randomizedsearch_cv.best_estimator_, randomizedsearch_cv.best_score_

In [None]:
# getting the best estimator
best_xgb = randomizedsearch_cv.best_estimator_ 

y_pred = best_xgb.predict(X_valid)
report_best_xgb = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_best_xgb)

### LGBM Classifier

In [None]:
#lightgbm model
lgbm_clf = lgb.LGBMClassifier(random_state=42)
lgbm_clf.fit(X_train, y_train)

In [None]:
# making predictions
y_pred = lgbm_clf.predict(X_valid)

In [None]:
# classification report 
report_lgbm = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_lgbm)

***Hyperparameter Tuning - LGBMClassifier***

In [None]:
# search space for the LightGBM tuning

parameters = dict(
    num_leaves=[5, 8, 10, 15, 20, 30, 40, 50],
    min_child_samples=[10, 50, 100, 200, 300, 500],
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    max_depth=[-1, 5, 10, 20],
    learning_rate=[0.05, 0.1, 0.2],
    reg_alpha=[0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 1e-1, 1, 5, 10, 20, 50, 100],
    n_estimators=[100, 200, 400, 500, 1000, 1500, 2000],
)

In [None]:
# setting the RandomizedSearchCV
lgbm_model = lgb.LGBMClassifier(
    random_state=42,
    # `silent` is no longer an LGBMClassifier argument in current LightGBM
    # releases; verbose=-1 is the supported way to suppress its log output
    verbose=-1,
    metric='None',
    n_jobs=-1,
)
randomizedsearch_cv = RandomizedSearchCV(
    estimator=lgbm_model,
    param_distributions=parameters,
    scoring='accuracy',
    cv=5,
    n_iter=200,
    verbose=1,
    n_jobs=-1,
    # seed the sampling of candidates so that the search is reproducible
    random_state=42,
)

In [None]:
# fitting the training data 
randomizedsearch_cv.fit(X_train, y_train)

In [None]:
randomizedsearch_cv.best_estimator_, randomizedsearch_cv.best_score_

In [None]:
# getting the best estimator
best_lgbm = randomizedsearch_cv.best_estimator_ 

y_pred = best_lgbm.predict(X_valid)
report_best_lgbm = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_best_lgbm)

### Ensemble - Voting Classifier

Combining the Best Performing Models

In [None]:
# importing and creating hyperparameters for the model
from sklearn.ensemble import VotingClassifier

estimators = [
    ('randomforest', best_rf),
    ('xgboost', best_xgb),
    ('lightgbm', best_lgbm)
]
voting = 'soft'

In [None]:
# creating the model
ensemble_model = VotingClassifier(
    estimators=estimators,
    voting=voting
)
ensemble_model.fit(X_train, y_train)

In [None]:
# making the predictions
y_pred = ensemble_model.predict(X_valid)

report_ensemble = classification_report(y_valid, y_pred, output_dict=True)
pprint(report_ensemble)

## Submissions

LightGBM was the best performing single model; the submission below is generated with the soft-voting ensemble, which combines it with the tuned Random Forest and XGBoost models.

In [None]:
sample_sub = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
sample_sub.head()

In [None]:
# final predictions
final_predictions = ensemble_model.predict(test.drop(['PassengerId'], axis=1))
passenger_ids = test['PassengerId']

In [None]:
final_sub = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Transported': final_predictions
})

In [None]:
final_sub["Transported"] = final_sub['Transported'].replace({0:'False', 1:'True'})

In [None]:
final_sub.head()

In [None]:
final_sub.shape

In [None]:
# saving to csv
final_sub.to_csv('submission.csv', index=False)