In [None]:
import sklearn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier

from lightgbm import LGBMClassifier
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch on seaborn's default plot styling for the plots below.
sns.set()

# Import test and train datasets

In [None]:
# Read the train and test CSVs from the competition input directory.
train_csv = '../input/spaceship-titanic/train.csv'
test_csv = '../input/spaceship-titanic/test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Keep the label column at hand; it is encoded later on.
target = df_train['Transported']

In [None]:
# Preview the first rows only instead of dumping the whole frame.
df_train.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame.
df_train.describe()

Null values distribution

In [None]:
# Visualise where values are missing: one heatmap cell per (row, column) entry.
missing_mask = df_train.isna()
plt.figure(figsize=(10, 10))
sns.heatmap(data=missing_mask, cmap='terrain')

In [None]:
# Data type of every column in the training frame.
df_train.dtypes

Categorical data distribution

In [None]:
# Class balance of the label column.
sns.countplot(data=df_train, x='Transported')

In [None]:
# One count plot per categorical feature, laid out side by side.
fig, ax = plt.subplots(1, 4, figsize=(20, 6))
for axis, feature in zip(ax, ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']):
    sns.countplot(x=feature, data=df_train, ax=axis)
plt.show()

Let's see how many passengers were successfully transported depending on CryoSleep.

In [None]:
# Transported counts, one panel per CryoSleep value.
sns.catplot(data=df_train, kind='count', x='Transported', col='CryoSleep')

Take-away: Passengers who were in CryoSleep have a high chance of being successfully transported.

In [None]:
# Number of transported passengers per CryoSleep value.
df_train.groupby('CryoSleep')[['Transported']].sum()

In [None]:
# Share of transported passengers, first without CryoSleep and then with it.
for in_cryosleep in (False, True):
    transported = df_train.loc[df_train['CryoSleep'] == in_cryosleep, 'Transported']
    print(transported.sum() / transported.count())

33 % of passengers without CryoSleep have been successfully transported 
82 % of passengers with CryoSleep have been successfully transported 

In [None]:
# Transported counts, one panel per destination.
sns.catplot(data=df_train, kind='count', x='Transported', col='Destination')

In [None]:
# Number of transported passengers per destination.
df_train.groupby('Destination')[['Transported']].sum()

In [None]:
# Share of transported passengers for each destination, printed one per line.
for destination in ('55 Cancri e', 'PSO J318.5-22', 'TRAPPIST-1e'):
    transported = df_train.loc[df_train['Destination'] == destination, 'Transported']
    print(transported.sum() / transported.count())

Take-away: Passengers that travelled to '55 Cancri e' have a slightly better chance of being transported.

In [None]:
# Transported counts, one panel per home planet.
sns.catplot(data=df_train, kind='count', x='Transported', col='HomePlanet')

In [None]:
# Number of transported passengers per home planet.
df_train.groupby('HomePlanet')[['Transported']].sum()

In [None]:
# Share of transported passengers for each home planet, printed one per line.
for planet in ('Earth', 'Europa', 'Mars'):
    transported = df_train.loc[df_train['HomePlanet'] == planet, 'Transported']
    print(transported.sum() / transported.count())

Take-away: Passengers that travelled from 'Europa' have a better chance of being transported.

Correlation matrix

In [None]:
# Correlate only the numeric/boolean columns. Calling `DataFrame.corr()` on a frame
# that still holds string columns raises in pandas >= 2.0, where non-numeric columns
# are no longer dropped silently; selecting the columns explicitly works on every version.
numeric_features = df_train.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_features.corr(), annot=True, cmap='viridis')

Age distribution 

In [None]:
# Histogram of the Age column.
df_train['Age'].hist()

In [None]:
# Age distribution split by the label.
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_train, x='Transported', y='Age')

# Data cleaning

In [None]:
# Remove the two columns that are dropped from both splits before cleaning.
dropped_columns = ['PassengerId', 'Name']
df_train1, df_test1 = (frame.drop(columns=dropped_columns) for frame in (df_train, df_test))

In [None]:
# Number of missing values per column.
df_train1.isna().sum()

Categorical values cleaning 

In [None]:
# Names of the object-dtype training columns that contain at least one missing value.
object_frame = df_train1.select_dtypes('object')
has_missing = object_frame.isna().any()
ob_col = object_frame.columns[has_missing].tolist()

In [None]:
# Fill missing categorical values with the most frequent category.
# The fill value is learned from the training split only and reused for the test
# split, so both splits are imputed with the same statistic (the numeric imputer
# below is likewise fitted on train and applied to test).
for column in ob_col:
    fill_value = df_train1[column].mode()[0]
    df_train1[column] = df_train1[column].fillna(fill_value)
    df_test1[column] = df_test1[column].fillna(fill_value)

In [None]:
# Preview the first rows only instead of dumping the whole frame.
df_train1.head()

Numeric value cleaning

In [None]:
# Work on copies so the frames from the previous step stay untouched.
df_train2, df_test2 = df_train1.copy(), df_test1.copy()

In [None]:
# Split each frame into the numeric part (to be imputed) and the object/bool part
# (carried through unchanged).
train_object = df_train2.select_dtypes(['object', 'bool'])
test_object = df_test2.select_dtypes(['object', 'bool'])
train_num = df_train2.select_dtypes(np.number)
test_num = df_test2.select_dtypes(np.number)

# Fit the imputer on the training numerics, then apply the fitted imputer to the test numerics.
knn = KNNImputer()
train_num_transformed = knn.fit_transform(train_num)
test_num_transformed = knn.transform(test_num)

# The imputer returns plain arrays; wrap them back into frames with the original labels.
train_num = pd.DataFrame(train_num_transformed, index=df_train.index, columns=train_num.columns)
test_num = pd.DataFrame(test_num_transformed, index=df_test.index, columns=test_num.columns)

In [None]:
# Put the imputed numeric columns and the object/bool columns back side by side.
df_train3 = pd.concat([train_num, train_object], axis='columns')
df_test3 = pd.concat([test_num, test_object], axis='columns')

In [None]:
# Preview the first rows only instead of dumping the whole frame.
df_train3.head()

In [None]:
# Preview the first rows only instead of dumping the whole frame.
df_test3.head()

# Feature engineering

In [None]:
# Total spend = the five spending columns added up left to right, for both splits.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for frame in (df_train3, df_test3):
    frame['total_spend'] = sum(frame[name] for name in spend_columns)

# Encoding categorical features

In [None]:
# Columns that are passed to `label_encoder` for integer encoding.
label_cols = ["HomePlanet", "CryoSleep", "Cabin", "Destination" ,"VIP"]

def label_encoder(train, test, columns):
    """Integer-encode categorical columns consistently across train and test.

    For every column a single ``LabelEncoder`` is fitted on the values of both
    frames together and then applied to each frame, so a given category always
    maps to the same integer code in ``train`` and in ``test``. Fitting a separate
    encoder per frame would assign codes by each frame's own sorted vocabulary and
    silently give the same category different codes whenever the vocabularies differ.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column listed in ``columns``. They are not
        modified; encoded copies are returned.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Copies of ``train`` and ``test`` with the listed columns replaced by
        integer codes.
    """
    train = train.copy()
    test = test.copy()
    for col in columns:
        encoder = LabelEncoder()
        # Fit on the union so that categories present in only one of the frames
        # are known to the encoder and the mapping is shared by both frames.
        encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
        train[col] = encoder.transform(train[col])
        test[col] = encoder.transform(test[col])
    return train, test

# Encode the categorical columns; `label_encoder` returns encoded copies of both frames.
df_train4, df_test4 = label_encoder(df_train3, df_test3 ,label_cols)

Target transformation

In [None]:
# Turn the label column into integer class codes.
encoder = LabelEncoder().fit(target)
target = encoder.transform(target)

In [None]:
# Preview the first rows only instead of dumping the whole frame.
df_train4.head()

# Model selection

In [None]:
# Features are every column except the label; the label is the encoded target.
y = target
X = df_train4.drop(columns=['Transported'])

# Hold out a third of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12)

# Fit LazyPredict's full set of classifiers and collect their scores.
clf = LazyClassifier(
    classifiers='all',
    random_state=12,
    predictions=False,
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)

models, predictions = clf.fit(X_train, X_test, y_train, y_test)

In [None]:
# Score table returned by `clf.fit`.
models

In [None]:
# Accuracy per model, taken from the LazyClassifier score table.
plt.figure(figsize=(12, 6))
sns.lineplot(x='Model', y='Accuracy', data=models, markersize=8, marker='o')
plt.title('Accuracy', fontsize=18)
plt.xticks(rotation=90)
plt.show()

In [None]:
# F1 score and ROC AUC per model, drawn on the same axes.
plt.figure(figsize=(12, 6))
for metric in ('F1 Score', 'ROC AUC'):
    sns.lineplot(x='Model', y=metric, data=models, markersize=8, marker='o')
plt.title('ROC AUC vs F1 Score', fontsize=18)
plt.xticks(rotation=90)
plt.show()

In [None]:
# "Time Taken" per model, taken from the LazyClassifier score table.
plt.figure(figsize=(12, 6))
sns.lineplot(x='Model', y='Time Taken', data=models, markersize=8, marker='o')
plt.title('Training time', fontsize=18)
plt.xticks(rotation=90)
plt.show()

# Best classifier and feature importance

In [None]:
# Fit a default LightGBM classifier on the training split and score it on the hold-out split.
lgbm = LGBMClassifier().fit(X_train, y_train)
pred = lgbm.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# One row per training column, holding the fitted model's importance value for it.
feature_importance = pd.DataFrame(
    {'Feature': lgbm.feature_importances_},
    index=X_train.columns,
)

In [None]:
# Draw one labelled horizontal bar per feature, sorted by importance. Plotting the
# transposed frame instead puts every feature into a single bar cluster that can
# only be read through the legend.
data = feature_importance.iloc[:, 0].sort_values().plot(
    kind='barh',
    figsize=(12, 6),
    title='LightGBM feature importance',
)

# Final prediction

In [None]:
# Train the final model on ALL labelled rows (X, y) rather than only the 67 %
# training split: the hold-out split was needed for model selection, not for the
# submission model.
model = LGBMClassifier()
model.fit(X, y)

# Select the test columns by name, in the training order, so the features line up
# with what the model was fitted on.
pred = model.predict(df_test4[X.columns])

# Pair every prediction with its passenger id from the untouched test frame.
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': pred})

In [None]:
# Convert the predicted labels in the 'Transported' column to booleans.
submission = submission.astype({'Transported': bool})

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)