# <center>Spaceship Titanic</center>

The objective of this work is to present my solution for the Spaceship Titanic competition.

More information about the competition at https://www.kaggle.com/competitions/spaceship-titanic/overview

In [None]:
# First of all: Some necessary imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC , NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier , AdaBoostClassifier , GradientBoostingClassifier , VotingClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score , KFold , StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from IPython.display import Image
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# Load the training and test sets from the ../input/spaceship-titanic directory
# (paths are relative to the notebook's working directory)

train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# Dimensions of the training dataset as a (rows, columns) tuple

train.shape

In [None]:
# Dimensions of the test dataset as a (rows, columns) tuple

test.shape

In [None]:
# Preview the first 5 rows of the training dataset

train.head()

In [None]:
# Data type (dtype) of each column in the training dataset

train.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training dataset

train.describe()

## Dealing with Missing Values

In [None]:
# First let's see which features have missing values in the training dataset
# (number of null entries per column)

train.isnull().sum()

In [None]:
# Number of null entries per column in the test dataset

test.isnull().sum()

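Let's adopt the following strategy: for numeric variables we will replace the missing values with the mean of the column (computed on the training set), and for categorical variables we will replace the missing values with the most common category. The one exception is Cabin, whose missing values are replaced with the placeholder `X/Num/XPS`.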

### HomePlanet

In [None]:
# Frequency of each HomePlanet value in the training dataset (nulls are not counted)
train['HomePlanet'].value_counts()

In [None]:
# Missing home planets are imputed with "Earth" in both datasets

for frame in (train, test):
    frame['HomePlanet'] = frame['HomePlanet'].fillna('Earth')

### CryoSleep

In [None]:
# Frequency of each CryoSleep value in the training dataset (nulls are not counted)
train['CryoSleep'].value_counts()

In [None]:
# Passengers with an unknown CryoSleep status are set to False in both datasets

for frame in (train, test):
    frame.loc[frame['CryoSleep'].isna(), 'CryoSleep'] = False

### Cabin

In [None]:
# Frequency of each Cabin value in the training dataset (nulls are not counted)
train['Cabin'].value_counts()

In [None]:
# Missing cabins receive the placeholder "X/Num/XPS", which keeps the
# three-part "a/b/c" layout used when the column is split on "/" later on

for frame in (train, test):
    frame['Cabin'] = frame['Cabin'].fillna('X/Num/XPS')

### Destination

In [None]:
# Frequency of each Destination value in the training dataset (nulls are not counted)
train['Destination'].value_counts()

In [None]:
# Missing destinations are imputed with "TRAPPIST-1e" in both datasets

for frame in (train, test):
    frame['Destination'] = frame['Destination'].fillna('TRAPPIST-1e')

### Age

In [None]:
# Preview the Age column of the training dataset
train['Age']

In [None]:
# Impute missing ages with the mean Age of the training set.
# The test set is filled with the training mean as well, not with its own mean.

train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(train['Age'].mean())

### VIP

In [None]:
# Frequency of each VIP value in the training dataset (nulls are not counted)
train['VIP'].value_counts()

In [None]:
# Passengers with an unknown VIP status are set to False in both datasets

for frame in (train, test):
    frame.loc[frame['VIP'].isna(), 'VIP'] = False

### RoomService

In [None]:
# Preview the RoomService column of the training dataset
train['RoomService']

In [None]:
# Impute missing RoomService amounts with the training-set mean
# (the test set is filled with the training mean as well)

train['RoomService'] = train['RoomService'].fillna(train['RoomService'].mean())
test['RoomService'] = test['RoomService'].fillna(train['RoomService'].mean())

### FoodCourt

In [None]:
# Preview the FoodCourt column of the training dataset
train['FoodCourt']

In [None]:
# Impute missing FoodCourt amounts with the training-set mean
# (the test set is filled with the training mean as well)

train['FoodCourt'] = train['FoodCourt'].fillna(train['FoodCourt'].mean())
test['FoodCourt'] = test['FoodCourt'].fillna(train['FoodCourt'].mean())

### ShoppingMall

In [None]:
# Preview the ShoppingMall column of the training dataset
train['ShoppingMall']

In [None]:
# Impute missing ShoppingMall amounts with the training-set mean
# (the test set is filled with the training mean as well)

train['ShoppingMall'] = train['ShoppingMall'].fillna(train['ShoppingMall'].mean())
test['ShoppingMall'] = test['ShoppingMall'].fillna(train['ShoppingMall'].mean())

### Spa

In [None]:
# Preview the Spa column of the training dataset
train['Spa']

In [None]:
# Impute missing Spa amounts with the training-set mean
# (the test set is filled with the training mean as well)

train['Spa'] = train['Spa'].fillna(train['Spa'].mean())
test['Spa'] = test['Spa'].fillna(train['Spa'].mean())

### VRDeck

In [None]:
# Preview the VRDeck column of the training dataset
train['VRDeck']

In [None]:
# Impute missing VRDeck amounts with the training-set mean
# (the test set is filled with the training mean as well)

train['VRDeck'] = train['VRDeck'].fillna(train['VRDeck'].mean())
test['VRDeck'] = test['VRDeck'].fillna(train['VRDeck'].mean())

In [None]:
# Re-check the null counts of the training dataset after imputation.
# Only the Name column should still have missing values, which is acceptable
# because Name is not used as a feature in the modeling.

train.isnull().sum()

In [None]:
# Re-check the null counts of the test dataset after imputation
test.isnull().sum()

## Feature Engineering

### HomePlanet

In [None]:
# Frequency of each HomePlanet value in the training dataset after imputation
train['HomePlanet'].value_counts()

In [None]:
# Preview the one-hot encoding of HomePlanet (one indicator column per planet)
pd.get_dummies(train['HomePlanet'])

In [None]:
# Adding the Earth, Europa and Mars indicator variables to the training and test datasets.
# The test indicators are reindexed to the training columns so that both datasets
# expose exactly the same indicator names, even if a planet is absent from the
# test set (such a column is then filled with 0).

home_planet_train = pd.get_dummies(train['HomePlanet'])
home_planet_test = pd.get_dummies(test['HomePlanet']).reindex(
    columns=home_planet_train.columns, fill_value=0)

train = pd.concat([train, home_planet_train], axis=1)
test = pd.concat([test, home_planet_test], axis=1)

### CryoSleep

In [None]:
# Frequency of each CryoSleep value in the test dataset after imputation
test['CryoSleep'].value_counts()

In [None]:
# Replacing False with 0 and True with 1 in both datasets.
# An entry becomes 0 when it compares equal to False and 1 otherwise; the
# vectorized comparison replaces the element-by-element Python loops.

train['CryoSleep'] = train['CryoSleep'].ne(False).astype('int64')
test['CryoSleep'] = test['CryoSleep'].ne(False).astype('int64')

### Cabin

In [None]:
# Extracting deck and side from the Cabin variable and creating the Deck and Side
# variables in the training dataset.
# Cabin is split on "/": the first part is stored as Deck and the third as Side
# (the middle part is not used).

cabin_parts = train['Cabin'].str.split('/')
train['Deck'] = cabin_parts.str[0]
train['Side'] = cabin_parts.str[2]

In [None]:
# Extracting deck and side from the Cabin variable and creating the Deck and Side
# variables in the test dataset.
# Cabin is split on "/": the first part is stored as Deck and the third as Side
# (the middle part is not used).

cabin_parts = test['Cabin'].str.split('/')
test['Deck'] = cabin_parts.str[0]
test['Side'] = cabin_parts.str[2]

In [None]:
# Preview the one-hot encoding of Deck
pd.get_dummies(train['Deck'])

In [None]:
# Preview the one-hot encoding of Side
pd.get_dummies(train['Side'])

In [None]:
# Creating variables A, B, C, D, E, F, G, T, X, P, S and XPS in training and test datasets.
# The test indicators are reindexed to the training columns so that a deck or side
# value that does not occur in the test set still yields a column (filled with 0),
# keeping the feature names identical across both datasets.

deck_train = pd.get_dummies(train['Deck'])
side_train = pd.get_dummies(train['Side'])
deck_test = pd.get_dummies(test['Deck']).reindex(
    columns=deck_train.columns, fill_value=0)
side_test = pd.get_dummies(test['Side']).reindex(
    columns=side_train.columns, fill_value=0)

train = pd.concat([train, deck_train, side_train], axis=1)
test = pd.concat([test, deck_test, side_test], axis=1)

### Destination

In [None]:
# Frequency of each Destination value in the training dataset after imputation
train['Destination'].value_counts()

In [None]:
# Adding the TRAPPIST-1e, 55 Cancri e and PSO J318.5-22 indicator variables to the
# training and test datasets.
# The test indicators are reindexed to the training columns so that both datasets
# expose exactly the same indicator names, even if a destination is absent from
# the test set (such a column is then filled with 0).

destination_train = pd.get_dummies(train['Destination'])
destination_test = pd.get_dummies(test['Destination']).reindex(
    columns=destination_train.columns, fill_value=0)

train = pd.concat([train, destination_train], axis=1)
test = pd.concat([test, destination_test], axis=1)

### Age

In [None]:
# Scaler instance reused by the feature-scaling cells below
SS = StandardScaler()

In [None]:
# Feature Scaling: standardize Age.
# The scaler is fitted on the training data only; the test data is transformed
# with those same training statistics so that both datasets share one scale
# (refitting on the test set would standardize it with different mean/std).

train['Age_SS'] = SS.fit_transform(train[['Age']])
test['Age_SS'] = SS.transform(test[['Age']])

### VIP

In [None]:
# Frequency of each VIP value in the training dataset after imputation
train['VIP'].value_counts()

In [None]:
# Replacing False with 0 and True with 1 in the training dataset.
# An entry becomes 0 when it compares equal to False and 1 otherwise; the
# vectorized comparison replaces the element-by-element Python loop.

train['VIP'] = train['VIP'].ne(False).astype('int64')

In [None]:
# Replacing False with 0 and True with 1 in the test dataset.
# An entry becomes 0 when it compares equal to False and 1 otherwise; the
# vectorized comparison replaces the element-by-element Python loop.

test['VIP'] = test['VIP'].ne(False).astype('int64')

### RoomService

In [None]:
# Feature Scaling: standardize RoomService.
# The scaler is fitted on the training data only; the test data is transformed
# with those same training statistics so that both datasets share one scale.

train['RoomService_SS'] = SS.fit_transform(train[['RoomService']])
test['RoomService_SS'] = SS.transform(test[['RoomService']])

In [None]:
# Creating the variable RoomService? which receives the value 1 if someone spent on this service and 0 otherwise.
# The flag is 0 only where the amount is exactly 0; the vectorized comparison
# replaces the element-by-element Python loops.

train['RoomService?'] = train['RoomService'].ne(0).astype('int64')
test['RoomService?'] = test['RoomService'].ne(0).astype('int64')

### FoodCourt

In [None]:
# Feature Scaling: standardize FoodCourt.
# The scaler is fitted on the training data only; the test data is transformed
# with those same training statistics so that both datasets share one scale.

train['FoodCourt_SS'] = SS.fit_transform(train[['FoodCourt']])
test['FoodCourt_SS'] = SS.transform(test[['FoodCourt']])

In [None]:
# Creating the variable FoodCourt? which receives the value 1 if someone spent on this service and 0 otherwise.
# The flag is 0 only where the amount is exactly 0; the vectorized comparison
# replaces the element-by-element Python loops.

train['FoodCourt?'] = train['FoodCourt'].ne(0).astype('int64')
test['FoodCourt?'] = test['FoodCourt'].ne(0).astype('int64')

### ShoppingMall

In [None]:
# Feature Scaling: standardize ShoppingMall.
# The scaler is fitted on the training data only; the test data is transformed
# with those same training statistics so that both datasets share one scale.

train['ShoppingMall_SS'] = SS.fit_transform(train[['ShoppingMall']])
test['ShoppingMall_SS'] = SS.transform(test[['ShoppingMall']])

In [None]:
# Creating the variable ShoppingMall? which receives the value 1 if someone spent on this service and 0 otherwise.
# The flag is 0 only where the amount is exactly 0; the vectorized comparison
# replaces the element-by-element Python loops.

train['ShoppingMall?'] = train['ShoppingMall'].ne(0).astype('int64')
test['ShoppingMall?'] = test['ShoppingMall'].ne(0).astype('int64')

### Spa

In [None]:
# Feature Scaling: standardize Spa.
# The scaler is fitted on the training data only; the test data is transformed
# with those same training statistics so that both datasets share one scale.

train['Spa_SS'] = SS.fit_transform(train[['Spa']])
test['Spa_SS'] = SS.transform(test[['Spa']])

In [None]:
# Creating the variable Spa? which receives the value 1 if someone spent on this service and 0 otherwise.
# The flag is 0 only where the amount is exactly 0; the vectorized comparison
# replaces the element-by-element Python loops.

train['Spa?'] = train['Spa'].ne(0).astype('int64')
test['Spa?'] = test['Spa'].ne(0).astype('int64')

### VRDeck

In [None]:
# Feature Scaling: standardize VRDeck.
# The scaler is fitted on the training data only; the test data is transformed
# with those same training statistics so that both datasets share one scale.

train['VRDeck_SS'] = SS.fit_transform(train[['VRDeck']])
test['VRDeck_SS'] = SS.transform(test[['VRDeck']])

In [None]:
# Creating the variable VRDeck? which receives the value 1 if someone spent on this service and 0 otherwise.
# The flag is 0 only where the amount is exactly 0; the vectorized comparison
# replaces the element-by-element Python loops.

train['VRDeck?'] = train['VRDeck'].ne(0).astype('int64')
test['VRDeck?'] = test['VRDeck'].ne(0).astype('int64')

### N

Creating the variable N which indicates the **N**umber of services a person has spent their money on. The services considered are RoomService, FoodCourt, ShoppingMall, Spa and VRDeck. Therefore, this variable can assume the values 0, 1, 2, 3, 4 or 5.

In [None]:
# N = per-passenger count of the five service indicators that are set (training dataset)
service_flags = ['RoomService?', 'FoodCourt?', 'ShoppingMall?', 'Spa?', 'VRDeck?']
train['N'] = train[service_flags].sum(axis=1)

In [None]:
# N = per-passenger count of the five service indicators that are set (test dataset)
service_flags = ['RoomService?', 'FoodCourt?', 'ShoppingMall?', 'Spa?', 'VRDeck?']
test['N'] = test[service_flags].sum(axis=1)

### M

Creating the variable M which receives the value 1 if a person has spent on at least one service and receives the value 0 otherwise.

In [None]:
# M is 0 when N is 0 (no service used) and 1 otherwise.
# The vectorized comparison replaces the element-by-element Python loops.

train['M'] = train['N'].ne(0).astype('int64')
test['M'] = test['N'].ne(0).astype('int64')

### Total_Spent

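Creating the Total_Spent variable that indicates the sum of expenses in RoomService, FoodCourt, ShoppingMall, Spa and VRDeck. Note that the sum is taken over the standardized (`_SS`) columns, so it is a sum of standardized values rather than a raw amount.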

In [None]:
# Sum of the five standardized (_SS) spending columns of the training dataset
train['Total_Spent'] = train['RoomService_SS']+ train['FoodCourt_SS']+train['ShoppingMall_SS']+train['Spa_SS']+train['VRDeck_SS']

In [None]:
# Sum of the five standardized (_SS) spending columns of the test dataset
test['Total_Spent'] = test['RoomService_SS']+test['FoodCourt_SS']+test['ShoppingMall_SS']+test['Spa_SS']+test['VRDeck_SS']

### Spa_SS+VRDeck_SS

In [None]:
# Combined feature: standardized Spa plus standardized VRDeck, for both datasets
train['Spa_SS+VRDeck_SS'] = train['Spa_SS'] + train['VRDeck_SS']
test['Spa_SS+VRDeck_SS'] = test['Spa_SS'] + test['VRDeck_SS']

### FoodCourt_SS+Spa_SS

In [None]:
# Combined feature: standardized FoodCourt plus standardized Spa, for both datasets
train['FoodCourt_SS+Spa_SS'] = train['FoodCourt_SS'] + train['Spa_SS']
test['FoodCourt_SS+Spa_SS'] = test['FoodCourt_SS'] + test['Spa_SS']

## Modeling

After performing tests with several sets of variables, I concluded that the set below seems to provide the best results:

In [None]:
# Columns fed to the models: one-hot indicators for home planet, deck, side and
# destination, the scaled age, the per-service flags, N, M, CryoSleep, VIP,
# the scaled spending columns and the combined spending features.
# One indicator of each one-hot group (Europa, G, XPS, TRAPPIST-1e) is not listed.
features =  ['Earth', 'Mars', 'A', 'B', 'C', 'D', 'E', 'F','T','X',
        'S', 'P', '55 Cancri e', 'PSO J318.5-22',
        'Age_SS' ,'RoomService?', 'FoodCourt?' , 'ShoppingMall?' , 
           'Spa?' , 'VRDeck?', 'N', 'M', 'CryoSleep', 'VIP' ,
             'RoomService_SS' , 'FoodCourt_SS' , 'ShoppingMall_SS' , 'Spa_SS' , 'VRDeck_SS' , 'Total_Spent' , 'Spa_SS+VRDeck_SS',
             'FoodCourt_SS+Spa_SS' ]

In [None]:
# Candidate models to compare, mostly with default hyperparameters
# (iteration limits are raised for LogisticRegression and LinearSVC).
# The order must match the names listed in models_name.
# NOTE(review): use_label_encoder appears to be deprecated in recent xgboost
# releases — confirm against the installed version.

models = [LogisticRegression(max_iter = 1000) , KNeighborsClassifier() , SVC(), LinearSVC(max_iter = 50000) ,
          DecisionTreeClassifier(), GaussianNB() , RandomForestClassifier() , AdaBoostClassifier() ,
          GradientBoostingClassifier(), XGBClassifier(use_label_encoder = False) ,
          CatBoostClassifier() , LGBMClassifier() ]

In [None]:
# Display names of the candidate models, in the same order as the models list

models_name = ['Logistic Regression' , 'KNN' , 'SVC' , 'Linear SVC' , 'Decision Tree' , 'Naive Bayes' , 'Random Forest',
               'AdaBoost' , 'GradientBoosting' , 'XGB' , 'CatBoost' , 'LGBM']

In [None]:
# Cast the target column to int32, then separate the target and the feature matrix
train['Transported'] = train['Transported'].astype('int32')
y_train = train['Transported']
X_train = train[features]

In [None]:
# Mean accuracy of each candidate model over a 3-fold cross-validation,
# stored in the same order as the models list

cv_scores = [
    cross_val_score(candidate, X_train, y_train, cv=3).mean()
    for candidate in models
]

In [None]:
# Table pairing each model name with its mean cross-validation accuracy

df_cv = pd.DataFrame({'Model': models_name, 'Mean Accuracy': cv_scores})
df_cv

As seen above, the model that had the best performance in cross-validation was CatBoost, so we will choose this model. After a long process of hyperparameter optimization, I found a CatBoost model with the following hyperparameters as the best model:

**learning_rate = 0.01**

**n_estimators = 1500**

**depth = 6**

## First submission to Kaggle

In [None]:
# CatBoost classifier configured with the tuned hyperparameters listed above
model = CatBoostClassifier(learning_rate = 0.01 , n_estimators = 1500 , depth = 6)

In [None]:
# Train the classifier on the full training set
model.fit(X_train , y_train)

In [None]:
# Predict the target for the test set using the same feature columns as in training
test['Transported'] = model.predict(test[features])

In [None]:
# Convert the predictions to booleans before they are written out
test['Transported'] = test['Transported'].astype('bool')

In [None]:
# Keep only the passenger identifier and the prediction
df = test[['PassengerId' , 'Transported']]

In [None]:
# Write the first submission file, without the DataFrame index
df.to_csv('submission1.csv' , index = False)

This submission gave us a score of 0.80336 on Kaggle. Let's try to improve our score using Stacking.

## Stacking

In [None]:
# Base learners for the stack: the tuned CatBoost model registered under two
# names, with a logistic regression as the final (meta) estimator.
# NOTE(review): both entries use the same `model` object, so the base learners
# share one configuration — confirm that stacking identical learners is intended.
final_estimator = LogisticRegression()

estimators = [(label, model) for label in ('catboost1', 'catboost2')]

In [None]:
# Stacking classifier built from the base learners and the final estimator defined above
sc = StackingClassifier(estimators = estimators , final_estimator = final_estimator)

In [None]:
# Train the stacking classifier on the full training set
sc.fit(X_train , y_train)

## Second Submission to Kaggle

In [None]:
# Predict the target for the test set with the stacking classifier (overwrites the earlier predictions)
test['Transported'] = sc.predict(test[features])

In [None]:
# Convert the predictions to booleans before they are written out
test['Transported'] = test['Transported'].astype('bool')

In [None]:
# Keep only the passenger identifier and the prediction
df = test[['PassengerId' , 'Transported']]

In [None]:
# Write the second submission file, without the DataFrame index
df.to_csv('submission2.csv' , index = False)

In [None]:
# Display the image file referenced by the closing remark below
Image('../input/imagem-score/kaggle.png')

As we can see in the image, we managed to improve our score slightly, reaching 0.80360.