In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders.one_hot import OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.metrics import classification_report

#### Importing the data

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
# The comma separator is pandas' default, so it is not passed explicitly.
train = pd.read_csv(filepath_or_buffer=r'../input/spaceship-titanic/train.csv')
train.head()

In [None]:
# Load the test CSV into a DataFrame and preview its first rows.
# The comma separator is pandas' default, so it is not passed explicitly.
test = pd.read_csv(filepath_or_buffer=r'../input/spaceship-titanic/test.csv')
test.head()

#### Cleaning the Data

In [None]:
# Count the missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Replace every missing value in the frame with 0 (applies to all columns).
train = train.fillna(value=0)

The 'Cabin' column packs three variables into a single '/'-separated string. Let's split them into separate columns (Deck, Num and Side) so they can be transformed into numeric features later.

In [None]:
# Split 'Cabin' on '/' and store the resulting parts as three new columns.
train[['Deck', 'Num', 'Side']] = (
    train['Cabin']
    .str
    .split('/', expand=True)
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Inspect the dtype of every column in the training frame.
train.dtypes

In [None]:
# Fill any missing values still present in the frame (including the
# Deck/Num/Side columns) with 0.
train = train.fillna(value=0)

In [None]:
# Cast the 'Num' column to integers; all other columns are left untouched.
train = train.astype({'Num': int})

Creating a one-hot encoder for the categorical features

In [None]:
# Encoder configured to one-hot encode the listed columns only.
one_hot_encoder = OneHotEncoder(
    cols=[
        'HomePlanet',
        'CryoSleep',
        'Destination',
        'VIP',
        'Deck',
        'Side',
    ],
)

In [None]:
# Fit the encoder on the training frame, then replace the frame with its
# encoded version.
train = one_hot_encoder.fit(train).transform(train)

Dropping the identifier and raw text columns ('PassengerId', 'Name', and the original 'Cabin', which has already been split)

In [None]:
# Remove the columns that are not used as model features.
train = train.drop(columns=['PassengerId', 'Name', 'Cabin'])

#### Splitting the data

First, we split the data into training and validation sets, using an 80/20 split.

In [None]:
# The target is the 'Transported' column; every other column is a feature.
X = train.drop(columns='Transported')
y = train['Transported']

# Hold out 20% of the rows for validation; the fixed seed makes the
# shuffle reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Random Forest Classifier

First, we'll use Random Forest to see how it will perform.

In [None]:
# Random forest with trees capped at depth 2 and a fixed seed.
clf = RandomForestClassifier(
    random_state=0,
    max_depth=2,
)

In [None]:
# Train the random forest on the training split.
clf.fit(X=X_train, y=y_train)

Let's first check how the Random Forest performs on the validation set; after that, we'll test AdaBoost to see how it compares.

In [None]:
# Predict on the validation features, then display the accuracy against
# the validation labels.
predict_RF = clf.predict(X=X_val)

accuracy_score(y_true=y_val, y_pred=predict_RF)

In [None]:
# Confusion matrix of the random forest's validation predictions, drawn
# with the classifier's class labels on the axes.
cm_randomforest = confusion_matrix(y_true=y_val, y_pred=predict_RF)
disp_rf = ConfusionMatrixDisplay(
    cm_randomforest,
    display_labels=clf.classes_,
)
disp_rf.plot()
plt.show()

In [None]:
# Text report of the per-class metrics for the random forest.
print(
    classification_report(y_true=y_val, y_pred=predict_RF)
)

#### AdaBoost Classifier

In [None]:
# AdaBoost classifier with 100 estimators and a fixed seed.
# (Despite its name, `regr` holds a classifier.)
regr = AdaBoostClassifier(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Train the AdaBoost classifier on the training split.
regr.fit(X=X_train, y=y_train)

In [None]:
# AdaBoost predictions for the validation features.
predict_ADA = regr.predict(X=X_val)

In [None]:
# Validation accuracy of the AdaBoost predictions.
accuracy_score(y_true=y_val, y_pred=predict_ADA)

In [None]:
# Confusion matrix of the AdaBoost validation predictions, drawn with the
# classifier's class labels on the axes.
cm_ada = confusion_matrix(y_true=y_val, y_pred=predict_ADA)
disp_ada = ConfusionMatrixDisplay(
    cm_ada,
    display_labels=regr.classes_,
)
disp_ada.plot()
plt.show()

In [None]:
# Text report of the per-class metrics for AdaBoost.
print(
    classification_report(y_true=y_val, y_pred=predict_ADA)
)

Finally, let's test Gradient Boosting to see if it will be more accurate.

#### Gradient Boosting Classifier

In [None]:
# Gradient boosting classifier: 100 depth-2 trees, learning rate 1.0,
# two candidate features per split, and a fixed seed.
GBC = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=2,
    max_features=2,
    random_state=42,
)


In [None]:
# Train the gradient boosting classifier on the training split.
GBC.fit(X=X_train, y=y_train)

In [None]:
# Gradient boosting predictions for the validation features.
predict_gbc = GBC.predict(X=X_val)

In [None]:
# Validation accuracy of the gradient boosting predictions.
accuracy_score(y_true=y_val, y_pred=predict_gbc)

In [None]:
# Confusion matrix of the gradient boosting validation predictions, drawn
# with the classifier's class labels on the axes.
cm_gbc = confusion_matrix(y_true=y_val, y_pred=predict_gbc)
disp_gbc = ConfusionMatrixDisplay(
    cm_gbc,
    display_labels=GBC.classes_,
)
disp_gbc.plot()
plt.show()

In [None]:
# Text report of the per-class metrics for gradient boosting.
print(
    classification_report(y_true=y_val, y_pred=predict_gbc)
)

#### Conclusion


In the end, Gradient Boosting achieved the best performance on this data. However, Random Forest and AdaBoost also produced satisfactory results.

In [None]:
# Print the validation accuracy of each model, one per line, in the order
# Random Forest, AdaBoost, Gradient Boosting.
for _label, _preds in (
    ('Random Forest:', predict_RF),
    ('ADABoost:', predict_ADA),
    ('Gradient Boosting Classifier:', predict_gbc),
):
    print(_label, accuracy_score(y_val, _preds))