# <b>1 <span style='color:#4285f4'>|</span> Importing libraries</b>
- **For Data Manipulation**: numpy, pandas
- **For Data Visualization**: matplotlib, seaborn, plotly

In [None]:
# For ML models
from sklearn.linear_model import LinearRegression ,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC ,SVR
import xgboost as xgb
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

# For Data Processing
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 

# For Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# <b>2 <span style='color:#4285f4'>|</span> About the Dataset</b>

## Column Descriptions

- `PassengerId` - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- `HomePlanet` - The planet the passenger departed from, typically their planet of permanent residence.
- `CryoSleep` - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- `Cabin` - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- `Destination` - The planet the passenger will be debarking to.
- `Age` - The age of the passenger.
- `VIP` - Whether the passenger has paid for special VIP service during the voyage.
- `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- `Name` - The first and last names of the passenger.
- `Transported` - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

`PassengerId` and `Name` are not expected to correlate with the rest of the data; even if some correlation shows up mathematically, it is just by chance, so I will drop these columns.

In [None]:
# Load the training set and discard the two columns (`PassengerId`, `Name`)
# that the analysis does not use.
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv').drop(columns=['PassengerId', 'Name'])
df

Categorical Features  
`HomePlanet`, `CryoSleep`, `Cabin`, `Destination`, `VIP`, `Transported`  
Continuous Features  
`Age`, `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck`  

## Column Statistics (of continuous data)

In [None]:
# Summary statistics of the continuous columns, one row per feature,
# rendered with a blue background gradient.
continuous_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
continuous_stats = df.describe()[continuous_cols].T
continuous_stats.style.background_gradient(cmap='Blues')

## Column Statistics (of categorical data)

In [None]:
# Preview of the rows that have no missing value in any column.
# The result is only displayed; it is not assigned, so `df` keeps all of its rows.
df.dropna(axis=0, how='any')

In [None]:
# Cardinality of every categorical column (missing values are not counted by `nunique`).
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Transported']
print('Number of unique values in each categorical column:')
df[categorical_cols].nunique()

Number of unique values in `Cabin` is much higher than in other columns

In [None]:
# How often each individual cabin identifier occurs, most frequent first.
print("Count of unique values in 'Cabin':")
cabin_counts = df['Cabin'].value_counts()
cabin_counts

Each unique value in the `Cabin` column repeats no more than 8 times, and most of them repeat only once or twice, so it is not possible to determine what each value means. I will therefore drop the `Cabin` column.

In [None]:
# Drop the high-cardinality `Cabin` column.
# `errors='ignore'` keeps the cell re-runnable: executing it a second time is a
# no-op, whereas `del df['Cabin']` would raise KeyError once the column is gone.
df = df.drop(columns=['Cabin'], errors='ignore')

Also, I will replace all missing values in categorical data with the string 'Unknown', to make it easy to visualize

In [None]:
# Label missing entries of the categorical columns as 'Unknown' so that they
# appear as their own category in the charts that follow.
unknown_fill = dict.fromkeys(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported'], 'Unknown')
df = df.fillna(value=unknown_fill)

In [None]:
# One donut chart per categorical column on a 3x2 grid
# (five charts, so the bottom-right cell stays empty).
pie_columns = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Transported"]
# Charts that are drawn with rotation=-45; the remaining ones keep the default angle.
rotated_pies = {"HomePlanet", "Destination", "VIP"}

fig = make_subplots(
    rows=3, cols=2, subplot_titles=tuple(pie_columns),
    specs=[[{"type": "domain"} for _ in range(2)] for _ in range(3)],
)

colours = ['#4285f4', '#ea4335', '#fbbc05', '#34a853']

for position, column in enumerate(pie_columns):
    counts = df[column].value_counts()
    pie_kwargs = dict(labels=np.array(counts.index),
                      values=list(counts),
                      textinfo='label+percent', hole=.35,
                      marker_colors=colours)
    if column in rotated_pies:
        pie_kwargs['rotation'] = -45
    # Fill the grid row by row: position 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), ...
    fig.add_trace(go.Pie(**pie_kwargs), row=position // 2 + 1, col=position % 2 + 1)

fig.update_layout(height=1600, font=dict(size=14), showlegend=False)

fig.show()

# <b>3 <span style='color:#4285f4'>|</span> Exploratory Analysis</b>

In [None]:
# Age distribution per home planet, with one box per `Transported` value.
fig = px.box(data_frame=df, x="HomePlanet", y="Age", color='Transported')
fig.show()

In [None]:
# Colour coding shared by both charts for the False / True / 'Unknown' categories.
outcome_colours = {False: '#ea4335', True: '#4285f4', 'Unknown': '#fbbc05'}

# Passenger counts per home planet, coloured first by `Transported`, then by `VIP`.
for hue in ('Transported', 'VIP'):
    fig = px.histogram(df, x="HomePlanet", color=hue,
                       color_discrete_map=dict(outcome_colours))
    fig.show()

### Insights
- Most passengers who were transported to another dimension departed from Earth; note that *most passengers overall departed from Earth*
- The likelihood of being transported to another dimension is the highest if the person departed from Europa, the second is Mars, and the third is Earth
- Not many people paid for special VIP service

In [None]:
# Age histogram coloured by the CryoSleep choice, with a box plot in the margin.
cryo_colours = {False: '#ea4335', True: '#4285f4', 'Unknown': '#fbbc05'}
fig = px.histogram(df, x="Age", color='CryoSleep', marginal='box',
                   color_discrete_map=cryo_colours)
fig.show()

### Insights
- Passengers elect to be put into CryoSleep regardless of their age

In [None]:
# For each amenity, plot the amount billed (y) against passenger age (x),
# coloured by the transport outcome, with a box plot in the margin.
transport_colours = {False: '#ea4335', True: '#4285f4', 'Unknown': '#fbbc05'}

for amenity in ("ShoppingMall", "RoomService", "FoodCourt", "Spa"):
    fig = px.histogram(df, x="Age", y=amenity, color='Transported', marginal='box',
                       color_discrete_map=dict(transport_colours))
    fig.show()

### Insights
- Old passengers shop less, and passengers with an age of less than 13 do not shop at all
- Maybe kids' shopping expenditures are deducted from their parent's account?
- Old passengers spend less for Room Service, and passengers with an age of less than 13 do not spend at all
- Maybe kids' expenditure for RoomService is deducted from their parent's account?
- Old passengers spend less for FoodCourt, and passengers with an age of less than 13 do not spend at all
- Maybe "Free food for kids"? or their food expenditure is deducted from their parents?
- Passengers that were transported spend significantly less than those who were not transported
- Old passengers pay less for Spa, and passengers with an age of less than 13 do not pay at all

# <b>4 <span style='color:#4285f4'>|</span> Data Cleaning & Preprocessing</b>

<h2>4.1 <span style='color:#4285f4'>|</span> Handling Missing Values</h2>  

In [None]:
# Start again from the raw file and drop the three columns that are not used for modelling.
df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
df = df.drop(columns=['PassengerId', 'Name', 'Cabin'])

# Heat map of the missing cells: one row per data column, one column per record.
missing_mask = df.isna().transpose()
fig = px.imshow(missing_mask, color_continuous_scale="Blues")
fig.show()

There are not many missing values, so I will simply drop every row that contains a missing value

In [None]:
# Keep only the rows without any missing value, then renumber them from 0.
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)

<h2>4.2 <span style='color:#4285f4'>|</span> Normalizing Continuous Features</h2>  

In [None]:
# Minimum and maximum of every numeric column, one row per column.
value_ranges = df.describe().loc[['min', 'max']].T
value_ranges.style.background_gradient(cmap='Blues')

These continuous features have different ranges, so I am normalizing them to be between 0 and 1

In [None]:
# Divide every continuous column by its own maximum.
scaled_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df[scaled_cols] = df[scaled_cols].div(df[scaled_cols].max(), axis='columns')

<h2>4.3 <span style='color:#4285f4'>|</span> Encoding Categorical Features</h2>  

In [None]:
# Number of distinct values in each object-dtype column.
print('\nCategorical Columns\n')
object_cols = df.select_dtypes(include=['O'])
object_cols.nunique()

Some categorical columns have 2 values, and some have more than 2 values.  
Here, I will convert the columns with 2 unique values to binary (either 1 or 0)  
And one-hot encode the other categorical columns, which have more than 2 unique values  

In [None]:
# Integer encode columns with 2 unique values
for col in ['CryoSleep', 'VIP', 'Transported']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
# One-hot encode columns with more than 2 unique values
df = pd.get_dummies(df, columns=['HomePlanet', 'Destination'], prefix = ['HomePlanet', 'Destination'])

<h2>4.4 <span style='color:#4285f4'>|</span> Train-Val Split</h2>  

In [None]:
# Every column except the target is a model input.
feature_cols = [col for col in df.columns if col != 'Transported']

# Build the feature matrix with an explicit float dtype. The frame mixes float,
# integer and (on pandas >= 2.0, where get_dummies returns bool) boolean columns;
# converting such a frame without a dtype yields an object-dtype array.
features = df[feature_cols].to_numpy(dtype=float)
labels = df['Transported'].to_numpy()

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=0)

# <b>5 <span style='color:#4285f4'>|</span> Models</b>

In [None]:
# Collects one entry per model: name -> [accuracy, weighted F1] on the validation split.
model_comparison = dict()

<h2>5.1 <span style='color:#4285f4'>|</span> SVC</h2>  

In [None]:
# Grid-search `C` and the kernel of a support-vector classifier, then score the
# search's predictions on the validation split.
parameters = {
    'C': list(range(6, 17, 2)),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

svc_model = SVC()

clf = GridSearchCV(estimator=svc_model, param_grid=parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
svc_accuracy = accuracy_score(y_val, y_pred)
svc_f1 = f1_score(y_val, y_pred, average='weighted')
model_comparison['SVC'] = [svc_accuracy, svc_f1]
print('\n')
print(classification_report(y_val, y_pred, zero_division=1))

<h2>5.2 <span style='color:#4285f4'>|</span> DecisionTreeClassifier</h2>  

In [None]:
# Grid-search the maximum depth of a decision tree, then score the search's
# predictions on the validation split.
parameters = {'max_depth': [5,10,15,20]}

# Fixed seed: without it the fitted tree, and therefore the selected depth and
# the reported scores, can differ from one run of the notebook to the next.
Tree_model = DecisionTreeClassifier(random_state=0)

clf = GridSearchCV(Tree_model, parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
# Stored as [accuracy, weighted F1] for the comparison table.
model_comparison['DecisionTreeClassifier'] = [accuracy_score(y_val,y_pred), f1_score(y_val,y_pred, average='weighted')]
print('\n')
print(classification_report(y_val,y_pred, zero_division=1))


<h2>5.3 <span style='color:#4285f4'>|</span> KNeighborsClassifier</h2>  

In [None]:
# Grid-search the neighbourhood size of a k-nearest-neighbours classifier, then
# score the search's predictions on the validation split.
parameters = {'n_neighbors': list(range(10, 51, 10))}

K_model = KNeighborsClassifier()

clf = GridSearchCV(estimator=K_model, param_grid=parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
knn_accuracy = accuracy_score(y_val, y_pred)
knn_f1 = f1_score(y_val, y_pred, average='weighted')
model_comparison['KNeighborsClassifier'] = [knn_accuracy, knn_f1]
print('\n')
print(classification_report(y_val, y_pred, zero_division=1))

<h2>5.4 <span style='color:#4285f4'>|</span> RandomForestClassifier</h2>  

In [None]:
# Grid-search the number of trees and the maximum depth of a random forest, then
# score the search's predictions on the validation split.
parameters = {'n_estimators': [160,180,200], 'max_depth':[18,20,22,24]}

# Fixed seed: the forest is built with randomness, so without a seed the selected
# hyperparameters and the reported scores can change between runs.
rf = RandomForestClassifier(random_state=0)

clf = GridSearchCV(rf, parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
# Stored as [accuracy, weighted F1] for the comparison table.
model_comparison['RandomForestClassifier'] = [accuracy_score(y_val,y_pred), f1_score(y_val,y_pred, average='weighted')]
print('\n')
print(classification_report(y_val,y_pred, zero_division=1))

<h2>5.5 <span style='color:#4285f4'>|</span> XGBoost</h2>  

In [None]:
# Grid-search the number of boosting rounds and the tree depth of an XGBoost
# classifier, then score the search's predictions on the validation split.
parameters = {'n_estimators': [100, 150, 200], 'max_depth':[16, 18, 20]}

# `Transported` is a two-class target, so the matching evaluation metric is the
# binary 'logloss'; 'mlogloss' is the multiclass variant.
xgboost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

clf = GridSearchCV(xgboost, parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
# Stored as [accuracy, weighted F1] for the comparison table.
model_comparison['XGBoost'] = [accuracy_score(y_val, y_pred), f1_score(y_val,y_pred, average='weighted')]
print('\n')
print(classification_report(y_val,y_pred, zero_division=1))

<h2>5.6 <span style='color:#4285f4'>|</span> Model Comparison</h2>  

In [None]:
# One row per model with its two validation scores, ordered by ascending F1.
model_comparison_df = pd.DataFrame.from_dict(
    model_comparison, orient='index', columns=['Accuracy', 'F1 Score']
)
model_comparison_df = model_comparison_df.sort_values(by='F1 Score', ascending=True)
model_comparison_df.style.background_gradient(cmap='Blues')

In [None]:
# Horizontal grouped bars: one group per model, one bar per metric.
metric_bars = [
    go.Bar(name=metric, y=model_comparison_df.index, x=model_comparison_df[metric], orientation='h')
    for metric in ('F1 Score', 'Accuracy')
]
fig = go.Figure(data=metric_bars)
fig.update_layout(barmode='group')
fig.show()

### Please Upvote this notebook as it encourages me in doing better.


![](http://68.media.tumblr.com/e1aed171ded2bd78cc8dc0e73b594eaf/tumblr_o17frv0cdu1u9u459o1_500.gif)