# Spaceship Titanic
https://www.kaggle.com/c/spaceship-titanic/overview

**PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

**HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.

**CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

**Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

**Destination** - The planet the passenger will be debarking to.

**Age** - The age of the passenger.

**VIP** - Whether the passenger has paid for special VIP service during the voyage.

**RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

**Name** - The first and last names of the passenger.

**Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

**test.csv** - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

**sample_submission.csv** - A submission file in the correct format.

**PassengerId** - Id for each passenger in the test set.

**Transported** - The target. For each passenger, predict either True or False.

## Dependencies

In [None]:
!pip install catboost optuna plotly

## Importing the libs

In [None]:
import numpy as np
import pandas as pd 

from matplotlib import pyplot as plt

import seaborn as sns

import plotly
import plotly.express as px
import plotly.graph_objects as go

plotly.offline.init_notebook_mode (connected = True)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import optuna
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import accuracy_score

## Exploring the Data

In [None]:
# Labelled training passengers.
train_set = pd.read_csv('../input/spaceship-titanic/train.csv')

# Unlabelled test passengers. Their `Transported` column is copied from
# sample_submission.csv, so it is a placeholder and not a real label.
test_set = pd.read_csv('../input/spaceship-titanic/test.csv')
test_set['Transported'] = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')['Transported']

# Stack train on top of test so preprocessing is applied to both at once.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same
# row order (train rows first, then test rows).
dataset = pd.concat([train_set, test_set]).set_index('PassengerId')

dataset.head()

In [None]:
# Column dtypes and non-null counts for the combined train + test frame.
dataset.info()

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is a row.
dataset.describe().T

## Exploratory Data Analysis

In [None]:
# Share of transported vs. not-transported passengers.
# Only the first len(train_set) rows carry real labels: the test rows stacked
# underneath have `Transported` values copied from sample_submission.csv.
labelled_rows = dataset.iloc[:len(train_set)]

passengers_by_survivence = labelled_rows['Transported'].value_counts()
passengers_by_survivence = passengers_by_survivence.rename_axis('Transported').reset_index(name='Counts')

# Derive each slice's label from its own value. value_counts orders rows by
# frequency, so a positional ['No', 'Yes'] list can end up on the wrong slice.
labels = passengers_by_survivence['Transported'].map({True: 'Yes', False: 'No'})

fig = px.pie(passengers_by_survivence, values='Counts', names=labels, title='Passergers Transported Percentage', color_discrete_sequence=px.colors.qualitative.G10)
fig.show(renderer="kaggle")

In [None]:
# Age distribution over every passenger (train and test). The chart is drawn
# with plotly, so no matplotlib figure is opened here.
fig = px.histogram(dataset, x='Age', title='Age Distribuition', histnorm='', color_discrete_sequence=px.colors.qualitative.G10)
fig.show(renderer="kaggle")

In [None]:
# Age distribution split by outcome. Restricted to the labelled training rows:
# the test rows in `dataset` carry `Transported` values copied from
# sample_submission.csv, which would inflate one of the two groups.
labelled_rows = dataset.iloc[:len(train_set)]

fig = px.histogram(labelled_rows, x='Age', color='Transported', title='Passengers ages by Transported condition', histnorm='', color_discrete_sequence=px.colors.qualitative.G10)
fig.show(renderer="kaggle")

In [None]:
# A missing VIP / CryoSleep flag is recorded as False.
for flag_column in ('VIP', 'CryoSleep'):
    dataset[flag_column] = dataset[flag_column].fillna(False)

In [None]:
# Transported counts for VIP vs. non-VIP passengers. Only the labelled training
# rows are plotted: the test rows in `dataset` carry `Transported` values
# copied from sample_submission.csv rather than real outcomes.
labelled_rows = dataset.iloc[:len(train_set)]

survivor_count_per_sex = px.histogram(labelled_rows, x="VIP", color="Transported", title='Passengers Transported by VIP or not', barmode='group', color_discrete_sequence=px.colors.qualitative.G10)
survivor_count_per_sex.show(renderer="kaggle")

In [None]:
# Fill missing CryoSleep flags with False (idempotent: a no-op once no NaN remains).
dataset['CryoSleep'] = dataset['CryoSleep'].fillna(False)

In [None]:
# Transported counts by cryosleep status. Only the labelled training rows are
# plotted: the test rows in `dataset` carry `Transported` values copied from
# sample_submission.csv rather than real outcomes.
labelled_rows = dataset.iloc[:len(train_set)]

survivor_count_per_sex = px.histogram(labelled_rows, x="CryoSleep", color="Transported", title='Passengers Transported by Cryosleep Condition', barmode='group', color_discrete_sequence=px.colors.qualitative.G10)
survivor_count_per_sex.show(renderer="kaggle")

In [None]:
# Passenger count per home planet (value_counts leaves NaN out by default).
dataset['HomePlanet'].value_counts()

In [None]:
# Passenger share per home planet, over train and test rows together.
# NOTE(review): the title mentions "Transported", but the chart is the overall
# home-planet split and does not use the target column.
passengers_by_survivence = dataset['HomePlanet'].value_counts()
passengers_by_survivence = passengers_by_survivence.rename_axis('HomePlanet').reset_index(name='Counts')

# Slice names come from the same frame as the counts, so the two cannot drift
# apart the way a separately computed list could.
fig = px.pie(passengers_by_survivence, values='Counts', names='HomePlanet', title='Passergers Transported by Home Planet', color_discrete_sequence=px.colors.qualitative.G10)
fig.show(renderer="kaggle")

In [None]:
# Missing cabins become empty strings before the column is split.
dataset['Cabin'] = dataset['Cabin'].fillna('')

# Inspect the columns produced by splitting on '/'; the result is only
# displayed here, not stored.
dataset['Cabin'].str.split('/', expand=True).info()

In [None]:
from sklearn.impute import SimpleImputer

# Break "deck/num/side" into three separate columns.
dataset[['deck', 'number', 'side']] = dataset['Cabin'].str.split('/', expand=True)

# Replace None entries in number / side with each column's most common value.
# `deck` is not passed to the imputer.
cabin_fields = ['number', 'side']
most_frequent_imputer = SimpleImputer(missing_values=None, strategy='most_frequent')
dataset[cabin_fields] = most_frequent_imputer.fit_transform(dataset[cabin_fields])

In [None]:
# Spell out the side codes: S -> Starboard, P -> Port. `map` turns any other
# value into NaN.
dataset['side'] = dataset['side'].map({'S':'Starboard', 'P':'Port'})

In [None]:
# Transported counts by ship side. Only the labelled training rows are
# plotted: the test rows in `dataset` carry `Transported` values copied from
# sample_submission.csv rather than real outcomes.
labelled_rows = dataset.iloc[:len(train_set)]

survivor_count_per_sex = px.histogram(labelled_rows, x="side", color="Transported", title='Passengers Transported by Ship Side', barmode='group', color_discrete_sequence=px.colors.qualitative.G10)
survivor_count_per_sex.show(renderer="kaggle")

In [None]:
# Stacked transported counts per deck. Only the labelled training rows are
# plotted: the test rows in `dataset` carry `Transported` values copied from
# sample_submission.csv rather than real outcomes.
labelled_rows = dataset.iloc[:len(train_set)]

fig = px.histogram(labelled_rows, x="deck", y="Transported", color='Transported',histfunc='count', title="Passengers Transported by Deck")

# Order the decks alphabetically on the x axis.
fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'})
fig.show(renderer="kaggle")

In [None]:
# Cabin numbers come out of the string split as text; cast them to integers so
# they behave as a numeric axis. This changes `dataset` itself, not a copy.
dataset['number'] = dataset['number'].astype('int32')

# Only the labelled training rows are plotted: the test rows in `dataset`
# carry `Transported` values copied from sample_submission.csv.
labelled_rows = dataset.iloc[:len(train_set)]

survivor_count_per_sex = px.histogram(labelled_rows, x="number", color="Transported", title='Passengers Transported by cabin number', color_discrete_sequence=px.colors.qualitative.G10)
survivor_count_per_sex.show(renderer="kaggle")

In [None]:
# Transported counts by destination. Only the labelled training rows are
# plotted: the test rows in `dataset` carry `Transported` values copied from
# sample_submission.csv rather than real outcomes.
labelled_rows = dataset.iloc[:len(train_set)]

survivor_count_per_sex = px.histogram(labelled_rows, x="Destination", color="Transported", title='Passengers Transported by Destination', color_discrete_sequence=px.colors.qualitative.G10)
survivor_count_per_sex.show(renderer="kaggle")

## Feature Engineering

In [None]:
# Re-check dtypes and non-null counts after the fills and the new cabin columns.
dataset.info()

### Drop columns that will not be used

In [None]:
# Remove the raw Cabin string and the passenger Name.
# Despite its name, `train` still holds both the train and the test rows.
train = dataset.drop(['Cabin', 'Name'], axis=1)

In [None]:
# Numeric columns handed to the mean imputer in the following cell.
currency_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age', 'number']

# Only the amenity bills are zero-filled. `Age` is left as NaN so the mean
# imputer that runs on `currency_features` afterwards can fill it; filling
# it with 0 here would record missing ages as newborns and leave that
# imputer with nothing to do.
spending_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

train[spending_features] = train[spending_features].fillna(0)

In [None]:
from sklearn.impute import SimpleImputer

# Replace whatever NaN values are still present in the numeric columns with
# the mean of their column.
age_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train[currency_features] = age_imputer.fit_transform(train[currency_features])

In [None]:
# Fill missing HomePlanet / Destination entries with each column's most
# common value.
planet_columns = ['HomePlanet', 'Destination']
most_frequent_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train[planet_columns] = most_frequent_imputer.fit_transform(train[planet_columns])

In [None]:
# Check dtypes and non-null counts after imputation.
train.info()

### Label Encoder

In [None]:
binary_categories = ['CryoSleep', 'VIP', 'Transported']

# Encode each flag as an integer: truthy values become 1, falsy values 0.
for category in binary_categories:
    train[category] = train[category].map(lambda value: int(bool(value)))

train[binary_categories]

In [None]:
# Target column for every row of `train`. The rows that came from the test set
# carry values copied from sample_submission.csv, not real labels.
dependent_variable = train['Transported']

In [None]:
from sklearn.preprocessing import OneHotEncoder

categories = ['Destination', 'HomePlanet', 'deck', 'side']

# Learn the category levels, then expand each column into one indicator
# column per level and convert the sparse result to a dense array.
encoder = OneHotEncoder()
encoder.fit(train[categories])
categorical_columns = encoder.transform(train[categories]).toarray()

categorical_columns[:5]

In [None]:
# Wrap the dense one-hot array in a DataFrame; no column names are passed, so
# the columns get default integer labels.
categorical_columns = pd.DataFrame(categorical_columns)

categorical_columns.head()

In [None]:
# Move both indices into ordinary columns so the two frames share a plain
# positional index: this adds an 'index' column to the one-hot frame and a
# 'PassengerId' column to the binary frame.
categorical_columns = categorical_columns.reset_index()

binary_columns = train[['CryoSleep', 'VIP']].reset_index()

In [None]:
# Put the one-hot columns and the binary flags side by side (column-wise concat).
categorical_columns = pd.concat([categorical_columns, binary_columns], axis = 1)

In [None]:
# Display the combined categorical / binary feature frame.
categorical_columns

In [None]:
# Discard the two helper columns that reset_index introduced.
categorical_columns = categorical_columns.drop(columns=['index', 'PassengerId'])

### Numerical Columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Rescale every numeric feature to the [0, 1] range.
scaled_values = scaler.fit_transform(train[numerical_features])

# Keep the *scaled* values. The frame gets a fresh positional index and the
# original feature names, so it lines up row-for-row with the
# categorical frame when the two are concatenated.
numerical_columns = pd.DataFrame(scaled_values, columns=numerical_features)
numerical_columns.head()

In [None]:
# Full design matrix: numeric features followed by the categorical / binary ones.
x_all = pd.concat([numerical_columns, categorical_columns], axis=1)

# Plain NumPy views of the features and the target for the estimators.
X = x_all.to_numpy()
y = dependent_variable.to_numpy()

In [None]:
# Display the assembled feature frame.
x_all

In [None]:
# The combined frame was built with the training rows first, so the split
# point is the number of training rows. Taking it from `train_set` avoids a
# hard-coded row count that breaks if the input file changes.
n_train = len(train_set)

X_train = X[:n_train, :]
X_test = X[n_train:, :]

y_train = y[:n_train]
# NOTE: y_test comes from sample_submission.csv, so it holds placeholder
# values rather than real labels and is not suitable for scoring.
y_test = y[n_train:]

In [None]:
def map_pred(result):
    """Return the truth value of ``result == 1`` (the encoded positive class)."""
    is_transported = (result == 1)
    return is_transported


def writePredictionResults(model, X_test, file_name):
    """Predict on ``X_test`` and save the result as a submission CSV.

    The predictions are indexed by the ``PassengerId`` column of the global
    ``test_set`` frame, converted to booleans with ``map_pred`` and written to
    ``file_name`` with a header row.
    """
    encoded_predictions = model.predict(X_test)
    passenger_ids = test_set['PassengerId']

    submission = pd.Series(encoded_predictions, index=passenger_ids, name='Transported')
    submission = submission.apply(map_pred)
    submission.to_csv(file_name, header=True)

### Logistic Regression

In [None]:
# Logistic-regression baseline with the solver's iteration cap raised to 7000.
log_reg = LogisticRegression(max_iter=7000)
log_reg.fit(X_train, y_train)

In [None]:
# Save the logistic-regression predictions for the test rows as a submission file.
writePredictionResults(log_reg, X_test, 'log_reg.csv')

### Random Forest

In [None]:
# Random forest of 100 trees using all CPU cores (n_jobs=-1); a node needs at
# least 10 samples to be split, and the seed is fixed at 0.
random_forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_samples_leaf=1, min_samples_split=10,random_state = 0)
random_forest.fit(X_train, y_train)



In [None]:
from sklearn.model_selection import cross_val_score

random_forest_pred = random_forest.predict(X_test)

# y_test is copied from sample_submission.csv, so scoring against it says
# nothing about model quality. Report 5-fold cross-validated accuracy on the
# labelled training rows instead. cross_val_score fits clones of the
# estimator, so the already-fitted `random_forest` is left untouched.
cv_scores = cross_val_score(random_forest, X_train, y_train, cv=5, scoring='accuracy')
print(cv_scores.mean())
writePredictionResults(random_forest, X_test, 'random_forest.csv')

### XGB with Optuna

In [None]:
# Keyword arguments passed to XGBClassifier.
# NOTE(review): no Optuna study is run in the visible notebook, so these look
# like fixed settings despite the name; 'gpu_hist' presumably needs a
# GPU-enabled XGBoost runtime — confirm.
optuna_params = {
 'objective': 'binary:logistic',
 'tree_method':'gpu_hist',}

In [None]:
N_SPLITS = 20
folds = StratifiedKFold(n_splits = N_SPLITS, shuffle = True)

# Cross-validate on the labelled training rows only. `X` / `y` also contain the
# test rows, whose targets are copied from sample_submission.csv; folding them
# in would train and score against made-up labels.
n_labelled = len(train_set)
X_labelled, y_labelled = X[:n_labelled], y[:n_labelled]

scores = []
for fold, (train_index, valid_index) in enumerate(folds.split(X_labelled, y_labelled)):
    # Fold-local names, so the X_train / y_train used by the other models are
    # not overwritten.
    X_fold_train, y_fold_train = X_labelled[train_index], y_labelled[train_index]
    X_valid, y_valid = X_labelled[valid_index], y_labelled[valid_index]

    model = XGBClassifier(**optuna_params)
    # Stop adding trees once validation logloss has not improved for 50 rounds.
    model.fit(X_fold_train, y_fold_train, eval_set = [(X_valid, y_valid)], eval_metric = ['logloss'], early_stopping_rounds = 50, verbose = False)

    pred = model.predict(X_valid)

    valid_score = accuracy_score(y_valid, pred)

    print("Fold:", fold, "Accuracy:", valid_score)
    scores.append(valid_score)

In [None]:
# Average of the per-fold validation accuracies, shown to four decimals.
print(f'Mean accuracy: {pd.Series(scores).mean():.4f}')

In [None]:
# `model` is whatever the cross-validation loop assigned last, i.e. the
# classifier fitted on the final fold; its test predictions are saved here.
writePredictionResults(model, X_test, 'XGB_with_Optuna.csv')
