# Spaceship Titanic

This is a rather long notebook. It is not an attempt to get a good ranking in the competition; it is my second attempt to experiment with and learn about feature analysis, imputation and engineering. As you can see, I have generated new features, but not all of them were used in the end: I discarded several of them for the final model. They were just part of the data exploration.

I've taken some ideas from other data scientists' notebooks, and I thank them a lot. Their work really helped me.

1. [Data load](#data_load)
2. [Review data](#review_data)
3. [Feature engineering](#feature_engineering)
4. [Preparation of the training and test datasets](#preparation_datasets)
5. [Training the model](#training_model)
6. [Make the prediction](#make_prediction)

In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

import warnings 
warnings.simplefilter(action='ignore')
pd.options.mode.chained_assignment = None 

<a id='data_load'></a>
# 1. Data load

In [None]:
# Load train dataset
train_df = pd.read_csv('../input/spaceship-titanic/train.csv', sep=',', encoding='UTF-8')

print(f'Dimensions - Train {train_df.shape}')

train_df.head()

In [None]:
# Load test dataset
test_df = pd.read_csv('../input/spaceship-titanic/test.csv', sep=',', encoding='UTF-8')

print(f'Dimensions - Test {test_df.shape}')

test_df.head()

In [None]:
# Target overview: summary statistics first, then the number of rows per class.
# A cell only renders its last expression automatically, so the describe()
# result has to be displayed explicitly or it is silently discarded.
display(train_df['Transported'].describe())
train_df.groupby(['Transported'])['Transported'].count().head()

<a id='review_data'></a>
# 2. Review data

In [None]:
# Stack train and test into a single frame so that every feature-engineering
# step is applied to both at once. Test rows get the placeholder label 'Unknown'.

test_df['Transported'] = 'Unknown'
combi_df = pd.concat((train_df, test_df), ignore_index=True)

for label, frame in (('Train dataset', train_df), ('Test dataset', test_df), ('Combined', combi_df)):
    print(f'{label}: {len(frame)}')

In [None]:
# Check missing values

combi_df.isna().sum()/len(combi_df)

<a id='feature_engineering'></a>
# 3. Feature engineering

In [None]:
# Make a copy for feature engineering
combi_2 = combi_df.copy()

In [None]:
# General logical values conversion:
#   True --> 1
#   False --> 0

## Feature: *PassengerId*

In [None]:
combi_2['PassengerId'].describe()

In [None]:
# PassengerId is split on '_': the first part is the group ID, the second is
# the passenger's number within that group.
id_parts = combi_2['PassengerId'].str.split('_')
combi_2['GroupID'] = id_parts.str[0]
combi_2['GroupNum'] = id_parts.str[1].astype(int)

In [None]:
# Number of members of each group, taken as the highest member number in the group
highest_member_num = combi_2.groupby('GroupID')['GroupNum'].transform('max')
combi_2['GroupMembers'] = highest_member_num

## Feature (new): *GroupMembers*

In [None]:
combi_2.groupby(['GroupMembers'])['GroupMembers'].count().sort_values(ascending = False).head(10)

In [None]:
# Let's see the relationship between GroupMembers and Transported
sns.countplot(x='GroupMembers', hue='Transported', palette='rocket', 
              data=combi_2[combi_2['Transported'] != 'Unknown'])

## Feature: *HomePlanet*

In [None]:
combi_2['HomePlanet'].describe()

In [None]:
combi_2.fillna('Unknown').groupby(['HomePlanet'])['HomePlanet'].count().head()

In [None]:
# I'll impute the missing values once all features are analyzed.

In [None]:
# Let's see the relationship between HomePlanet and Transported
sns.countplot(x='HomePlanet', hue='Transported', palette='rocket', 
              data=combi_2[combi_2['Transported'] != 'Unknown'])

## Feature: *CryoSleep*

In [None]:
combi_2['CryoSleep'].describe()

In [None]:
combi_2.fillna('Unknown').groupby(['CryoSleep'])['CryoSleep'].count().head()

In [None]:
# Impute missing data with the most frequent value (False)
combi_2.loc[combi_2['CryoSleep'].isna(), 'CryoSleep'] = False

In [None]:
# Let's see the relationship between CryoSleep and Transported
sns.countplot(x='CryoSleep', hue='Transported', palette='rocket',
              data=combi_2[combi_2['Transported'] != 'Unknown'])

# In case of CryoSleep=True, it's highly probably to be transported.

In [None]:
# Convert the logical values to integers (False -> 0, True -> 1).
# The explicit cast guarantees a numeric dtype: recent pandas versions no longer
# downcast an object column inside `replace`, and an object column cannot be
# fed to the model later on.
combi_2['CryoSleep'] = combi_2['CryoSleep'].replace({False:0, True:1}).astype(int)

## Feature: *Cabin*

In [None]:
combi_2['Cabin'].describe()

In [None]:
combi_2.groupby(['Cabin'])['Cabin'].count().sort_values(ascending = False).head(10)

In [None]:
# Impute the missing data with a fictitious value ('X/0/X').
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place call acts on an intermediate object and leaves
# the frame untouched when pandas Copy-on-Write is enabled.
combi_2['Cabin'] = combi_2['Cabin'].fillna('X/0/X')

In [None]:
# Cabin has the form 'deck/num/side': split it once and keep the three parts.
cabin_parts = combi_2['Cabin'].str.split('/', expand=True)
combi_2['Deck'] = cabin_parts[0]
combi_2['CabinNum'] = cabin_parts[1].astype(int)
combi_2['Side'] = cabin_parts[2]

# Number of passengers sharing each cabin
cabin_sizes = combi_2.groupby(['Cabin'])['PassengerId'].transform('count')
combi_2['CabinPassengers'] = cabin_sizes.astype(int)

## Feature (new): *CabinNum* 

In [None]:
# As @samuelcortinhas pointed out, CabinNum appears to be grouped into segments of 300 cabins.
# Perhaps this is due to the location in the ship, but that is only a guess.
labelled_rows = combi_2[combi_2['Transported'] != 'Unknown']

plt.figure(figsize=(15,4))
sns.histplot(data=labelled_rows, x='CabinNum', hue='Transported', binwidth=20, palette='rocket')

# Mark the boundaries between the 300-cabin segments
for segment_edge in range(300, 1801, 300):
    plt.vlines(segment_edge, ymin=0, ymax=200, color='black')
plt.xlim([0,2000])

In [None]:
# One-hot encode the 300-cabin segment that each cabin number falls in.
# Segment i covers [lower, upper); the first and the last segment are open-ended.
chunk_bounds = [-np.inf, 300, 600, 900, 1200, 1500, 1800, np.inf]
cabin_num = combi_2['CabinNum']

for chunk_id, (lower, upper) in enumerate(zip(chunk_bounds[:-1], chunk_bounds[1:]), start=1):
    in_chunk = (cabin_num >= lower) & (cabin_num < upper)
    combi_2[f'CabinChunk_{chunk_id}'] = in_chunk.astype(int)

## Feature (new): *Deck*

In [None]:
combi_2['Deck'].sort_values().unique()

In [None]:
combi_2.groupby(['Deck'])['Deck'].count().head(20)

In [None]:
# Let's see the relationship between Deck and Transported
plt.figure(figsize=(10,4))
sns.countplot(x='Deck', hue='Transported', palette='rocket', order=['A','B','C','D','E','F','G','T'],
              data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# I'll impute the missing values later.

## Feature (new): *Side*

In [None]:
combi_2.groupby(['Side'])['Side'].count().head()

In [None]:
# Let's see the relationship between Side and Transported
sns.countplot(x='Side', hue='Transported', palette='rocket',
              data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# One-hot encode Side while keeping the original column: get_dummies consumes
# 'Side', so a temporary copy is added first and renamed back afterwards.
combi_2 = (
    pd.get_dummies(combi_2.assign(Side_copy=combi_2['Side']), columns=['Side'], drop_first=False)
      .rename(columns={'Side_copy': 'Side'})
)
combi_2.columns

## Feature (new): *CabinPassengers*

In [None]:
combi_2.groupby(['CabinPassengers'])['CabinPassengers'].count().head(10)

In [None]:
# In case the cabin 'X/0/X', the value should be 0
combi_2.loc[combi_2['Cabin'] == 'X/0/X', 'CabinPassengers'] = 0

In [None]:
# Let's see the relationship between CabinPassengers and Transported
plt.figure(figsize=(10,4))

sns.countplot(x='CabinPassengers', hue='Transported', palette='rocket',
              data=combi_2[combi_2['Transported'] != 'Unknown'])

## Feature: *Destination*

In [None]:
combi_2['Destination'].describe()

In [None]:
combi_2.groupby(['Destination'])['Destination'].count().sort_values(ascending = False).head()

In [None]:
# Most of the passengers are going to TRAPPIST-1e, so that value is imputed to the missing data.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['Destination'] = combi_2['Destination'].fillna('TRAPPIST-1e')

In [None]:
# Let's see the relationship between Destination and Transported
sns.countplot(x='Destination', hue='Transported', palette='rocket',
              data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# One-hot encode Destination while keeping the original column: get_dummies
# consumes 'Destination', so a temporary copy is added first and renamed back.
combi_2 = (
    pd.get_dummies(combi_2.assign(Destination_copy=combi_2['Destination']), columns=['Destination'], drop_first=False)
      .rename(columns={'Destination_copy': 'Destination'})
)
combi_2.columns

## Feature: *Age*

In [None]:
combi_2['Age'].describe()

In [None]:
combi_2.groupby(['Age'])['Age'].count().sort_values(ascending = False).head(10)

In [None]:
# Impute missing data with a fictitious value (0). Other rows already hold that value.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['Age'] = combi_2['Age'].fillna(0)

In [None]:
# Let's see the relationship between Age and Transported
n_bins = int(combi_2['Age'].max()+1)
plt.figure(figsize=(10,4))
sns.histplot(x='Age', hue='Transported', stat='density', bins=n_bins, element='step', palette='rocket', 
             data=combi_2[combi_2['Transported'] != 'Unknown'])

# Kids are more likely to be transported

In [None]:
# Let's see the relationship between Age and Transported, this time on the
# train dataset as loaded (train_df), one bin per year and with a KDE overlay.
plt.figure(figsize=(12,4))
sns.histplot(data=train_df, x='Age', hue='Transported', binwidth=1, kde=True, palette='rocket')

# Kids are more likely to be transported

## Feature (new): *AgeGroup*

In [None]:
# Bucket the ages into an ordered numeric feature:
#   0 -> age 0 (which is also the value used to impute missing ages)
#   1 -> under 18, 2 -> 18 to 25, 3 -> 26 to 39, 4 -> 40 and over
# The conditions are checked from the oldest bucket down; the first match wins.
age = combi_2['Age']
combi_2['AgeGroup'] = np.select(
    [age >= 40, age >= 26, age >= 18, age > 0],
    [4, 3, 2, 1],
    default=0,
)

In [None]:
# Let's see the distribution
n_bins = int(combi_2['AgeGroup'].max()+1)
plt.figure(figsize=(10,4))

sns.countplot(x='AgeGroup', hue='Transported', palette='rocket',
              data=combi_2[combi_2['Transported'] != 'Unknown'])

## Feature: *VIP*

In [None]:
combi_2['VIP'].describe()

In [None]:
combi_2.groupby(['VIP'])['VIP'].count().head()

In [None]:
# Impute missing data with the most frequent value (False)
combi_2.loc[combi_2['VIP'].isna(), 'VIP'] = False

In [None]:
# Let's see the relationship between VIP and Transported
sns.countplot(x='VIP', hue='Transported', palette='rocket',
              data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# Convert the logical values to integers (False -> 0, True -> 1).
# The explicit cast guarantees a numeric dtype: recent pandas versions no longer
# downcast an object column inside `replace`, and an object column cannot be
# fed to the model later on.
combi_2['VIP'] = combi_2['VIP'].replace({False:0, True:1}).astype(int)

## Feature: *RoomService*

In [None]:
combi_2['RoomService'].describe()

In [None]:
combi_2.groupby(['RoomService'])['RoomService'].count().sort_values(ascending = False).head(10)

In [None]:
# Impute missing data with the most frequent value (0).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['RoomService'] = combi_2['RoomService'].fillna(0)

In [None]:
# Let's see the relationship between RoomService and Transported
sns.histplot(x='RoomService', hue='Transported', stat='density', bins=100, element='step', palette='rocket', 
             data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# Reduce the skew of the distribution: cap the values at the 80th percentile, then take log(1 + x).
room_service_cap = np.percentile(combi_2['RoomService'], 80)
room_service_capped = np.clip(combi_2['RoomService'], 0, room_service_cap)
combi_2['RoomService_log'] = np.log(1 + room_service_capped)

sns.histplot(data=combi_2[combi_2['Transported'] != 'Unknown'], x='RoomService_log', hue='Transported',
             stat='density', binwidth=1, element='step', palette='rocket')

## Feature: *FoodCourt*

In [None]:
combi_2['FoodCourt'].describe()

In [None]:
combi_2.groupby(['FoodCourt'])['FoodCourt'].count().sort_values(ascending=False).head(10)

In [None]:
# Number of missing FoodCourt values before imputation (this section is about
# FoodCourt; RoomService has already been handled in its own section).
combi_2['FoodCourt'].isna().sum()

In [None]:
# Impute missing data with the most frequent value (0).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['FoodCourt'] = combi_2['FoodCourt'].fillna(0)

In [None]:
# Let's see the relationship between FoodCourt and Transported
sns.histplot(x='FoodCourt', hue='Transported', stat='density', bins=100, element='step', palette='rocket', 
             data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# Reduce the skew of the distribution: cap the values at the 90th percentile, then take log(1 + x).
food_court_cap = np.percentile(combi_2['FoodCourt'], 90)
food_court_capped = np.clip(combi_2['FoodCourt'], 0, food_court_cap)
combi_2['FoodCourt_log'] = np.log(1 + food_court_capped)

sns.histplot(data=combi_2[combi_2['Transported'] != 'Unknown'], x='FoodCourt_log', hue='Transported',
             stat='density', binwidth=1, element='step', palette='rocket')

## Feature: *ShoppingMall*

In [None]:
combi_2['ShoppingMall'].describe()

In [None]:
combi_2.groupby(['ShoppingMall'])['ShoppingMall'].count().sort_values(ascending = False).head(10)

In [None]:
# Impute missing data with the most frequent value (0).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['ShoppingMall'] = combi_2['ShoppingMall'].fillna(0)

In [None]:
# Let's see the relationship between ShoppingMall and Transported
sns.histplot(x='ShoppingMall', hue='Transported', stat='density', bins=100, element='step', palette='rocket', 
             data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# Reduce the skew of the distribution: cap the values at the 90th percentile, then take log(1 + x).
shopping_mall_cap = np.percentile(combi_2['ShoppingMall'], 90)
shopping_mall_capped = np.clip(combi_2['ShoppingMall'], 0, shopping_mall_cap)
combi_2['ShoppingMall_log'] = np.log(1 + shopping_mall_capped)

sns.histplot(data=combi_2[combi_2['Transported'] != 'Unknown'], x='ShoppingMall_log', hue='Transported',
             stat='density', binwidth=1, element='step', palette='rocket')

## Feature: *Spa*

In [None]:
combi_2['Spa'].describe()

In [None]:
combi_2.groupby(['Spa'])['Spa'].count().sort_values(ascending=False).head(10)

In [None]:
# Impute missing data with the most frequent value (0).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['Spa'] = combi_2['Spa'].fillna(0)

In [None]:
# Let's see the relationship between Spa and Transported
sns.histplot(x='Spa', hue='Transported', stat='density', bins=100, element='step', palette='rocket', 
             data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# Reduce the skew of the distribution: cap the values at the 80th percentile, then take log(1 + x).
spa_cap = np.percentile(combi_2['Spa'], 80)
spa_capped = np.clip(combi_2['Spa'], 0, spa_cap)
combi_2['Spa_log'] = np.log(1 + spa_capped)

sns.histplot(data=combi_2[combi_2['Transported'] != 'Unknown'], x='Spa_log', hue='Transported',
             stat='density', binwidth=1, element='step', palette='rocket')

## Feature: *VRDeck*

In [None]:
combi_2['VRDeck'].describe()

In [None]:
combi_2.groupby(['VRDeck'])['VRDeck'].count().sort_values(ascending = False).head(10)

In [None]:
# Impute missing data with the most frequent value (0).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['VRDeck'] = combi_2['VRDeck'].fillna(0)

In [None]:
# Let's see the relationship between VRDeck and Transported
sns.histplot(x='VRDeck', hue='Transported', stat='density', bins=100, element='step', palette='rocket', 
             data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# Reduce the skew of the distribution: cap the values at the 80th percentile, then take log(1 + x).
vr_deck_cap = np.percentile(combi_2['VRDeck'], 80)
vr_deck_capped = np.clip(combi_2['VRDeck'], 0, vr_deck_cap)
combi_2['VRDeck_log'] = np.log(1 + vr_deck_capped)

sns.histplot(data=combi_2[combi_2['Transported'] != 'Unknown'], x='VRDeck_log', hue='Transported',
             stat='density', binwidth=1, element='step', palette='rocket')

## Feature (new): *Amenities*

In [None]:
# Total amount of the amenities
combi_2['Amenities'] = combi_2['RoomService'] + combi_2['FoodCourt'] + combi_2['ShoppingMall'] + combi_2['Spa'] + combi_2['VRDeck']

In [None]:
combi_2.groupby(['Amenities'])['Amenities'].count().sort_values(ascending = False).head(10)

In [None]:
# Let's see the relationship between the new feature and Transported
sns.histplot(x='Amenities', hue='Transported', stat='density', bins=100, element='step', palette='rocket', 
             data=combi_2[combi_2['Transported'] != 'Unknown'])

In [None]:
# Reduce the skew of the distribution: cap the values at the 80th percentile, then take log(1 + x).
amenities_cap = np.percentile(combi_2['Amenities'], 80)
amenities_capped = np.clip(combi_2['Amenities'], 0, amenities_cap)
combi_2['Amenities_log'] = np.log(1 + amenities_capped)

sns.histplot(data=combi_2[combi_2['Transported'] != 'Unknown'], x='Amenities_log', hue='Transported',
             stat='density', binwidth=1, element='step', palette='rocket')

## Feature (new): *Amenities_bin*

In [None]:
# Flag whether the passenger has spent any money at all:
# 0 when the total is exactly 0, 1 when it is positive (anything else stays NaN).
total_spent = combi_2['Amenities']
combi_2['Amenities_bin'] = np.select([total_spent == 0, total_spent > 0], [0.0, 1.0], default=np.nan)

In [None]:
combi_2.groupby(['Amenities_bin'])['Amenities_bin'].count().sort_values(ascending = False).head()

In [None]:
# Binary "has spent anything" flag for each individual amenity:
# 0 when the amount is exactly 0, 1 when it is positive (anything else stays NaN).
for amenity in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'):
    amount = combi_2[amenity]
    combi_2[f'{amenity}_bin'] = np.select([amount == 0, amount > 0], [0.0, 1.0], default=np.nan)

## Feature: *Name*

In [None]:
combi_2['Name'].describe()

In [None]:
combi_2.groupby(['Name'])['Name'].count().sort_values(ascending = False).head(10)

In [None]:
# Missing names become ' Unknown' (with a leading space) so that splitting on
# the space below yields the last name 'Unknown'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column, which leaves the frame untouched under pandas Copy-on-Write.
combi_2['Name'] = combi_2['Name'].fillna(' Unknown')

# The last name is the second space-separated token of the name.
combi_2['LastName'] = combi_2['Name'].apply(lambda x:x.split(" ")[1])

In [None]:
combi_2.groupby(['LastName'])['LastName'].count().sort_values(ascending = False).head(10)

## Feature (new): *FamilyMembers*

In [None]:
# Get the number of members of every family (peoplpe with the same last name and in the same group)
combi_2['FamilyMembers'] = combi_2.groupby(['LastName', 'GroupID'])['PassengerId'].transform('count')

In [None]:
combi_2.groupby(['FamilyMembers'])['FamilyMembers'].count().head(10)

In [None]:
# In case the LastName is 'Unknown', the value should be 0 (no relatives)
combi_2.loc[combi_2['LastName'] == 'Unknown', 'FamilyMembers'] = 0

In [None]:
# Let's see the relationship between FamilyMembers and Transported
sns.countplot(x='FamilyMembers', hue='Transported', palette='rocket',
              data=combi_2[combi_2['Transported'] != 'Unknown'])

## Feature (new): *FamilyID*

In [None]:
# Generate a family ID (last name + group ID). It is very close to the Group ID,
# but conceptually they are different.
combi_2['FamilyID'] = combi_2['LastName'] + '-' + combi_2['GroupID'] 

In [None]:
combi_2[['FamilyID', 'GroupID', 'FamilyMembers', 'GroupMembers']] \
    [(combi_2['GroupID'] == '9238')] \
    .sort_values('GroupID', ascending=False)

## Feature (new): *FamilyExpense*

In [None]:
# I see than the family expense is made by only one of the members (in general).
combi_2[['FamilyID', 'GroupID', 'FamilyMembers', 'GroupMembers', 'Amenities']] \
    [(combi_2['GroupID'] == '9238')] \
    .sort_values('GroupID', ascending=False)

In [None]:
combi_2['FamilyExpense'] = combi_2.groupby(['FamilyID'])['Amenities'].transform('sum')

## Feature (new): *IndividualExpense*

In [None]:
# It's more accurate the individual expense than the Amenities and the total family expense.
combi_2['IndividualExpense'] = combi_2['FamilyExpense'] / combi_2['FamilyMembers']

In [None]:
combi_2[['FamilyID', 'GroupID', 'FamilyMembers', 'GroupMembers', 'Amenities', 'FamilyExpense', 'IndividualExpense']] \
    [(combi_2['GroupID'] == '9238')] \
    .sort_values('GroupID', ascending=False)

In [None]:
# FamilyMembers is 0 for passengers whose last name is 'Unknown', so dividing by it
# yields inf (or NaN when the family expense is 0 as well). Reset every non-finite value to 0.
not_finite = ~np.isfinite(combi_2['IndividualExpense'])
combi_2.loc[not_finite, 'IndividualExpense'] = 0

## Feature: *Transported*

In [None]:
combi_2['Transported'].unique()

In [None]:
# Convert the label to integers: False -> 0, True -> 1, and 2 for the test rows
# whose label is the placeholder 'Unknown'.
# The explicit cast guarantees an integer dtype: recent pandas versions no longer
# downcast an object column inside `replace`.
combi_2['Transported'] = combi_2['Transported'].replace({False:0, True:1, 'Unknown':2}).astype(int)

<a id='impute_missing'></a>
# 3.1 Impute missing values

## Impute: *HomePlanet*

In [None]:
# Joint distribution of Group and HomePlanet
GHP_gb = combi_2.groupby(['GroupID','HomePlanet'])['HomePlanet'].size().unstack().fillna(0)
GHP_gb.head()

In [None]:
# Missing values imputation
# (Extracted from SAMUEL CORTINHAS notebook)
HP_bef = combi_2['HomePlanet'].isna().sum()

# Most frequent HomePlanet of every group, computed once instead of once per passenger
GHP_majority = GHP_gb.idxmax(axis=1)

# Passengers with missing HomePlanet and in a group with known HomePlanet
GHP_index = combi_2.index[combi_2['HomePlanet'].isna() & combi_2['GroupID'].isin(GHP_gb.index)]

# Fill corresponding missing values.
# GHP_index holds index labels, so both sides use label-based .loc
# (positional .iloc only matches while the index is a default 0..n-1 range).
combi_2.loc[GHP_index,'HomePlanet'] = combi_2.loc[GHP_index,'GroupID'].map(GHP_majority)

# Print number of missing values left
print('#HomePlanet missing values before:', HP_bef)
print('#HomePlanet missing values after:', combi_2['HomePlanet'].isna().sum())

In [None]:
# Joint distribution of CabinDeck and HomePlanet
CDHP_gb = combi_2.groupby(['Deck','HomePlanet'])['HomePlanet'].size().unstack().fillna(0)

# Heatmap of missing values
plt.figure(figsize=(10,4))
sns.heatmap(CDHP_gb.T, annot=True, fmt='g', cmap='rocket_r')

In [None]:
# Missing values before
HP_bef = combi_2['HomePlanet'].isna().sum()

# Decks A, B, C or T came from Europa
combi_2.loc[(combi_2['HomePlanet'].isna()) & (combi_2['Deck'].isin(['A', 'B', 'C', 'T'])), 'HomePlanet'] = 'Europa'

# Deck G came from Earth
combi_2.loc[(combi_2['HomePlanet'].isna()) & (combi_2['Deck']=='G'), 'HomePlanet'] = 'Earth'

# Print number of missing values left
print('#HomePlanet missing values before:',HP_bef)
print('#HomePlanet missing values after:', combi_2['HomePlanet'].isna().sum())

In [None]:
# Joint distribution of LastName and HomePlanet
SHP_gb = combi_2.groupby(['LastName','HomePlanet'])['HomePlanet'].size().unstack().fillna(0)

In [None]:
# Missing values before
HP_bef = combi_2['HomePlanet'].isna().sum()

# Most frequent HomePlanet of every last name, computed once instead of once per passenger
SHP_majority = SHP_gb.idxmax(axis=1)

# Passengers with missing HomePlanet and in a family with known HomePlanet.
# The placeholder last name 'Unknown' is not a real family, so those passengers
# are skipped here and left for the later imputation steps.
SHP_index = combi_2.index[combi_2['HomePlanet'].isna()
                          & (combi_2['LastName'] != 'Unknown')
                          & combi_2['LastName'].isin(SHP_gb.index)]

# Fill corresponding missing values.
# SHP_index holds index labels, so both sides use label-based .loc
# (positional .iloc only matches while the index is a default 0..n-1 range).
combi_2.loc[SHP_index,'HomePlanet'] = combi_2.loc[SHP_index,'LastName'].map(SHP_majority)

# Print number of missing values left
print('#HomePlanet missing values before:',HP_bef)
print('#HomePlanet missing values after:', combi_2['HomePlanet'].isna().sum())

In [None]:
# Only a few HomePlanet missing values left - let's look at them
combi_2[combi_2['HomePlanet'].isna()][['PassengerId','HomePlanet','Destination']]

# Joint distribution of HomePlanet and Destination
HPD_gb = combi_2.groupby(['HomePlanet','Destination'])['Destination'].size().unstack().fillna(0)

# Heatmap of missing values
plt.figure(figsize=(10,4))
sns.heatmap(HPD_gb.T, annot=True, fmt='g', cmap='rocket_r')

In [None]:
# Missing values before
HP_bef = combi_2['HomePlanet'].isna().sum()

# Fill remaining HomePlanet missing values with Earth (if not on deck D) or Mars (if on Deck D)
combi_2.loc[(combi_2['HomePlanet'].isna()) & ~(combi_2['Deck']=='D'), 'HomePlanet']='Earth'
combi_2.loc[(combi_2['HomePlanet'].isna()) & (combi_2['Deck']=='D'), 'HomePlanet']='Mars'

# Print number of missing values left
print('#HomePlanet missing values before:', HP_bef)
print('#HomePlanet missing values after:', combi_2['HomePlanet'].isna().sum())

In [None]:
# One-hot encode HomePlanet while keeping the original column: get_dummies
# consumes 'HomePlanet', so a temporary copy is added first and renamed back.
combi_2 = (
    pd.get_dummies(combi_2.assign(HomePlanet_copy=combi_2['HomePlanet']), columns=['HomePlanet'], drop_first=False)
      .rename(columns={'HomePlanet_copy': 'HomePlanet'})
)
combi_2.columns

## Impute: *Deck*

In [None]:
# Missing values imputation (deck=X)
# (Extracted from SAMUEL CORTINHAS notebook)
# As Samuel pointed, groups tend to be on the same cabin deck.

In [None]:
# Deck distribution per group, for groups with more than one member.
# Rows whose deck is the placeholder 'X' are left out of the distribution;
# otherwise 'X' could be picked as a group's majority deck and the missing
# value would be "imputed" with the placeholder itself.
deck_missing = combi_2['Deck'] == 'X'
GCD_gb=combi_2[~deck_missing & (combi_2['GroupMembers']>1)].groupby(['GroupID','Deck'])['Deck'].size().unstack().fillna(0)

# Missing values before
CD_bef=combi_2['Deck'][combi_2['Deck'] == 'X'].count()

# Most frequent known deck of every group, computed once instead of once per passenger
GCD_majority = GCD_gb.idxmax(axis=1)

# Passengers with missing Cabin deck and in a group with known majority Cabin deck
GCD_index=combi_2.index[deck_missing & combi_2['GroupID'].isin(GCD_gb.index)]

# Fill corresponding missing values.
# GCD_index holds index labels, so both sides use label-based .loc
# (positional .iloc only matches while the index is a default 0..n-1 range).
combi_2.loc[GCD_index,'Deck']=combi_2.loc[GCD_index,'GroupID'].map(GCD_majority)

# Print number of missing values left
print('#Deck missing values before:', CD_bef)
print('#Deck missing values after:', combi_2['Deck'][combi_2['Deck'] == 'X'].count())

In [None]:
# Joint distribution
combi_2.groupby(['HomePlanet','Deck'])['Deck'].size().unstack().fillna(0)

In [None]:
Deck_count = (combi_2['Deck'] == 'X').sum()

# Most likely deck for each home planet:
#   Mars -> F, Europa -> B (more or less), Earth -> G (more or less)
likely_deck_by_planet = {'Mars': 'F', 'Europa': 'B', 'Earth': 'G'}
for planet, deck in likely_deck_by_planet.items():
    needs_deck = (combi_2['Deck'] == 'X') & (combi_2['HomePlanet'] == planet)
    combi_2.loc[needs_deck, 'Deck'] = deck

# Print number of missing values left
print('#Deck missing values before:', Deck_count)
print('#Deck missing values after:', (combi_2['Deck'] == 'X').sum())

In [None]:
# Last check
combi_2.groupby(['Deck'])['Deck'].count().head(20)

In [None]:
# One-hot encode Deck while keeping the original column: get_dummies consumes
# 'Deck', so a temporary copy is added first and renamed back afterwards.
combi_2 = (
    pd.get_dummies(combi_2.assign(Deck_copy=combi_2['Deck']), columns=['Deck'], drop_first=False)
      .rename(columns={'Deck_copy': 'Deck'})
)
combi_2.columns

<a id='feature_selection'></a>
# 3.2 Feature selection

In [None]:
# Last checks. There is no object type
combi_2.select_dtypes('O').columns

In [None]:
# Check missing values
nan_cols = [i for i in combi_2.columns if combi_2[i].isnull().sum() > 0]
combi_2[nan_cols].isna().sum()/len(combi_2)

In [None]:
# Let's select the features to be included in the model
combi_2.columns

In [None]:
# Keep only the labelled rows (Transported != 2) and remove the features that
# won't be used for prediction.
drop_list = ['Name', 'LastName', 'PassengerId', 'GroupNum', 'GroupID', 'Cabin', 'Deck', 'Side', 'Destination', 'HomePlanet', 'FamilyID', 'CabinNum']
train_selection = combi_2.loc[combi_2['Transported'] != 2].drop(columns=drop_list)

In [None]:
from xgboost import XGBClassifier

# Fit a quick XGBoost model on all candidate features and plot the features
# whose importance is above `threshold`, sorted from least to most important.
threshold = 0.01  # 0.01=1%

X_train_sel = train_selection.loc[:, train_selection.columns != 'Transported']
y_train_sel = train_selection['Transported']

model_feat = XGBClassifier(n_estimators=300,
                           max_depth=3,
                           objective='binary:hinge',
                           subsample=0.6,
                           colsample_bytree=0.5,
                           eta=0.1,
                           eval_metric='error',
                           use_label_encoder=False,
                           n_jobs=-1,
                           random_state=101)
model_feat.fit(X_train_sel, y_train_sel)

features = X_train_sel.columns
importances_feat = model_feat.feature_importances_

# Apply the same mask to the names and to the importances. The sort indices
# refer to positions in the FILTERED arrays, so looking them up in the
# unfiltered `features` would label the bars with the wrong feature names.
relevant = importances_feat > threshold
features_sel = features[relevant]
importances = importances_feat[relevant]
indices = np.argsort(importances)
range_idx = range(len(indices))
height = max(1, len(indices) // 2)   # a figure height of 0 is not valid

plt.figure(figsize=(10,height))
plt.title('Feature Importances')
plt.barh(range_idx, importances[indices], align='center', fc='palevioletred')
plt.yticks(range_idx, [features_sel[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

# Relevant features, from least to most important
feature_list = [features_sel[i] for i in indices]
print(feature_list)

In [None]:
# correlation between all features (original + generated)
plt.figure(figsize=(15, 10))
sns.heatmap(train_selection.corr(), vmin=-1, vmax=1, cmap='BrBG')

In [None]:
# Let's check the correlation of 'Transported' with the rest of features
plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(train_selection.corr().abs()[['Transported']].sort_values(by='Transported', ascending=False), 
                      vmin=0, vmax=1, annot=True, cmap='rocket_r')

# As we can see, there are not good features. The variables with higher correlation are closely related to CryoSleep 
# (as we can see in the heatmap), so they do not give much information either. 

In [None]:
# We take the most relevant features, based on the previous analysis
cols_sel = ['Transported']

cols_sel = cols_sel + ['CryoSleep', 'VIP']
# 'AgeGroup' is the age-bucket feature built in this notebook. The name used
# here before, 'AgeKid', is never created, and DataFrame.filter silently
# ignores unknown labels, so no age-bucket feature reached the model.
cols_sel = cols_sel + ['Age', 'AgeGroup']
cols_sel = cols_sel + ['GroupMembers', 'CabinPassengers']    # ['FamilyMembers']

cols_sel = cols_sel + ['Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']
cols_sel = cols_sel + ['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars']

cols_sel = cols_sel + ['Side_P', 'Side_S', 'Side_X']
cols_sel = cols_sel + ['Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T']
cols_sel = cols_sel + ['CabinChunk_1', 'CabinChunk_2', 'CabinChunk_3', 'CabinChunk_4', 'CabinChunk_5', 'CabinChunk_6', 'CabinChunk_7']

#cols_sel = cols_sel + ['RoomService_bin', 'FoodCourt_bin', 'ShoppingMall_bin', 'Spa_bin', 'VRDeck_bin']
#cols_sel = cols_sel + ['RoomService_log', 'FoodCourt_log', 'ShoppingMall_log', 'Spa_log', 'VRDeck_log'] 
cols_sel = cols_sel + ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cols_sel = cols_sel + ['FamilyExpense', 'Amenities_log']    # ['IndividualExpense', 'Amenities_bin']

# Report any selected name that does not exist, since filter() would drop it without notice.
missing_cols = [col for col in cols_sel if col not in combi_2.columns]
if missing_cols:
    print(f'WARNING - selected features not found in combi_2: {missing_cols}')

combi_3 = combi_2.filter(cols_sel)
combi_3.columns

<a id='preparation_datasets'></a>
# 4. Preparation of the training and test datasets

In [None]:
combi_final = combi_3.copy()

# Labelled rows (Transported != 2): used as a whole for the final fit...
train_final = combi_final.loc[combi_final['Transported'] != 2]
X_train_pred = train_final.drop(columns='Transported')
y_train_pred = train_final['Transported']

# ...and split 80/20 into train and validation sets to evaluate the model.
train_set, validation_set = train_test_split(train_final, test_size=0.2, random_state=101)
X_train, y_train = train_set.drop(columns='Transported'), train_set['Transported']
X_validation, y_validation = validation_set.drop(columns='Transported'), validation_set['Transported']

# Test dataset: the rows whose label was coded as 2, without the label column
test_final = combi_final.loc[combi_final['Transported'] == 2].drop(columns='Transported')
X_test = test_final

for frame, frame_name in ((combi_final, "combi_final"), (X_train, "X_train"),
                          (X_validation, "X_validation"), (X_test, "X_test")):
    print(len(frame), frame_name)

<a id='training_model'></a>
# 5. Training the model

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
num_f = len(X_train.columns)
print(f'# features: {num_f}')

In [None]:
classifier = XGBClassifier(n_estimators=200,
                           max_depth=4,
                           objective='binary:hinge',
                           subsample=0.6,
                           colsample_bytree=0.6,
                           eta=0.1,
                           eval_metric='error',
                           use_label_encoder=False,
                           n_jobs=-1,
                           random_state=101)

model = classifier.fit(X_train, y_train)

predicted_train = model.predict(X_train)
predicted_validation = model.predict(X_validation)

print(f"accuracy_score for train = {accuracy_score(y_train, predicted_train)}")
print(f"accuracy_score for validation = {accuracy_score(y_validation, predicted_validation)}")

<a id='make_prediction'></a>
# 6. Make prediction

In [None]:
# Train again, with the whole training dataset (train + validation)
model = classifier.fit(X_train_pred, y_train_pred)

In [None]:
# Build the submission frame: one row per test passenger, with the predicted
# class converted back from 0/1 to False/True.
prediction = pd.DataFrame(
    {'PassengerId': test_df['PassengerId'], 'Transported': model.predict(X_test)},
    index=test_df.index,
)

prediction['Transported'] = prediction['Transported'].replace({0:False, 1:True})

In [None]:
prediction.to_csv('submission.csv', sep=',', index=False)