In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_file in filenames:
        full_path = os.path.join(dirname, input_file)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center>
<img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRM0nCU7nbBzEKPg7RdAUe1sJM98rGo8FJSQQIdgj1rA0IrwBFf2fzw1oLAnK65pSvaBjk&usqp=CAU" width=500 height=500 />
</center>

<a id="1"></a> <br>
# 1. Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import plot_confusion_matrix,classification_report,confusion_matrix,accuracy_score

<a id="1"></a> <br>
# 2. Import Dataset

In [None]:
# Read both competition splits, then stack them (train rows first) under a
# fresh 0..n-1 index so later cells can transform them together.
df_train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
df_data = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Preview the first five rows of the combined frame
df_data.head(n=5)

<span style="font-family:cursive; color:black;font-size:16px">

Feature descriptions:</span>

> * **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
> * **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
> * **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
> * **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
> * **Destination** - The planet the passenger will be debarking to.
> * **Age** - The age of the passenger.
> * **VIP** - Whether the passenger has paid for special VIP service during the voyage.
> * **RoomService**, **FoodCourt**, **ShoppingMall**, **Spa**, **VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
> * **Name** - The first and last names of the passenger.
> * **Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

**Missing values**

In [None]:
# Number of missing entries in each column
df_data.isna().sum()

**Duplicates**

In [None]:
# Count rows that are exact duplicates of an earlier row
duplicated_rows = df_data.duplicated().sum()
print('Sum Of Duplicated Data : {}'.format(duplicated_rows))

**Cardinality of features**

In [None]:
# Number of distinct (non-missing) values in each column
df_data.nunique(axis=0)

**Data types**

In [None]:
# Show the dtype pandas inferred for each column of the combined frame
df_data.dtypes

Machine learning models usually do not work on text, so we need to convert the data into numeric (int64 or float64). This will be done later.

<a id="1"></a> <br>
# 3. Exploratory Data Analysis

In [None]:
# Pie chart of the target classes in the training split
plt.figure(figsize=(8, 8))
target_counts = df_train['Transported'].value_counts()
pie_ax = target_counts.plot(kind='pie', shadow=True, explode=[0.05, 0.05], autopct='%1.1f%%')
pie_ax.set_title("Target distribution")

**Continuous features**

In [None]:
# Age histogram (1-year bins, KDE overlay) split by the target
plt.figure(figsize=(12, 5))
age_ax = sns.histplot(data=df_train, x='Age', hue='Transported', binwidth=1, kde=True)
age_ax.set_title('Age distribution')
age_ax.set_xlabel('Age (years)')

As we can see : 
* 0-18 year olds were **more** likely to be transported than not.
* 18-25 year olds were **less** likely to be transported than not.
* Passengers aged 25 and over were about **equally** likely to be transported as not.

So we can create a new feature that indicates whether the passenger is a child, adolescent or adult.

In [None]:
# Cost features (amenity spend columns; this list is reused by later cells)
cost_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Plot Cost features: one histogram per amenity, split by the target
fig=plt.figure(figsize=(8,16))
for i,feature in enumerate(cost_features):
    ax=fig.add_subplot(len(cost_features),1,i+1)
    # seaborn selects the target axes through `ax=`; `axes=` is forwarded to
    # matplotlib as an artist property and only worked because `ax` happened
    # to be the current axes.
    sns.histplot(x= feature,data=df_train, hue ='Transported', bins=30,kde=True, ax=ax)
    # Truncate the y-axis on this panel explicitly instead of via pyplot state
    ax.set_ylim([0,100])
    ax.set_title(feature)
fig.tight_layout()  # Improves appearance a bit
plt.show()

As we can see:
* Most people don't spend any money, and people who were transported tended to spend less.
* RoomService, Spa and VRDeck have different distributions to FoodCourt and ShoppingMall .

**Categorical features**

In [None]:
# Categorical features
Catego_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Plot categorical features: one count plot per feature, split by the target
fig=plt.figure(figsize=(8,16))
for i,feature in enumerate(Catego_features):
    # One row per feature (a fixed 5-row grid left an empty panel for 4 features)
    ax=fig.add_subplot(len(Catego_features),1,i+1)
    # `ax=` is seaborn's parameter for choosing the axes to draw on
    sns.countplot(x= feature,data=df_train, hue ='Transported', ax=ax,palette = "cool_r")
    ax.set_title(feature)
fig.tight_layout()  # Improves appearance a bit
plt.show()

As we can see:
* VIP does not appear to be a useful feature, whereas CryoSleep appears to be a very useful one.

<a id="1"></a> <br>
# 4. Feature engineering

Bin age feature into groups. This will be helpful for filling missing values.

In [None]:
# New feature--Age_group
def _age_group(age):
    """Return the age-band label for `age`, or None when `age` is missing (NaN).

    Every comparison with NaN is False, so a missing age falls through all
    branches and yields None.
    """
    if age < 12:
        return '-12_Age'
    if 12 <= age < 18:
        return '12-18_Age'
    if 18 <= age < 25:
        return '18-25_Age'
    if 25 <= age < 35:
        return '25-35_Age'
    if 35 <= age < 50:
        return '35-50_Age'
    if age >= 50:
        return '+50_Age'
    return None

# The column is created directly from Age; no NaN pre-initialisation is needed.
df_data['Age_group'] = df_data['Age'].apply(_age_group)

In [None]:
# Count of labelled (training) passengers per age group, split by the target
plt.figure(figsize=(12, 5))
age_group_order = ['-12_Age', '12-18_Age', '18-25_Age', '25-35_Age', '35-50_Age', '+50_Age']
sns.countplot(
    data=df_data[df_data['Transported'].notnull()],
    x='Age_group',
    hue='Transported',
    order=age_group_order,
    palette="cool_r",
)
plt.title('Age group distribution')

In [None]:
# New features --Cost (total spend across all amenities; NaNs are skipped by sum)
# and No_Cost (1 when the total is exactly zero, else 0)
total_spend = df_data[cost_features].sum(axis=1)
df_data['Cost'] = total_spend
df_data['No_Cost'] = df_data['Cost'].eq(0).astype(int)

In [None]:
# Left: histogram of total spend (axes truncated); right: zero-spend indicator.
# Only rows with a known target are plotted.
labelled_rows = df_data[df_data['Transported'].notnull()]
fig = plt.figure(figsize=(12, 5))

cost_ax = plt.subplot(1, 2, 1)
sns.histplot(data=labelled_rows, x='Cost', hue='Transported', bins=200, ax=cost_ax)
cost_ax.set_title('Total Cost (truncated)')
cost_ax.set_ylim([0, 200])
cost_ax.set_xlim([0, 20000])

no_cost_ax = plt.subplot(1, 2, 2)
sns.countplot(data=labelled_rows, x='No_Cost', hue='Transported', palette="cool_r", ax=no_cost_ax)
no_cost_ax.set_title('No Cost indicator')
fig.tight_layout()

Extract passenger group and group size from PassengerId.

In [None]:
# New features --Passenger_Group and Passenger_Group_Size
# PassengerId has the form gggg_pp; the part before the underscore is the group number.
df_data['Passenger_Group'] = df_data['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

# Count the members of each group once and look the size up per row
# (calling value_counts() inside apply recomputed the whole table for every row).
group_sizes = df_data['Passenger_Group'].value_counts()
df_data['Passenger_Group_Size'] = df_data['Passenger_Group'].map(group_sizes)

In [None]:
# Plot distribution of Passenger_Group and Passenger_Group_Size features
# Keep a handle on this cell's figure so tight_layout() below acts on it,
# not on a `fig` left over from an earlier cell.
fig=plt.figure(figsize=(20,4))
plt.subplot(1,2,1)
sns.histplot(data=df_data[df_data.Transported.isnull()==False], x='Passenger_Group', hue='Transported', binwidth=1)
plt.title('Passenger_Group')

plt.subplot(1,2,2)
sns.countplot(data=df_data[df_data.Transported.isnull()==False],palette = "cool_r", x='Passenger_Group_Size', hue='Transported')
plt.title('Passenger_Group_Size')
fig.tight_layout()

In [None]:
# New feature --Alone: 1 when the passenger is the only member of their group
travels_alone = df_data['Passenger_Group_Size'].eq(1)
df_data['Alone'] = travels_alone.astype(int)

In [None]:
# Count of labelled passengers travelling alone vs. in a group, split by the target
plt.figure(figsize=(10, 4))
alone_ax = sns.countplot(
    data=df_data[df_data['Transported'].notnull()],
    x='Alone',
    hue='Transported',
    palette="cool_r",
)
alone_ax.set_title('Passenger travelling Alone or not')
alone_ax.set_ylim([0, 3000])

Extract deck, number and side from cabin feature.

In [None]:
# Replace NaN's with a placeholder ('Z/9999/Z') for now so every Cabin value can be split.
# Assign the result back: an in-place fillna on a selected column is a chained
# operation that does not update df_data under pandas Copy-on-Write.
df_data['Cabin'] = df_data['Cabin'].fillna('Z/9999/Z')

In [None]:
# New features --Cabin_deck, Cabin_number and Cabin_side
# Cabin has the form deck/num/side; split it once into three columns.
cabin_parts = df_data['Cabin'].str.split('/', expand=True)
df_data['Cabin_deck'] = cabin_parts[0]
df_data['Cabin_number'] = cabin_parts[1].astype(int)
df_data['Cabin_side'] = cabin_parts[2]

In [None]:
# Turn the placeholder values ('Z', 9999, 'Z') back into NaN (filled properly later)
df_data['Cabin_deck'] = df_data['Cabin_deck'].mask(df_data['Cabin_deck'] == 'Z')
df_data['Cabin_number'] = df_data['Cabin_number'].mask(df_data['Cabin_number'] == 9999)
df_data['Cabin_side'] = df_data['Cabin_side'].mask(df_data['Cabin_side'] == 'Z')

In [None]:
# The raw Cabin column has been split into deck/number/side, so remove it
df_data = df_data.drop(columns=['Cabin'])

In [None]:
# Three stacked panels for labelled passengers: cabin deck, cabin number, cabin side
labelled_rows = df_data[df_data['Transported'].notnull()]
fig = plt.figure(figsize=(15, 15))

plt.subplot(3, 1, 1)
sns.countplot(data=labelled_rows, x='Cabin_deck', hue='Transported', palette="cool_r",
              order=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'])
plt.title('Cabin deck')

plt.subplot(3, 1, 2)
sns.histplot(data=labelled_rows, x='Cabin_number', hue='Transported', binwidth=20)
# Vertical guides every 300 cabins (300, 600, ..., 1800)
for boundary in range(300, 1801, 300):
    plt.vlines(boundary, ymin=0, ymax=200, color='blue')
plt.title('Cabin number')
plt.xlim([0, 2000])

plt.subplot(3, 1, 3)
sns.countplot(data=labelled_rows, x='Cabin_side', hue='Transported', palette="cool_r")
plt.title('Cabin side')
fig.tight_layout()

As we can see, Cabin_number is grouped into collections of 300 cabins. This means we can compress this feature into a categorical one that indicates which collection each passenger is in. Furthermore, the cabin deck 'T' seems to be an outlier.

In [None]:
# New features: one indicator column per block of 300 cabin numbers.
# Rows with a missing Cabin_number get 0 in every indicator (comparisons with NaN are False).
cabin_no = df_data['Cabin_number']
df_data['Cabin_Part1'] = cabin_no.lt(300).astype(int)
for part_no, lower in enumerate(range(300, 1800, 300), start=2):
    in_block = cabin_no.ge(lower) & cabin_no.lt(lower + 300)
    df_data['Cabin_Part' + str(part_no)] = in_block.astype(int)
df_data['Cabin_Part7'] = cabin_no.ge(1800).astype(int)

**Last name**

In [None]:
# Replace NaN's with a placeholder name for now so every Name can be split.
# Assign back instead of using a chained in-place replace, which does not
# update df_data under pandas Copy-on-Write.
df_data['Name'] = df_data['Name'].fillna('Unknown Unknown')

# New feature - Last_Name (second whitespace-separated token of Name)
df_data['Last_Name']=df_data['Name'].apply(lambda x: x.split()[1])

# New feature - Family_Size: number of passengers sharing the last name.
# Count once and map per row rather than recomputing value_counts() for every row.
last_name_counts = df_data['Last_Name'].value_counts()
df_data['Family_Size'] = df_data['Last_Name'].map(last_name_counts)

In [None]:
# Put NaN's back in (we will fill these later).
# Assign the result back: a chained in-place replace on a selected column
# does not update df_data under pandas Copy-on-Write.
df_data['Last_Name'] = df_data['Last_Name'].replace('Unknown', np.nan)

# A "family" of more than 100 members is treated as the 'Unknown' placeholder
# bucket rather than a real family, so its size is reset to missing.
df_data['Family_Size'] = df_data['Family_Size'].mask(df_data['Family_Size'] > 100)

In [None]:
# Last_Name and Family_Size have been extracted, so remove the raw Name column
df_data = df_data.drop(columns=['Name'])

In [None]:
# Count of labelled passengers per family size, split by the target
plt.figure(figsize=(12, 4))
family_ax = sns.countplot(
    data=df_data[df_data['Transported'].notnull()],
    x='Family_Size',
    hue='Transported',
    palette="cool_r",
)
family_ax.set_title('Family Size')

<a id="1"></a> <br>
# 5. Missing Values

In [None]:
# Missing entries per column after feature engineering
df_data.isna().sum()

In [None]:
# Summary table: count and percentage of missing entries for each feature that
# has any (the target is excluded).
null_counts = df_data.isnull().sum()
missing_values = list(null_counts[null_counts > 0].index)
missing_values.remove('Transported')
S = pd.DataFrame(null_counts[missing_values], columns=['Number Of Missing'])
S['Percentage Of Missing'] = (S['Number Of Missing'] * 100 / len(df_data)).round(2)
S

In [None]:
# Countplot of the number of missing values per passenger.
# NaN_count is a temporary helper column: added for the plot, removed at the end.
df_data['NaN_count'] = df_data.isna().sum(axis=1)
plt.figure(figsize=(12, 5))
nan_ax = sns.countplot(data=df_data, x='NaN_count', hue='Transported', palette="cool_r")
nan_ax.set_title('Number of missing entries by passenger')
del df_data['NaN_count']

*As we can see:*

Missing values are independent of the target and about 25% of all passengers have at least 1 missing value.

The **easiest** way to deal with missing values is to use the **median** for continuous features and the **mode** for categorical features. To improve the accuracy of our models, however, we need to look for patterns within the missing data. The way to do this is by looking at the **joint distribution** of features.

**HomePlanet:**

**HomePlanet and Passenger_Group**


In [None]:
# Joint distribution of Passenger_Group and HomePlanet:
# rows are groups, columns are planets, cells are passenger counts (0 where absent)
idx1 = (
    df_data.groupby(['Passenger_Group', 'HomePlanet'])['HomePlanet']
    .size()
    .unstack(fill_value=0)
    .astype(int)
)
idx1.head()

In [None]:
# For each group, count how many distinct home planets appear among its members.
# Pass the values as `x=` explicitly: the first positional argument of
# countplot is `data` in newer seaborn releases.
sns.countplot(x=(idx1>0).sum(axis=1))
plt.title('Number of unique home planets per group')

This shows that everyone in the same group comes from the same home planet. So we can fill the missing HomePlanet values according to the group.

In [None]:
# Missing values before
before_HomePlanet_miss = df_data.HomePlanet.isnull().sum()

# Passengers with missing HomePlanet and in a group with known HomePlanet
planet_missing = df_data['HomePlanet'].isnull()
idx2 = df_data.index[planet_missing & df_data['Passenger_Group'].isin(idx1.index)]

# Most common planet of each group, computed once (it was previously recomputed
# for every passenger being filled)
group_planet = idx1.idxmax(axis=1)

# Fill corresponding missing values
df_data.loc[idx2,'HomePlanet'] = df_data.loc[idx2,'Passenger_Group'].map(group_planet)

# Print number of missing values left
print('Number of HomePlanet missing values before:',before_HomePlanet_miss)
print('Number of HomePlanet missing values after:',df_data['HomePlanet'].isnull().sum())

**HomePlanet and CabinDeck**

In [None]:
# Joint distribution of Cabin_deck and HomePlanet:
# rows are decks, columns are planets, cells are passenger counts (0 where absent)
idx3 = (
    df_data.groupby(['Cabin_deck', 'HomePlanet'])['HomePlanet']
    .size()
    .unstack(fill_value=0)
    .astype(int)
)
idx3.head()

In [None]:
# Heatmap of the deck-by-planet passenger counts (planets as rows)
plt.figure(figsize=(10, 4))
planet_by_deck = idx3.transpose()
sns.heatmap(planet_by_deck, annot=True, fmt='g', cmap='winter')

*As we can see:*
* Passengers on decks A, B, C or T came from Europa.
* Passengers on deck G came from Earth.
* Passengers on decks D, E or F came from multiple planets.

In [None]:
# Missing values before
before_HomePlanet_miss = df_data['HomePlanet'].isnull().sum()

# Decks A, B, C and T are filled with Europa; deck G with Earth
deck_to_planet = {'Europa': ['A', 'B', 'C', 'T'], 'Earth': ['G']}
for planet, decks in deck_to_planet.items():
    fill_rows = df_data['HomePlanet'].isnull() & df_data['Cabin_deck'].isin(decks)
    df_data.loc[fill_rows, 'HomePlanet'] = planet

# Print number of missing values left
print('Number of HomePlanet missing values before:', before_HomePlanet_miss)
print('Number of HomePlanet missing values after:', df_data['HomePlanet'].isnull().sum())

**HomePlanet and Last_Name**

In [None]:
# Joint distribution of Last_Name and HomePlanet:
# rows are last names, columns are planets, cells are passenger counts (0 where absent)
idx4 = (
    df_data.groupby(['Last_Name', 'HomePlanet'])['HomePlanet']
    .size()
    .unstack(fill_value=0)
    .astype(int)
)
idx4.head()

In [None]:
# Countplot of the number of distinct home planets seen for each last name.
# Pass the values as `x=` explicitly: the first positional argument of
# countplot is `data` in newer seaborn releases.
plt.figure(figsize=(10,4))
sns.countplot(x=(idx4>0).sum(axis=1))
plt.title('Number of unique planets per LastName')

**Excellent!** Everyone with the same LastName comes from the same home planet.

In [None]:
# Missing values before
before_HomePlanet_miss = df_data.HomePlanet.isnull().sum()

# Passengers with missing HomePlanet and in a family with known HomePlanet
planet_missing = df_data['HomePlanet'].isna()
idx5 = df_data.index[planet_missing & df_data['Last_Name'].isin(idx4.index)]

# Most common planet of each last name, computed once
family_planet = idx4.idxmax(axis=1)

# Fill corresponding missing values. idx5 holds index *labels*, so select with
# .loc; .iloc only gave the same rows while the index was a fresh RangeIndex.
df_data.loc[idx5,'HomePlanet']=df_data.loc[idx5,'Last_Name'].map(family_planet)

# Print number of missing values left
print('Number of HomePlanet missing values before:',before_HomePlanet_miss)
print('Number of HomePlanet missing values after:',df_data['HomePlanet'].isnull().sum())

In [None]:
# Show the passengers whose HomePlanet is still missing, with the columns used to fill them next
planet_still_missing = df_data['HomePlanet'].isnull()
columns_to_show = ['PassengerId', 'HomePlanet', 'Destination', 'Cabin_deck']
df_data.loc[planet_still_missing, columns_to_show]

In [None]:
# Joint distribution of Destination and HomePlanet:
# rows are destinations, columns are planets, cells are passenger counts (0 where absent)
idx5 = (
    df_data.groupby(['Destination', 'HomePlanet'])['HomePlanet']
    .size()
    .unstack(fill_value=0)
    .astype(int)
)
idx5.head()

In [None]:
# Heatmap of the destination-by-planet passenger counts (planets as rows)
plt.figure(figsize=(10, 4))
planet_by_destination = idx5.transpose()
sns.heatmap(planet_by_destination, annot=True, fmt='g', cmap='winter')

Most people heading towards TRAPPIST-1e came from Earth so it makes sense to guess they came from there. But remember from earlier, no one on deck D came from Earth so we need to filter these out.

In [None]:
# Missing values before
before_HomePlanet_miss = df_data['HomePlanet'].isnull().sum()

# Remaining gaps: Mars for passengers on deck D, Earth for everyone else
# (including passengers whose deck is itself missing)
planet_missing = df_data['HomePlanet'].isnull()
on_deck_d = df_data['Cabin_deck'] == 'D'
df_data.loc[planet_missing & ~on_deck_d, 'HomePlanet'] = 'Earth'
df_data.loc[planet_missing & on_deck_d, 'HomePlanet'] = 'Mars'

# Print number of missing values left
print('Number of HomePlanet missing values before:', before_HomePlanet_miss)
print('Number of HomePlanet missing values after:', df_data['HomePlanet'].isnull().sum())

**Great.** Mission (filling the missing HomePlanet values) complete.

**Destination:**

In [None]:
# Share of each destination as a percentage of all rows (rows with a missing
# Destination count in the denominator)
df_data['Destination'].value_counts().div(len(df_data['Destination'])).mul(100)

Since about 68% of the values in the Destination column are TRAPPIST-1e, we use the mode to fill the missing values.

In [None]:
# Missing values before
before_destination_miss = df_data['Destination'].isnull().sum()

# Fill every missing Destination with the most frequent value
df_data['Destination'] = df_data['Destination'].fillna('TRAPPIST-1e')

# Print number of missing values left
print('Number of Destination missing values before:', before_destination_miss)
print('Number of Destination missing values after:', df_data['Destination'].isnull().sum())

**Last_Name and Passenger_Group:**

We fill the missing Last Names because we will use Last Names later to fill missing values of other features. It also means we can improve the accuracy of the family size feature.

In [None]:
# Joint distribution of Passenger_Group and Last_Name, restricted to groups
# with more than one member: rows are groups, columns are last names
multi_member_rows = df_data[df_data['Passenger_Group_Size'] > 1]
idx6 = (
    multi_member_rows.groupby(['Passenger_Group', 'Last_Name'])
    .size()
    .unstack(fill_value=0)
    .astype(int)
)
idx6.head()

In [None]:
# Countplot of the number of distinct last names within each passenger group.
# Pass the values as `x=` explicitly: the first positional argument of
# countplot is `data` in newer seaborn releases.
plt.figure(figsize=(10,4))
sns.countplot(x=(idx6>0).sum(axis=1),palette = "cool_r")
plt.title('Number of unique Last Name by Passenger Group')

The majority (83%) of groups contain only 1 family. So let's fill missing Last_Name according to the majority Last_Name in that Passenger Group.

In [None]:
# Missing values before
before_Last_Name_miss=df_data['Last_Name'].isnull().sum()

# Passengers with missing Last Name and in a Passenger Group with known majority Last Name
name_missing = df_data['Last_Name'].isna()
idx7 = df_data.index[name_missing & df_data['Passenger_Group'].isin(idx6.index)]

# Majority last name of each group, computed once
group_last_name = idx6.idxmax(axis=1)

# Fill corresponding missing values. idx7 holds index *labels*, so select with
# .loc; .iloc only gave the same rows while the index was a fresh RangeIndex.
df_data.loc[idx7,'Last_Name']=df_data.loc[idx7,'Passenger_Group'].map(group_last_name)

# Print number of missing values left
print('Number of Last_Name missing values before:',before_Last_Name_miss)
print('Number of Last_Name missing values after:',df_data['Last_Name'].isnull().sum())

That is the best we can do. We don't have to get rid of all of these missing values because we will end up dropping the Last_Name feature anyway. However, we can update the family size feature.

In [None]:
# Missing entries per column at this point of the imputation
df_data.isna().sum()

In [None]:
# Temporarily replace missing last names with a placeholder so they can be counted.
# Assign back: a chained in-place fillna does not update df_data under pandas Copy-on-Write.
df_data['Last_Name'] = df_data['Last_Name'].fillna('Unknown')

# Update family size feature: count each last name once and map per row
# (value_counts() was previously recomputed for every row)
last_name_counts = df_data['Last_Name'].value_counts()
df_data['Family_Size'] = df_data['Last_Name'].map(last_name_counts)

# Put NaN's back in place of the placeholder
df_data.loc[df_data['Last_Name']=='Unknown','Last_Name']=np.nan

# Say unknown Last Name means no family (a size above 100 is taken to be the placeholder bucket)
df_data.loc[df_data['Family_Size']>100,'Family_Size']=0

**Cabin_side and Passenger_Group:**

In [None]:
# Joint distribution of Passenger_Group with each cabin feature (deck, number, side),
# restricted to groups with more than one member
multi_member_rows = df_data[df_data['Passenger_Group_Size'] > 1]
j1, j2, j3 = (
    multi_member_rows.groupby(['Passenger_Group', cabin_col])[cabin_col]
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
    for cabin_col in ('Cabin_deck', 'Cabin_number', 'Cabin_side')
)

In [None]:
# Countplots: number of distinct cabin decks / numbers / sides within each passenger group.
# Values are passed as `x=` explicitly: the first positional argument of
# countplot is `data` in newer seaborn releases.
fig=plt.figure(figsize=(16,4))
plt.subplot(1,3,1)
sns.countplot(x=(j1>0).sum(axis=1),palette = "cool_r")
plt.title('Number Of Unique cabin decks per Passenger group')

plt.subplot(1,3,2)
sns.countplot(x=(j2>0).sum(axis=1),palette = "cool_r")
plt.title('Number Of Unique cabin numbers per Passenger group')

plt.subplot(1,3,3)
sns.countplot(x=(j3>0).sum(axis=1),palette = "cool_r")
plt.title('Number Of Unique cabin sides per Passenger group')
fig.tight_layout()

In [None]:
# Missing values before
before_Cabin_side_miss=df_data['Cabin_side'].isnull().sum()

# Passengers with missing Cabin side and in a Passenger Group with known majority Cabin side
side_missing = df_data['Cabin_side'].isna()
idx8 = df_data.index[side_missing & df_data['Passenger_Group'].isin(j3.index)]

# Majority cabin side of each group, computed once
group_side = j3.idxmax(axis=1)

# Fill corresponding missing values. idx8 holds index *labels*, so select with
# .loc; .iloc only gave the same rows while the index was a fresh RangeIndex.
df_data.loc[idx8,'Cabin_side']=df_data.loc[idx8,'Passenger_Group'].map(group_side)

# Print number of missing values left
print('Number of Cabin_side missing values before:',before_Cabin_side_miss)
print('Number of Cabin_side missing values after:',df_data['Cabin_side'].isnull().sum())

**Cabin_side and Last_Name:**

In [None]:
# Joint distribution of Last_Name and Cabin_side, restricted to passengers in
# groups with more than one member: rows are last names, columns are sides
multi_member_rows = df_data[df_data['Passenger_Group_Size'] > 1]
idx9 = (
    multi_member_rows.groupby(['Last_Name', 'Cabin_side'])['Cabin_side']
    .size()
    .unstack(fill_value=0)
    .astype(int)
)
idx9.head()

In [None]:
# Countplot of the number of distinct cabin sides seen for each last name.
# Pass the values as `x=` explicitly: the first positional argument of
# countplot is `data` in newer seaborn releases.
plt.figure(figsize=(10,4))
sns.countplot(x=(idx9>0).sum(axis=1),palette = "cool_r")
# Title corrected: this plot counts cabin sides per last name (the previous
# title was copied from the last-name-per-group plot).
plt.title('Number of unique Cabin sides per Last Name')

The majority (73%) of Last Name contain only 1 Cabin Side. So let's fill missing Cabin Side according to the majority Cabin Side in that Last Name.

In [None]:
# Missing values before
before_Cabin_side_miss=df_data['Cabin_side'].isnull().sum()

# Passengers with missing Cabin side and in a Last Name with known majority Cabin side
side_missing = df_data['Cabin_side'].isna()
idx10 = df_data.index[side_missing & df_data['Last_Name'].isin(idx9.index)]

# Majority cabin side of each last name, computed once
family_side = idx9.idxmax(axis=1)

# Fill corresponding missing values. idx10 holds index *labels*, so select with
# .loc; .iloc only gave the same rows while the index was a fresh RangeIndex.
df_data.loc[idx10,'Cabin_side']=df_data.loc[idx10,'Last_Name'].map(family_side)

# Drop Last_Name (no later imputation step in this notebook reads it)
df_data.drop('Last_Name', axis=1, inplace=True)

# Print number of missing values left
print('Number of Cabin_side missing values before:',before_Cabin_side_miss)
print('Number of Cabin_side missing values after:',df_data['Cabin_side'].isnull().sum())

The remaining missing values will be replaced with an outlier.

In [None]:
# Missing values before
before_Cabin_side_miss = df_data['Cabin_side'].isnull().sum()

# Whatever is still missing gets the placeholder side 'Z'
df_data['Cabin_side'] = df_data['Cabin_side'].fillna('Z')

# Print number of missing values left
print('Number of Cabin_side missing values before:', before_Cabin_side_miss)
print('Number of Cabin_side missing values after:', df_data['Cabin_side'].isnull().sum())

**Cabin_deck and Passenger_Group:**

In [None]:
# Missing values before
before_Cabin_deck_miss=df_data['Cabin_deck'].isnull().sum()

# Passengers with missing Cabin deck and in a Passenger Group with known majority Cabin deck
deck_missing = df_data['Cabin_deck'].isna()
idx11 = df_data.index[deck_missing & df_data['Passenger_Group'].isin(j1.index)]

# Majority cabin deck of each group, computed once
group_deck = j1.idxmax(axis=1)

# Fill corresponding missing values. idx11 holds index *labels*, so select with
# .loc; .iloc only gave the same rows while the index was a fresh RangeIndex.
df_data.loc[idx11,'Cabin_deck']=df_data.loc[idx11,'Passenger_Group'].map(group_deck)

# Print number of missing values left
print('Number of Cabin_deck missing values before:',before_Cabin_deck_miss)
print('Number of Cabin_deck missing values after:',df_data['Cabin_deck'].isnull().sum())

**Cabin_deck and HomePlanet:**

In [None]:
# Passenger counts per (HomePlanet, Destination, Alone) combination, one column per deck
deck_group_keys = ['HomePlanet', 'Destination', 'Alone', 'Cabin_deck']
df_data.groupby(deck_group_keys)['Cabin_deck'].size().unstack(fill_value=0).astype(int)

*As we can see:*
* Passengers from Mars are most likely in deck F.
* Passengers from Europa are most likely in deck C if travelling Alone and deck B otherwise.
* Passengers from Earth are most likely in deck G.

In [None]:
# Missing values before
before_Cabin_deck_miss = df_data['Cabin_deck'].isnull().sum()

# Fill each missing deck with the most frequent deck among passengers sharing
# the same HomePlanet, Destination and Alone values
deck_missing = df_data['Cabin_deck'].isna()
id_index = df_data.index[deck_missing]
deck_with_mode = df_data.groupby(['HomePlanet', 'Destination', 'Alone'])['Cabin_deck'].transform(
    lambda x: x.fillna(pd.Series.mode(x)[0])
)
df_data.loc[deck_missing, 'Cabin_deck'] = deck_with_mode[id_index]

# Print number of missing values left
print('Number of Cabin_deck missing values before:', before_Cabin_deck_miss)
print('Number of Cabin_deck missing values after:', df_data['Cabin_deck'].isnull().sum())

**CabinNumber and CabinDeck**

The Cabin_number and Passenger_Group features share a linear relationship on a deck-by-deck basis. We can therefore extrapolate the missing cabin numbers using linear regression on a deck-by-deck basis to get an approximate cabin number.

In [None]:
# Missing values before
before_Cabin_number_miss=df_data['Cabin_number'].isna().sum()

# Extrapolate linear relationship on a deck by deck basis.
# Deck 'T' is not in this list, so missing cabin numbers on that deck are left as NaN.
for deck in ['A', 'B', 'C', 'D', 'E', 'F', 'G']:
    on_deck = df_data['Cabin_deck']==deck
    known_rows = on_deck & df_data['Cabin_number'].notna()
    missing_rows = on_deck & df_data['Cabin_number'].isna()

    # Skip decks with nothing to predict or nothing to learn from: scikit-learn
    # raises on zero-sample input, which also made this cell crash when re-run
    # after the gaps had already been filled.
    if not missing_rows.any() or not known_rows.any():
        continue

    # Features and labels
    X_CN=df_data.loc[known_rows,'Passenger_Group']
    y_CN=df_data.loc[known_rows,'Cabin_number']
    X_test_CN=df_data.loc[missing_rows,'Passenger_Group']

    # Linear regression of cabin number on passenger group number
    model_CN=LinearRegression()
    model_CN.fit(X_CN.values.reshape(-1, 1), y_CN)
    preds_CN=model_CN.predict(X_test_CN.values.reshape(-1, 1))

    # Fill missing values with predictions (truncated to whole numbers)
    df_data.loc[missing_rows,'Cabin_number']=preds_CN.astype(int)

# Print number of missing values left
print('Number of Cabin_number missing values before:',before_Cabin_number_miss)
print('Number of Cabin_number missing values after:',df_data['Cabin_number'].isna().sum())

Update Cabin_Parts:

In [None]:
# Recompute the seven cabin-block indicators from the filled Cabin_number:
# Part1 is < 300, Parts 2-6 are consecutive blocks of 300, Part7 is >= 1800
cabin_edges = [-np.inf, 300, 600, 900, 1200, 1500, 1800, np.inf]
for part_no, (lower, upper) in enumerate(zip(cabin_edges[:-1], cabin_edges[1:]), start=1):
    in_part = (df_data['Cabin_number'] >= lower) & (df_data['Cabin_number'] < upper)
    df_data['Cabin_Part' + str(part_no)] = in_part.astype(int)

**VIP**

For missing values of VIP, we simply use the mode.

In [None]:
# Missing values before
before_VIP_miss = df_data['VIP'].isnull().sum()

# Treat every passenger with an unknown VIP flag as non-VIP
vip_missing = df_data['VIP'].isnull()
df_data.loc[vip_missing, 'VIP'] = False

# Print number of missing values left
print('Number of VIP missing values before:', before_VIP_miss)
print('Number of VIP missing values after:', df_data['VIP'].isnull().sum())

**Age**

In [None]:
# Median age per (HomePlanet, No_Cost, Alone) combination, one column per deck
# (combinations with no data are shown as 0)
age_group_keys = ['HomePlanet', 'No_Cost', 'Alone', 'Cabin_deck']
median_age = df_data.groupby(age_group_keys)['Age'].median()
median_age.unstack().fillna(0).astype(int)

In [None]:
# Missing values before
before_Age_miss = df_data['Age'].isnull().sum()

# Fill each missing age with the median age of passengers sharing the same
# HomePlanet, No_Cost, Alone and Cabin_deck values
age_missing = df_data['Age'].isna()
id_index = df_data.index[age_missing]
age_with_median = df_data.groupby(['HomePlanet', 'No_Cost', 'Alone', 'Cabin_deck'])['Age'].transform(
    lambda x: x.fillna(x.median())
)
df_data.loc[age_missing, 'Age'] = age_with_median[id_index]

# Print number of missing values left
print('Number of Age missing values before:', before_Age_miss)
print('Number of Age missing values after:', df_data['Age'].isnull().sum())

Update Age_group:

In [None]:
def _label_age(age):
    """Map an age to its band label; a missing (NaN) age maps to None."""
    # Upper bounds are exclusive and checked in ascending order
    age_bands = ((12, '-12_Age'), (18, '12-18_Age'), (25, '18-25_Age'),
                 (35, '25-35_Age'), (50, '35-50_Age'))
    for upper_bound, label in age_bands:
        if age < upper_bound:
            return label
    # NaN fails this comparison as well, so it ends up as None
    return '+50_Age' if age >= 50 else None

# Recompute Age_group now that Age has been filled
df_data['Age_group'] = df_data['Age'].apply(_label_age)

In [None]:
# Missing entries per column after the Age imputation
df_data.isna().sum()

**CryoSleep**

In [None]:
# Passenger counts by No_Cost (rows) and CryoSleep (columns)
cryo_counts = df_data.groupby(['No_Cost', 'CryoSleep'])['CryoSleep'].size()
cryo_counts.unstack(fill_value=0).astype(int)

In [None]:
# Missing values before
before_CryoSleep_miss = df_data['CryoSleep'].isnull().sum()

# Fill each missing CryoSleep flag with the most frequent value among
# passengers with the same No_Cost indicator
cryo_missing = df_data['CryoSleep'].isna()
id_index = df_data.index[cryo_missing]
cryo_with_mode = df_data.groupby(['No_Cost'])['CryoSleep'].transform(
    lambda x: x.fillna(pd.Series.mode(x)[0])
)
df_data.loc[cryo_missing, 'CryoSleep'] = cryo_with_mode[id_index]

# Print number of missing values left
print('Number of CryoSleep missing values before:', before_CryoSleep_miss)
print('Number of CryoSleep missing values after:', df_data['CryoSleep'].isnull().sum())

**Cost features and CryoSleep**

We don't expect people in CryoSleep to be able to spend anything.

In [None]:
# Largest total spend recorded for any passenger in cryosleep
in_cryosleep = df_data['CryoSleep'] == True
cryo_spend = df_data.loc[in_cryosleep, cost_features].sum(axis=1)
cryo_spend.max()

In [None]:
# Missing values before
before_cost_features_miss = df_data[cost_features].isnull().sum().sum()

# Passengers in cryosleep get 0 for every missing amenity spend
in_cryosleep = df_data['CryoSleep'] == True
for col in cost_features:
    df_data.loc[in_cryosleep & df_data[col].isnull(), col] = 0

# Print number of missing values left
print('Number of cost_features missing values before:', before_cost_features_miss)
print('Number of cost_features missing values after:', df_data[cost_features].isnull().sum().sum())

**Cost and Others**

In [None]:
# Mean total spend per (HomePlanet, Alone) combination, one column per age group
# (combinations with no data are shown as 0)
mean_cost = df_data.groupby(['HomePlanet', 'Alone', 'Age_group'])['Cost'].mean()
mean_cost.unstack().fillna(0)

In [None]:
# Missing values before
before_cost_features_miss = df_data[cost_features].isnull().sum().sum()

# Fill each remaining missing spend with the mean of that amenity among
# passengers sharing the same HomePlanet, Alone and Age_group values
spend_group_keys = ['HomePlanet', 'Alone', 'Age_group']
for col in cost_features:
    spend_missing = df_data[col].isna()
    id_index = df_data.index[spend_missing]
    spend_with_mean = df_data.groupby(spend_group_keys)[col].transform(lambda x: x.fillna(x.mean()))
    df_data.loc[spend_missing, col] = spend_with_mean[id_index]

# Print number of missing values left
print('Number of cost_features missing values before:', before_cost_features_miss)
print('Number of cost_features missing values after:', df_data[cost_features].isnull().sum().sum())

Update Cost and No_Cost:

In [None]:
# Recompute the total spend and the zero-spend indicator from the filled amenity columns
total_spend = df_data[cost_features].sum(axis=1)
df_data['Cost'] = total_spend
df_data['No_Cost'] = df_data['Cost'].eq(0).astype(int)

In [None]:
# Print the number of missing values left in every column except the target
li = [column for column in df_data.columns if column != 'Transported']
remaining_nulls = df_data[li].isnull().sum()
for col in li:
    print(col, remaining_nulls[col])

No missing values left!

<a id="1"></a> <br>
# 6. Feature Transformation

**Drop unwanted features**

In [None]:
# Remove identifier and helper columns that are not used as model inputs
unwanted_columns = ['PassengerId', 'Passenger_Group', 'Passenger_Group_Size', 'Age_group', 'Cabin_number']
df_data = df_data.drop(columns=unwanted_columns)

**To bin the continuous Age feature, we use 10 quantile-based bins:**

In [None]:
# Replace Age with its decile bin, labelled '1' (lowest) to '10' (highest)
names = [str(decile) for decile in range(1, 11)]
df_data['Age'] = pd.qcut(df_data['Age'], q=10, labels=names)

Using LabelEncoder, we are going to convert non-numerical features to numerical type. LabelEncoder basically labels the classes from 0 to n. This process is necessary for models to learn from those features.

In [None]:
# Label-encode every object-dtype column: LabelEncoder numbers the classes 0..n-1
# in sorted order.
n_col= [col for col in df_data.columns if df_data[col].dtypes=='object']
for feature in n_col:
    df_data[feature] = LabelEncoder().fit_transform(df_data[feature])

# Age holds *ordered* decile bins labelled '1'..'10'. LabelEncoder sorts those
# string labels lexicographically ('1', '10', '2', ...), which would give the
# highest decile code 1 and break the ordering. The categorical codes follow
# the bin order instead (0 = lowest decile, 9 = highest).
df_data['Age'] = df_data['Age'].cat.codes.astype(int)
n_col.append('Age')  # keep n_col listing every encoded column, as before

In [None]:
# Put NaNs back in (for test data): rows whose Transported value is 2 become NaN again.
# Series.mask builds a new column that can hold NaN, instead of writing NaN into
# the existing column in place through .loc. If the column is integer-typed, that
# in-place write is an implicit upcast which recent pandas versions deprecate.
df_data['Transported'] = df_data['Transported'].mask(df_data['Transported'] == 2)

Finally, we one-hot encode the non-ordinal features.

In [None]:
# One-hot encode the non-ordinal features and append the indicator columns.
# The dummy frames are built from df_data itself, so they share its index and can
# be concatenated column-wise in a single step. An index-on-index merge would
# instead multiply rows whenever an index label is repeated.
# The source columns are kept here; they are dropped in the next cell.
cat_features = ['HomePlanet', 'Destination']
one_hot_frames = [pd.get_dummies(df_data[feature], prefix=feature) for feature in cat_features]
df_data = pd.concat([df_data] + one_hot_frames, axis=1)

In [None]:
# The one-hot indicator columns replace HomePlanet and Destination, so drop the originals
encoded_sources = ['HomePlanet', 'Destination']
df_data = df_data.drop(encoded_sources, axis=1)

<a id="1"></a> <br>
# 7. Modeling

For the modeling part we will compare 7 known algorithms, and proceed to evaluate their average accuracy by a stratified kfold cross validation procedure:

1: SVC<br>
2: Decision Tree<br>
3: Random Forest<br>
4: Extra Trees<br>
5: Gradient Boosting<br>
6: Multi-layer perceptron (neural network)<br>
7: KNN<br>

In [None]:
# Extract train and test data from df_data: rows with a known target form the
# training set, rows whose target is NaN form the test set (target column removed).
has_target = df_data['Transported'].notna()
df_train = df_data[has_target]
df_test = df_data[~has_target].drop(columns='Transported')

In [None]:
# Feature matrix (everything except the target) and target vector
target_column = 'Transported'
y = df_train[target_column]
X = df_train.drop(columns=target_column)

In [None]:
# Cross-validation splitter and seed shared by every candidate model
kfold = StratifiedKFold(n_splits=10)
random_state = 42

# Candidate classifiers, in the same order as the names used in the results table
classifiers = [
    SVC(random_state=random_state),
    DecisionTreeClassifier(max_depth=4, random_state=random_state),
    RandomForestClassifier(n_estimators=5, random_state=random_state),
    ExtraTreesClassifier(random_state=random_state),
    GradientBoostingClassifier(random_state=random_state),
    MLPClassifier(random_state=random_state),
    KNeighborsClassifier(),
]

In [None]:
# One array of per-fold accuracies for each classifier, in classifier order
cv_results = [
    cross_val_score(estimator, X, y, scoring="accuracy", cv=kfold, n_jobs=4)
    for estimator in classifiers
]

In [None]:
# Mean accuracy across the folds of each classifier
cv_means = [fold_scores.mean() for fold_scores in cv_results]

In [None]:
# Create a DataFrame of algorithm names and their mean cross-validation accuracy,
# sorted with the best-scoring algorithm first
algorithm_names = [
    "SVC",
    "DecisionTree",
    "RandomForest",
    "ExtraTrees",
    "GradientBoosting",
    "MultipleLayerPerceptron",
    "KNeighboors",
]
cv_res = (
    pd.DataFrame({"Algorithm": algorithm_names, "CrossValMeans": cv_means})
    .sort_values(by='CrossValMeans', ascending=False)
)

In [None]:
# Display the cross-validation summary table
cv_res

In [None]:
# Split the labelled data: 80% for fitting, 20% held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One single-step pipeline per classifier
pipeline_SVC = Pipeline(steps=[('SVC_classifier', SVC(random_state=random_state))])
pipeline_DTC = Pipeline(steps=[('DTC_classifier', DecisionTreeClassifier(max_depth=4, random_state=random_state))])
pipeline_RF = Pipeline(steps=[('RF_classifier', RandomForestClassifier(n_estimators=5, random_state=random_state))])
pipeline_ET = Pipeline(steps=[('ET_classifier', ExtraTreesClassifier(random_state=random_state))])
pipeline_GB = Pipeline(steps=[('GB_classifier', GradientBoostingClassifier(random_state=random_state))])
pipeline_MLP = Pipeline(steps=[('MLP_classifier', MLPClassifier(max_iter=10000, random_state=random_state))])
pipeline_KNN = Pipeline(steps=[('KNN_classifier', KNeighborsClassifier())])

# Pipelines and their display names share the same positional order
pipelines = [
    pipeline_SVC,
    pipeline_DTC,
    pipeline_RF,
    pipeline_ET,
    pipeline_GB,
    pipeline_MLP,
    pipeline_KNN,
]
pipe_dict = dict(enumerate([
    'SVC',
    'DecisionTree',
    'RandomForest',
    'ExtraTrees',
    'GradientBoosting',
    'MLP',
    'KNeighbors',
]))

In [None]:
# Fit every pipeline on the training split
for current_pipeline in pipelines:
    current_pipeline.fit(X_train, y_train)

In [None]:
# Accuracy of every fitted pipeline on the held-out split
for position, fitted_pipeline in enumerate(pipelines):
    label = pipe_dict[position]
    accuracy = fitted_pipeline.score(X_test, y_test)
    print(f"{label} Test Accuracy : {accuracy}\n")

In [None]:
# Mean absolute error of every fitted pipeline on the held-out split
# (for 0/1 class labels this equals the share of misclassified rows)
for position, fitted_pipeline in enumerate(pipelines):
    label = pipe_dict[position]
    y_pred = fitted_pipeline.predict(X_test)
    error = mean_absolute_error(y_test, y_pred)
    print(f"{label} : mean_absolute_error : {error}\n")

In [None]:
# Confusion matrix of the SVC pipeline on the held-out split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement — confirm the sklearn version in use.
plot_confusion_matrix(pipeline_SVC,X_test,y_test)

In [None]:
# Predict the unlabelled test rows with the SVC pipeline.
# NOTE(review): this pipeline was fitted on X_train only (80% of the labelled rows), not on all of X.
y_pred_test_SVC = pipeline_SVC.predict(df_test)

In [None]:
# Display the predicted classes for the test rows
y_pred_test_SVC

In [None]:
# Load the sample submission; its PassengerId column is reused to build the output file
df_submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

In [None]:
# Build the submission frame: PassengerId from the sample submission, and the
# 0 / 1 class predictions converted to False / True.
# The conversion is done with astype(bool) on the prediction array instead of
# chained `output.Transported.replace(..., inplace=True)` calls: those operate on
# a Series obtained by attribute access, which under pandas Copy-on-Write does
# not update `output` and would leave 0/1 values in the file.
# NOTE(review): rows are paired by position — assumes df_test keeps the sample
# submission's row order; confirm.
output = pd.DataFrame({
    'PassengerId': df_submission.PassengerId,
    'Transported': np.asarray(y_pred_test_SVC).astype(bool),
})

In [None]:
# Write the submission to a CSV file in the current working directory, without the index column
output.to_csv("spaceship-titanic_prediction.csv", index = False)