This notebook contains an exploratory data analysis (EDA) of the Spaceship Titanic dataset.


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
df_train, df_test = (
    pd.read_csv('../input/spaceship-titanic/train.csv'),
    pd.read_csv('../input/spaceship-titanic/test.csv'),
)

In [None]:
# Show the (rows, columns) shape of each split as a quick check on the load.
print(f'Train Set: {df_train.shape}')
print(f'Test Set: {df_test.shape}')

In [None]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

# Missing Values

- Generally, less than 3% of the values are missing in each column, for both the train and test sets

In [None]:
# Per-column null count for the training set, plus the same count as a
# percentage of the number of training rows.
(df_train
 .isna()
 .sum()
 .to_frame('missing_count')
 .assign(missing_pct=lambda tbl: tbl['missing_count'] / len(df_train) * 100))

In [None]:
# Per-column null count for the test set, plus the same count as a
# percentage of the number of test rows.
(df_test
 .isna()
 .sum()
 .to_frame('missing_count')
 .assign(missing_pct=lambda tbl: tbl['missing_count'] / len(df_test) * 100))

# Individual Level Features

## Transported (Target)

- This is a balanced dataset

In [None]:
# Bar chart of how many passengers fall into each value of the target column.
ax = df_train['Transported'].value_counts().plot.bar()
ax.set(xlabel='Transported', ylabel='No of Passengers')
ax.set_title('Target Distribution')

## HomePlanet

HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.

In [None]:
# Two panels for HomePlanet: passenger counts (left) and the mean of
# Transported per planet (right).
fig, ax = plt.subplots(1, 2, figsize = (16, 5))

# sort_index() orders the bars by label, matching the label-sorted groupby
# panel on the right; plain value_counts() would order them by frequency.
ax0 = df_train['HomePlanet'].value_counts().sort_index().plot(kind = 'bar', ax = ax[0])
ax0.set_xlabel('Home Planet')
ax0.set_ylabel('No of Passengers')
ax0.set_title('No of Passengers vs Home Planet')

# Bind the right-hand Axes to ax1 (as the other cells do) instead of
# overwriting `ax`, which holds the array of both Axes.
ax1 = df_train.groupby('HomePlanet').agg({'Transported':'mean'}).plot(kind = 'bar', ax = ax[1])
ax1.set_xlabel('Home Planet')
ax1.set_ylabel('Proportion of Passengers')
ax1.set_title('Proportion of Transported Passengers vs Home Planet')

## CryoSleep

CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

- ~ 1/3 of passengers are in Cryo Sleep
- Almost 80% of passengers in cryo sleep are transported compared to 30% of passengers not in cryo sleep
- This could be an important feature for predicting which passengers are transported

In [None]:
# Two panels for CryoSleep: passenger counts (left) and the mean of
# Transported per CryoSleep value (right).
fig, ax = plt.subplots(1, 2, figsize = (16, 5))

# sort_index() orders the bars by label so both panels share the same x-axis
# order; plain value_counts() would order them by frequency.
ax0 = df_train['CryoSleep'].value_counts().sort_index().plot(kind = 'bar', ax = ax[0])
ax0.set_xlabel('Cryo Sleep')
ax0.set_ylabel('No of Passengers')
ax0.set_title('No of Passengers vs Cryo Sleep')

ax1 = df_train.groupby('CryoSleep').agg({'Transported':'mean'}).plot(kind = 'bar', ax = ax[1])
ax1.set_xlabel('Cryo Sleep')
ax1.set_ylabel('Proportion of Passengers')
ax1.set_title('Proportion of Transported Passengers vs Cryo Sleep')

## Cabin

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

- Split the `Cabin` string into 3 parts: deck, num and side

In [None]:
# Split Cabin on '/' into three new columns; any part left null by the split
# (e.g. when Cabin itself is missing) is replaced with the label 'Missing'.
df_train[['Deck', 'Num', 'Side']] = (
    df_train['Cabin']
    .str.split(pat='/', expand=True)
    .fillna(value='Missing')
)

### Deck

In [None]:
# Two panels for Deck: passenger counts (left) and the mean of Transported
# per deck (right), both ordered by deck label.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))

ax0 = df_train['Deck'].value_counts().sort_index().plot.bar(ax=ax[0])
ax0.set(xlabel='Deck', ylabel='No of Passengers')
ax0.set_title('No of Passengers vs Deck')

ax1 = df_train.groupby('Deck')[['Transported']].mean().plot.bar(ax=ax[1])
ax1.set(xlabel='Deck', ylabel='Proportion of Passengers')
ax1.set_title('Proportion of Transported Passengers vs Deck')

### Side

In [None]:
# Two panels for Side: passenger counts (left) and the mean of Transported
# per side (right), both ordered by side label.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))

ax0 = df_train['Side'].value_counts().sort_index().plot.bar(ax=ax[0])
ax0.set(xlabel='Side', ylabel='No of Passengers')
ax0.set_title('No of Passengers vs Side')

ax1 = df_train.groupby('Side')[['Transported']].mean().plot.bar(ax=ax[1])
ax1.set(xlabel='Side', ylabel='Proportion of Passengers')
ax1.set_title('Proportion of Transported Passengers vs Side')

## Destination

Destination - The planet the passenger will be debarking to

In [None]:
# Two panels for Destination: passenger counts (left) and the mean of
# Transported per destination (right), both ordered by destination label.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))

ax0 = df_train['Destination'].value_counts().sort_index().plot.bar(ax=ax[0])
ax0.set(xlabel='Destination', ylabel='No of Passengers')
ax0.set_title('No of Passengers vs Destination')

ax1 = df_train.groupby('Destination')[['Transported']].mean().plot.bar(ax=ax[1])
ax1.set(xlabel='Destination', ylabel='Proportion of Passengers')
ax1.set_title('Proportion of Transported Passengers vs Destination')

## VIP

VIP - Whether the passenger has paid for special VIP service during the voyage.

In [None]:
# Two panels for VIP: passenger counts (left) and the mean of Transported
# per VIP value (right), both ordered by VIP label.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 5))

ax0 = df_train['VIP'].value_counts().sort_index().plot.bar(ax=ax[0])
ax0.set(xlabel='VIP', ylabel='No of Passengers')
ax0.set_title('No of Passengers vs VIP')

ax1 = df_train.groupby('VIP')[['Transported']].mean().plot.bar(ax=ax[1])
ax1.set(xlabel='VIP', ylabel='Proportion of Passengers')
ax1.set_title('Proportion of Transported Passengers vs VIP')

## Age

Age - The age of the passenger.
- Median age of 27 years old
- There are 178 passengers who are 0 years old
- Passengers in the 0-9 age group have a higher probability of being transported

In [None]:
# Histogram of passenger ages using one-unit-wide bins.
ax = sns.histplot(data=df_train, x='Age', binwidth=1)
ax.set(xlabel='Age', ylabel='No of Passengers')
ax.set_title('Passengers Age Distribution')

In [None]:
# Summary statistics for the Age column.
df_train.loc[:, 'Age'].describe()

In [None]:
# Bucket ages into decades and plot the percentage of each bucket that was
# transported.
agegroup_mapper = {0:'0-9', 1:'10-19', 2:'20-29', 3:'30-39', 4:'40-49', 5:'50-59', 6:'60-69', 7:'70-79', 8:'80-89'}

# floor(Age / 10) is the decade index, computed on the whole column at once.
# Ages that are missing, or whose decade is not a key of the mapper, become
# NaN and are left out of the groupby below.
df_train['AgeGroup'] = np.floor(df_train['Age'] / 10).map(agegroup_mapper)

# The mean of the Transported column per bucket is the transported fraction.
# Unlike a count pivot followed by True / (True + False), this stays defined
# for a bucket that contains only one of the two outcomes.
ax = (df_train
      .groupby('AgeGroup')['Transported']
      .mean()
      .mul(100)
      .rename('PctTransported')
      .reset_index()
      .plot(kind = 'bar', x = 'AgeGroup', y = 'PctTransported'))
ax.set_ylabel('% of Passengers')
ax.set_xlabel('Age Group')
ax.set_title('% of Passengers Transported by Age Group')

## Spending

- Find the total spend across all spending areas

In [None]:
# The five spending columns; nulls in them are replaced with 0 so they can
# be added into a single per-passenger total.
fill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df_train[fill_cols] = df_train[fill_cols].fillna(value=0)
df_train['TotalSpend'] = (
    df_train['RoomService']
    .add(df_train['FoodCourt'])
    .add(df_train['ShoppingMall'])
    .add(df_train['Spa'])
    .add(df_train['VRDeck'])
)

In [None]:
# Histogram of TotalSpend in bins of width 1000, coloured by Transported;
# the x-axis is limited to the 0-10000 range.
ax = sns.histplot(data=df_train, x='TotalSpend', hue='Transported', binwidth=1000)
ax.set(xlabel='Spending', ylabel='No of Passengers')
ax.set_title('Passengers Total Spending Distribution')
ax.set_xlim(0, 10000)

In [None]:
# Summary statistics of TotalSpend, computed separately for each value of Transported.
(df_train
 .groupby('Transported')
 ['TotalSpend']
 .describe())

# Group Level Features

PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

- Identify which group a passenger is from
- Find group level features

In [None]:
# Split PassengerId on '_' into its two parts: the first becomes
# PassengerGroup and the second becomes PassengerNo.
df_train[['PassengerGroup', 'PassengerNo']] = (
    df_train['PassengerId']
    .str.split(pat='_', expand=True)
)

In [None]:
# Count the distinct PassengerGroup values.
print(f'Number of Groups: {df_train["PassengerGroup"].nunique()}')

## Number of Passengers Per Group

Distribution of passengers across groups
- Most groups have only 1 passenger
- The largest group has 8 passengers

In [None]:
# The first value_counts() gives the size of every group; the second counts
# how many groups have each size.
ax = (df_train['PassengerGroup']
      .value_counts()
      .value_counts()
      .plot.bar())
ax.set(xlabel='No of Passengers', ylabel='No of Groups')
ax.set_title('Passenger Distribution Across Groups')

## Group Transported
- Does the entire group get Transported together?
- Higher chances that all members of a group get transported together if the group is smaller

In [None]:
# For each group size, count the groups in which every member was transported
# versus not, then compute the fraction of groups that were fully transported.
(
    df_train
    .groupby('PassengerGroup', as_index=False)
    # PassengerNo -> members in the group; Transported -> members with Transported == True.
    .agg({'PassengerNo': 'count',
          'Transported': lambda flags: (flags == True).sum()})
    .assign(AllTransported=lambda grp: grp['Transported'] == grp['PassengerNo'])
    .groupby(['PassengerNo', 'AllTransported'], as_index=False)
    .agg({'PassengerGroup': 'count'})
    .pivot(index='PassengerNo', columns='AllTransported', values='PassengerGroup')
    # Size/flag combinations with no groups are NaN after the pivot; treat them as zero groups.
    .fillna(0)
    .rename(columns={False: 'False', True: 'True'})
    .assign(PctAllTransported=lambda tbl: tbl['True'] / (tbl['True'] + tbl['False']))
)

Are passengers travelling in groups more likely to be transported?

In [None]:
# GroupSize = number of distinct PassengerIds sharing the passenger's group,
# broadcast back onto every row; then plot the mean of Transported per size.
df_train['GroupSize'] = (
    df_train
    .groupby('PassengerGroup')['PassengerId']
    .transform('nunique')
)
ax = (df_train
      .groupby('GroupSize', as_index=False)['Transported']
      .mean()
      .plot.bar(x='GroupSize', y='Transported'))
ax.set(xlabel='Group Size', ylabel='Probability of Transported')
ax.set_title('Probability of Transported by Group Size')

## Group Home Planet
- Does the entire group come from the same Home Planet?
- It appears that passengers within the same group always come from the same HomePlanet

In [None]:
# Count how many groups have 1, 2, ... distinct HomePlanet values among their
# members. Rows with a null HomePlanet are dropped before grouping.
(
    df_train
    .dropna(subset=['HomePlanet'])
    .groupby('PassengerGroup', as_index=False)
    .agg({'PassengerNo': 'count', 'HomePlanet': 'nunique'})
    # Regroup by the number of distinct planets and count the groups in each bucket.
    .groupby('HomePlanet', as_index=False)
    .agg({'PassengerNo': 'count'})
    .rename(columns=dict(HomePlanet='HomePlanetCount', PassengerNo='PassengerGroup'))
)

In [None]:
# Count how many groups have 1, 2, ... distinct Destination values among
# their members. Rows with a null Destination are dropped before grouping.
(
    df_train
    .dropna(subset=['Destination'])
    .groupby('PassengerGroup', as_index=False)
    .agg({'PassengerNo': 'count', 'Destination': 'nunique'})
    # Regroup by the number of distinct destinations and count the groups in each bucket.
    .groupby('Destination', as_index=False)
    .agg({'PassengerNo': 'count'})
    .rename(columns=dict(Destination='DestinationCount', PassengerNo='PassengerGroup'))
)

## Group Cryo Sleep

- Do all the members of a group cryo sleep together?
- Larger groups are less likely to have every member in cryo sleep

In [None]:
# For each group size, count the groups in which every member has
# CryoSleep == True versus not, then compute the fraction of all-cryo groups.
(
    df_train
    .groupby('PassengerGroup', as_index=False)
    # Only values equal to True are counted, so a null CryoSleep counts as
    # "not in cryo sleep" here.
    .agg({'PassengerNo': 'count',
          'CryoSleep': lambda flags: (flags == True).sum()})
    .assign(AllCryoSleep=lambda grp: grp['CryoSleep'] == grp['PassengerNo'])
    .groupby(['PassengerNo', 'AllCryoSleep'], as_index=False)
    .agg({'PassengerGroup': 'count'})
    .pivot(index='PassengerNo', columns='AllCryoSleep', values='PassengerGroup')
    # Size/flag combinations with no groups are NaN after the pivot; treat them as zero groups.
    .fillna(0)
    .rename(columns={False: 'False', True: 'True'})
    .assign(PctGroupCryoSleep=lambda tbl: tbl['True'] / (tbl['True'] + tbl['False']))
)

## Group VIP

In [None]:
# For each group size, count the groups in which every member has VIP == True
# versus not, then compute the fraction of all-VIP groups.
# The flag and ratio columns get their own names (AllVIP, PctGroupVIP),
# matching the Transported and CryoSleep tables, instead of reusing 'VIP'
# for three different things.
(df_train
 .groupby('PassengerGroup', as_index = False)
 # Only values equal to True are counted, so a null VIP counts as "not VIP" here.
 .agg({'PassengerNo':'count', 'VIP': lambda x: (x == True).sum()})
 .assign(AllVIP = lambda x: x['VIP'] == x['PassengerNo'])
 .groupby(['PassengerNo', 'AllVIP'], as_index = False)
 .agg({'PassengerGroup':'count'})
 .pivot(index = 'PassengerNo', columns = 'AllVIP', values = 'PassengerGroup')
 # Size/flag combinations with no groups are NaN after the pivot; treat them as zero groups.
 .fillna(0)
 .rename(columns = {False: 'False', True:'True'})
 .assign(PctGroupVIP = lambda x: x['True']/(x['True'] + x['False'])))