In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file available under the Kaggle input directory.
for folder, _subfolders, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)


In [None]:
def split_cabin(passengers: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``passengers`` with Cabin split into Deck/Num/Side columns.

    ``Cabin`` is split on '/' into three parts stored as ``Cabin.Deck``,
    ``Cabin.Num`` and ``Cabin.Side`` (Side = P for Port, or S for Starboard).
    Rows with a missing ``Cabin`` get NaN in all three new columns.
    """
    # reindex guarantees columns 0..2 exist even if the split yields fewer parts.
    parts = (
        passengers['Cabin']
        .str.split('/', expand=True)
        .reindex(columns=range(3))
    )
    # assign() returns a new frame, so re-running this cell is idempotent.
    return passengers.assign(**{
        'Cabin.Deck': parts[0],
        'Cabin.Num': parts[1],
        'Cabin.Side': parts[2],
    })


# Apply the same Cabin split to both train and test so their columns stay in sync.
df_train = split_cabin(pd.read_csv('/kaggle/input/spaceship-titanic/train.csv'))
df_test = split_cabin(pd.read_csv('/kaggle/input/spaceship-titanic/test.csv'))

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
df_train.info()

## How many nulls are there in each column?

In [None]:
# Number of missing values in each column of the training frame.
df_train.isna().sum()

## What is the proportion of null values for each column?

In [None]:
# Missing values per column.
nulls = df_train.isna().sum()
# DataFrame.count() skips NaN, so dividing by it would give the ratio of nulls to
# NON-null values. The proportion of nulls needs the total row count instead.
totals = pd.Series(len(df_train), index=df_train.columns)

# Fraction of rows that are missing, per column (0.0 = complete, 1.0 = all missing).
nulls / totals

## Splitting out a few columns

In [None]:
# Column groups used by the plots and tables below.
numerical = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall','Spa', 'VRDeck', 'Cabin.Num']
categorical = ['HomePlanet', 'CryoSleep', 'Destination', 'Cabin.Deck', 'Cabin.Side', 'Transported']

# Drop the identifier and the raw Cabin string (already split into parts),
# then cast every numerical column to float in one step.
XTrain = (
    df_train
    .drop(columns=['PassengerId', 'Cabin'], errors='ignore')
    .astype(dict.fromkeys(numerical, float))
)

In [None]:
# How many passengers fall into each Transported class.
sns.countplot(data=XTrain, x='Transported')

About half of the ship was transported.

In [None]:
# One histogram per numerical column.
XTrain.hist(column=numerical)

In [None]:
# Switch to a white-grid theme with slightly larger fonts for this and later plots.
sns.set_theme(font_scale=1.2, style='whitegrid')
# Age distribution in 5-year bins.
age_values = XTrain['Age']
sns.histplot(x=age_values, binwidth=5)

Most numerical values (except for Age) are 0. The majority of those on the ship did not pay for additional services.

I'm curious about the transport rate of those who did pay for services. It may also be worth looking into VIP status alongside paid services.

In [None]:
# Boolean flags: did the passenger spend anything on each service?
# The comparison already yields a boolean Series, so no np.where is needed, and
# keeping it as a Series preserves XTrain's index so later crosstabs align by
# label rather than by position.
# Missing amounts compare as False, so NaN counts as "did not use".
had_service = XTrain['RoomService'].gt(0).rename('Had RoomService')
had_food = XTrain['FoodCourt'].gt(0).rename('Had FoodCourt')
had_shop = XTrain['ShoppingMall'].gt(0).rename('Had ShoppingMall')
had_spa = XTrain['Spa'].gt(0).rename('Had Spa')
had_vr = XTrain['VRDeck'].gt(0).rename('Had VRDeck')

# True only when every service amount is exactly 0; a row with any missing
# amount is therefore not counted as "No Services".
service_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
no_services = XTrain[service_columns].eq(0).all(axis=1).rename('No Services')

In [None]:
# Transported share within each RoomService-usage group (each row sums to 1).
pd.crosstab(index=had_service, columns=XTrain['Transported'], normalize='index')

In [None]:
# Transported share within each FoodCourt-usage group (each row sums to 1).
pd.crosstab(index=had_food, columns=XTrain['Transported'], normalize='index')

In [None]:
# Transported share within each ShoppingMall-usage group (each row sums to 1).
pd.crosstab(index=had_shop, columns=XTrain['Transported'], normalize='index')

In [None]:
# Transported share within each Spa-usage group (each row sums to 1).
pd.crosstab(index=had_spa, columns=XTrain['Transported'], normalize='index')

In [None]:
# Transported share within each VRDeck-usage group (each row sums to 1).
pd.crosstab(index=had_vr, columns=XTrain['Transported'], normalize='index')

In [None]:
# Transported share for passengers with zero spend on every service vs. everyone else.
pd.crosstab(index=no_services, columns=XTrain['Transported'], normalize='index')

There seems to be a strong association between not paying for additional services and being transported.

Those who did not pay for additional services were more likely to be transported. Of those who did not pay for any services, 78% were transported.

It seems that those who took advantage of the most luxurious amenities (room service, spa, VR) had the lowest chance of being transported.

In [None]:
# Transported share within each VIP group (each row sums to 1).
pd.crosstab(index=XTrain['VIP'], columns=XTrain['Transported'], normalize='index')

Those who had VIP status were somewhat less likely to be transported.

In [None]:
# Transported share within each CryoSleep group (each row sums to 1).
pd.crosstab(index=XTrain['CryoSleep'], columns=XTrain['Transported'], normalize='index')

Wow, out of those who were in cryo sleep, 82% of them were transported. Perhaps because they couldn't move to the part of the ship that was safe?

In [None]:
# Transported share within each HomePlanet (each row sums to 1).
pd.crosstab(index=XTrain['HomePlanet'], columns=XTrain['Transported'], normalize='index')

Seems like there might be a correlation between those from Europa and being transported? Might see if there's a correlation with other columns, such as CryoSleep, Cabin, etc.

In [None]:
# Transported share within each Destination (each row sums to 1).
pd.crosstab(index=XTrain['Destination'], columns=XTrain['Transported'], normalize='index')

There might also be a correlation with those whose destination was 55 Cancri e. Might be worth looking into as well.

In [None]:
# Raw passenger counts per deck, split by Transported.
pd.crosstab(index=XTrain['Cabin.Deck'], columns=XTrain['Transported'])

In [None]:
# Transported share within each deck (each row sums to 1).
pd.crosstab(index=XTrain['Cabin.Deck'], columns=XTrain['Transported'], normalize='index')

In [None]:
# Passengers per deck, listed in deck-letter order.
(
    XTrain['Cabin.Deck']
    .value_counts(sort=False)
    .sort_index()
)

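Decks whose passengers were more likely to be transported:
* B (73% transported, out of 779)
* C (68% transported, out of 747)

Decks whose passengers were less likely to be transported:
* E (64% not transported, out of 876)
* T (80% not transported, out of 5; too few passengers to be influential)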

Decks that didn't have strong correlation either way:
* A - 256 people
* D - 478 people
* F - 2794 people
* G - 2559 people

In [None]:
# Transported share within each side of the ship (each row sums to 1).
pd.crosstab(index=XTrain['Cabin.Side'], columns=XTrain['Transported'], normalize='index')

In [None]:
# 2x3 grid: one count plot per categorical column, for all passengers.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
fig.subplots_adjust(hspace=0.4)
panels = ax.flatten()
for position, column in enumerate(categorical[:len(panels)]):
    sns.countplot(x=XTrain[column], ax=panels[position])

Going back to some previous questions, I'm curious if there's any correlation that can be found when a passenger's HomePlanet = Europa or Destination = 55 Cancri e

In [None]:
# Restrict to passengers whose home planet is Europa.
from_europa = XTrain['HomePlanet'].eq('Europa')
passenger_europa = XTrain[from_europa]

# 2x3 grid: one count plot per categorical column, for the Europa subset.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
fig.subplots_adjust(hspace=0.4)
panels = ax.flatten()
for position, column in enumerate(categorical[:len(panels)]):
    sns.countplot(x=passenger_europa[column], ax=panels[position])

A lot of passengers whose home planet is Europa seem to have stayed on decks B and C. Those were the decks whose passengers were more likely to be transported.

In [None]:
# Restrict to passengers travelling to 55 Cancri e.
to_cancrie = XTrain['Destination'].eq('55 Cancri e')
passenger_cancrie = XTrain[to_cancrie]

# 2x3 grid: one count plot per categorical column, for the 55 Cancri e subset.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 10))
fig.subplots_adjust(hspace=0.4)
panels = ax.flatten()
for position, column in enumerate(categorical[:len(panels)]):
    sns.countplot(x=passenger_cancrie[column], ax=panels[position])

It seems those whose destination was "55 Cancri e" were in decks B and C (who were likely to get transported) and also in decks F and G, who had roughly 50% transported.

In [None]:
# Distribution of cabin numbers across all decks.
cabin_numbers = XTrain['Cabin.Num']
sns.histplot(x=cabin_numbers)

Seems a lot of people were in lower-numbered cabins. Could just be that all decks start at 0 and go up to the max number of rooms in that deck. Let's see how many rooms there were on each deck.

In [None]:
# count   = passengers on the deck with a known cabin number (NOT the number of rooms)
# nunique = distinct cabin numbers seen on the deck, i.e. occupied rooms
# max     = highest cabin number seen on the deck (numbering starts at 0)
XTrain.groupby('Cabin.Deck')['Cabin.Num'].agg(['count', 'nunique', 'max'])

In [None]:
# Cabin numbers on deck A, grouped into bins of 10.
bin_width = 10
cabin_A = XTrain.loc[XTrain['Cabin.Deck'] == 'A', 'Cabin.Num']

# Edges start at 0 and use left-closed intervals [0, 10), [10, 20), ... so that
# cabins 0 and 1 are kept; edges starting at 1 with right-closed intervals drop them.
# The upper edge is derived from the data so the highest cabin always lands in a bin.
bin_sizes = list(range(0, int(cabin_A.max()) + bin_width + 1, bin_width))
bins = pd.cut(cabin_A, bin_sizes, right=False)

# Pass the vector as x= explicitly, matching the other countplot calls in this notebook.
sns.countplot(x=bins)
plt.xticks(rotation=45);

In [None]:
# Cabin numbers across all decks, grouped into bins of 100.
bin_width = 100
# Edges start at 0 with left-closed intervals [0, 100), [100, 200), ... so that
# cabins 0 and 1 are kept; the upper edge is derived from the data so the
# highest cabin number always lands in a bin.
bin_sizes = list(range(0, int(XTrain['Cabin.Num'].max()) + bin_width + 1, bin_width))
bins_all_decks = pd.cut(XTrain['Cabin.Num'], bin_sizes, right=False)

# Raw passenger counts per cabin-number bin, split by Transported.
pd.crosstab(bins_all_decks, XTrain['Transported'])

In [None]:
# Transported share within each cabin-number bin (each row sums to 1).
pd.crosstab(index=bins_all_decks, columns=XTrain['Transported'], normalize='index')

There might be some correlation between cabin num and transported, but not very strong evidence.

## Comparing numerical against categorical variables

In [None]:
# For every numerical column, draw a 2x3 grid of box plots against each categorical column.
services = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for var2 in numerical:
    if var2 in services:
        # Keep only rows with a positive amount for this service.
        new_var = XTrain.loc[XTrain[var2] > 0, var2]
        print(new_var.shape)
    else:
        new_var = var2
    fig, ax = plt.subplots(2, 3, figsize=(15, 10))
    # Adjust spacing on the figure just created; calling plt.subplots_adjust
    # before any figure exists only affects a stray empty figure.
    fig.subplots_adjust(hspace=0.6)
    # Label the figure so each grid can be identified when skimming the output.
    fig.suptitle(var2)
    for var, subplot in zip(categorical, ax.flatten()):
        sns.boxplot(x=var, y=new_var, data=XTrain, ax=subplot)

## Comparing numerical variables

In [None]:
# One figure per numerical column (x axis), with a scatter panel against every
# numerical column (y axis) laid out on a 3x3 grid.
for x_column in numerical:
    fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(20, 10))
    fig.subplots_adjust(hspace=0.4)
    x_values = XTrain[x_column]
    panels = ax.flatten()
    for position, y_column in enumerate(numerical[:len(panels)]):
        sns.scatterplot(x=x_values, y=XTrain[y_column], ax=panels[position])