# 1. Data Extract

## 1.1. Imports

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw passenger table from the local data directory.
data = pd.read_csv(filepath_or_buffer='./data/spaceship_titanic.csv')

## 1.2. First look at the data

In [3]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Column dtypes and non-null counts, to see which columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [5]:
# Summary statistics; include='all' also describes the non-numeric columns.
data.describe(include='all')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
count,8693,8492,8476,8494,8511,8514.0,8490,8512.0,8510.0,8485.0,8510.0,8505.0,8493,8693
unique,8693,3,2,6560,3,,2,,,,,,8473,2
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,,False,,,,,,Gollux Reedall,True
freq,1,4602,5439,8,5915,,8291,,,,,,2,4378
mean,,,,,,28.82793,,224.687617,458.077203,173.729169,311.138778,304.854791,,
std,,,,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,,
min,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,
25%,,,,,,19.0,,0.0,0.0,0.0,0.0,0.0,,
50%,,,,,,27.0,,0.0,0.0,0.0,0.0,0.0,,
75%,,,,,,38.0,,47.0,76.0,27.0,59.0,46.0,,


## 1.3. Extracting Information

### 1.3.1. Extracting information from id

The PassengerId column contains a unique Id for each passenger. Each Id takes the form gggg_pp, where gggg indicates the group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

In [6]:
def explode_id(row: pd.Series) -> pd.Series:
    """Split ``PassengerId`` (``gggg_pp``) into group and member numbers.

    Parameters
    ----------
    row : pd.Series
        One passenger record holding a ``PassengerId`` string.

    Returns
    -------
    pd.Series
        A copy of ``row`` with two added integer entries: ``GroupId`` (the
        part before the underscore) and ``PassengerIdInGroup`` (the part
        after it). The input row is left untouched.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object handed to them.
    row = row.copy()
    id_parts = row['PassengerId'].split('_')
    row['GroupId'] = int(id_parts[0])
    row['PassengerIdInGroup'] = int(id_parts[1])
    return row

# Derive the group / in-group numbers for every passenger, row by row.
data = data.apply(explode_id, axis='columns')

id_columns = ['PassengerId', 'GroupId', 'PassengerIdInGroup']
data[id_columns].head()

Unnamed: 0,PassengerId,GroupId,PassengerIdInGroup
0,0001_01,1,1
1,0002_01,2,1
2,0003_01,3,1
3,0003_02,3,2
4,0004_01,4,1


Moreover, the number of members in each group may be important for predicting interdimensional transportation.

In [7]:
# Size of each travel group, broadcast to every member of that group.
# value_counts() tallies the rows per GroupId in a single pass instead of
# scanning the whole frame once per group.
group_sizes = data['GroupId'].value_counts()
# Cast to float so the column keeps the dtype it had with the loop-based fill.
data['MembersOfGroupById'] = data['GroupId'].map(group_sizes).astype(float)

data[['PassengerId', 'GroupId', 'PassengerIdInGroup', 'MembersOfGroupById']].head()

Unnamed: 0,PassengerId,GroupId,PassengerIdInGroup,MembersOfGroupById
0,0001_01,1,1,1.0
1,0002_01,2,1,1.0
2,0003_01,3,1,2.0
3,0003_02,3,2,2.0
4,0004_01,4,1,1.0


### 1.3.2. Extracting information from passengers' name

The Name column contains each passenger's surname, which can give us an idea of how many family members each passenger has on board.

In [8]:
def extract_surname(row: pd.Series) -> pd.Series:
    """Add a ``Surname`` entry taken from the passenger's ``Name``.

    Parameters
    ----------
    row : pd.Series
        One passenger record holding a ``Name`` value (string or missing).

    Returns
    -------
    pd.Series
        A copy of ``row`` with an added ``Surname`` entry: the second
        space-separated token of ``Name``, or ``'Unknown'`` when the name is
        missing or has no second token. The input row is left untouched.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object handed to them.
    row = row.copy()
    name = row['Name']
    name_parts = [] if pd.isna(name) else name.split(' ')
    # Guard the index so a single-token name does not raise IndexError.
    row['Surname'] = name_parts[1] if len(name_parts) > 1 else 'Unknown'
    return row
        
data = data.apply(extract_surname, axis=1)

# Number of passengers sharing each surname, broadcast to every row.
# value_counts() tallies all surnames in a single pass instead of scanning the
# whole frame once per surname.
surname_counts = data['Surname'].value_counts()
# Rows carrying the 'Unknown' placeholder get NaN rather than a family size.
has_surname = data['Surname'] != 'Unknown'
data['MembersOfFamilyBySurname'] = (
    data['Surname'].map(surname_counts).where(has_surname).astype(float)
)

data[['Name', 'Surname', 'MembersOfFamilyBySurname']].head()

Unnamed: 0,Name,Surname,MembersOfFamilyBySurname
0,Maham Ofracculy,Ofracculy,1.0
1,Juanna Vines,Vines,4.0
2,Altark Susent,Susent,6.0
3,Solam Susent,Susent,6.0
4,Willy Santantines,Santantines,6.0


### 1.3.3. Splitting the Cabin number

As stated in the dataset description, the Cabin column consists of three parts, deck/num/side, where side is either P for Port or S for Starboard.

In [9]:
def split_cabin(row: pd.Series) -> pd.Series:
    """Split ``Cabin`` (``deck/num/side``) into three separate entries.

    Parameters
    ----------
    row : pd.Series
        One passenger record holding a ``Cabin`` value (string or missing).

    Returns
    -------
    pd.Series
        A copy of ``row`` with added ``CabinDeck``, ``CabinNum`` and
        ``CabinSide`` entries. Each is the corresponding '/'-separated piece
        of ``Cabin`` as a string, or ``'Unknown'`` when ``Cabin`` is missing.
        The input row is left untouched.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object handed to them.
    row = row.copy()
    cabin = row['Cabin']
    # Split once up front rather than once per output field.
    cabin_parts = None if pd.isna(cabin) else cabin.split('/')
    for i, name in enumerate(['Deck', 'Num', 'Side']):
        row[f'Cabin{name}'] = 'Unknown' if cabin_parts is None else cabin_parts[i]
    return row

# Add the CabinDeck / CabinNum / CabinSide columns, row by row.
data = data.apply(split_cabin, axis='columns')

cabin_columns = ['Cabin', 'CabinDeck', 'CabinNum', 'CabinSide']
data[cabin_columns]

Unnamed: 0,Cabin,CabinDeck,CabinNum,CabinSide
0,B/0/P,B,0,P
1,F/0/S,F,0,S
2,A/0/S,A,0,S
3,A/0/S,A,0,S
4,F/1/S,F,1,S
...,...,...,...,...
8688,A/98/P,A,98,P
8689,G/1499/S,G,1499,S
8690,G/1500/S,G,1500,S
8691,E/608/S,E,608,S


In [10]:
# Distinct side values after the split, including the missing-cabin placeholder.
data.loc[:, 'CabinSide'].unique()

array(['P', 'S', 'Unknown'], dtype=object)

### 1.3.4. Binning the Age column

Age is split into five categories:
- Child [0, 12]
- Teenager (12, 18]
- Young (18, 30]
- Senior (30, 60]
- Elder (60, inf)

In [11]:
def bin_age(row: pd.Series) -> pd.Series:
    """Add an ``AgeBinned`` category derived from the numeric ``Age``.

    Categories (upper bounds inclusive): Child (<= 12), Teenager (<= 18),
    Young (<= 30), Senior (<= 60), Elder (> 60). A missing age is labelled
    ``'Unknown'``.

    Parameters
    ----------
    row : pd.Series
        One passenger record holding an ``Age`` value (number or missing).

    Returns
    -------
    pd.Series
        A copy of ``row`` with an added ``AgeBinned`` string entry. The input
        row is left untouched.
    """
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object handed to them.
    row = row.copy()
    age = row['Age']
    if pd.isna(age):
        # Every comparison against NaN is False, so without this guard a
        # missing age would fall through to the final 'Elder' branch.
        row['AgeBinned'] = 'Unknown'
    elif age <= 12:
        row['AgeBinned'] = 'Child'
    elif age <= 18:
        row['AgeBinned'] = 'Teenager'
    elif age <= 30:
        row['AgeBinned'] = 'Young'
    elif age <= 60:
        row['AgeBinned'] = 'Senior'
    else:
        row['AgeBinned'] = 'Elder'
    return row

# Attach the AgeBinned category to every passenger, row by row.
data = data.apply(bin_age, axis='columns')

## 1.4. Saving

In [12]:
# Persist the enriched table (without the index column) for later stages.
data.to_csv(path_or_buf='./data/spaceship_titanic_e.csv', index=False)