# Spaceship Titanic - Data Exploration

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
print('libraries imported')

#functions:
def print_missing(column):
    print(f'Missing values left in {column.name}: {column.isna().sum()}')
    

    
    


libraries imported


In [32]:
train = pd.read_csv('data/raw/train.csv')
print("Data Frame shape: ", train.shape)
print("Data Frame columns: ",train.columns)
print('\nData Frame example: ')
print(train.dtypes)
train.head(5)

Data Frame shape:  (8693, 14)
Data Frame columns:  Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

Data Frame example: 
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


## Lets handle the missing data

In [33]:
def is_data_missing(df):
    missing = df.isnull().sum()
    missing_perc = (missing / len(df) * 100).round(2)
    
    missing_df = pd.DataFrame({
        'Count': missing,
        'Percent': missing_perc
    })
    print(missing_df.sort_values('Count'))
is_data_missing(train)


              Count  Percent
PassengerId       0     0.00
Transported       0     0.00
Age             179     2.06
RoomService     181     2.08
Destination     182     2.09
FoodCourt       183     2.11
Spa             183     2.11
VRDeck          188     2.16
Cabin           199     2.29
Name            200     2.30
HomePlanet      201     2.31
VIP             203     2.34
ShoppingMall    208     2.39
CryoSleep       217     2.50


In [34]:
#CryoSleep: A passenger in cryosleep cannot spend money on amenities: RoomService, FoodCourt, ShoppingMall, Spa	VRDeck
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',	'VRDeck']
train['Total spending'] = train[amenities].sum(axis=1)
train.loc[(train['CryoSleep'].isna()) & (train['Total spending']==0),'CryoSleep'] = False
train.loc[(train['CryoSleep'].isna()) & (train['Total spending']>0),'CryoSleep'] = True
print_missing(train['CryoSleep'])

Missing values left in CryoSleep: 0


In [35]:
#Shopping Mall, VRDeck, Spa, FoodCourt, RoomService

def fill_amenities(df,columns):
    for c in columns:
        df.loc[(df['CryoSleep']==True) & (df[c].isna()),c] = 0
        median = df.loc[(df['CryoSleep']==False),c].median()
        df.loc[df[c].isna(),c] = median
        
#a passenger in cryosleep cant spend money, replace NaN with 0 
#for passenger not in cryosleep replace missing value with median spending in each amenity
fill_amenities(train,amenities)

print_missing(train['ShoppingMall'])

Missing values left in ShoppingMall: 0


In [36]:
#VIP
train['VIP'].value_counts()
#most passenger aren't VIP, so we'll replace missing values with False
train.loc[train['VIP'].isna(), 'VIP'] = False

print_missing(train['VIP'])


Missing values left in VIP: 0


In [37]:
#Home Planet
home_planet = train.groupby('HomePlanet')['HomePlanet'].count()
print(home_planet)
#most passenger came from Earth, so we'll replace missing values with 'Earth'
train['HomePlanet'] = train['HomePlanet'].fillna('Earth')
print_missing(train['HomePlanet'])

HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64
Missing values left in HomePlanet: 0


In [38]:
#Name
train['Name'] = train['Name'].fillna('Unknown')
#train['Name'].isna().sum()
print_missing(train['Name'])

Missing values left in Name: 0


In [39]:
#Cabin
#split Cabin into deck/num/side columnds
train[['Deck','CabinNum','Side']] = train['Cabin'].str.split('/',expand=True)

train['Deck'] = train['Deck'].fillna(train['Deck'].mode()[0])
train['CabinNum'] = train['CabinNum'].fillna(train['CabinNum'].mode()[0])
train['Side'] = train['Side'].fillna(train['Side'].mode()[0])

#drop Cabin columns as its no longer needed
train = train.drop('Cabin',axis=1)
#print(train.columns)

train[['Deck','CabinNum','Side']].head(5)
print_missing(train['Deck'])
print_missing(train['CabinNum'])
print_missing(train['Side'])



Missing values left in Deck: 0
Missing values left in CabinNum: 0
Missing values left in Side: 0


In [40]:
#Age
vips_age = train.groupby(['VIP','HomePlanet'])['Age'].agg(['median','count'])
vips_age.columns =['Median Age', 'Count']
print(vips_age)

train['Age'] = train.groupby(['VIP','HomePlanet'])['Age'].transform(
    lambda x: x.fillna(x.median())
)
train['Age'] = train['Age'].fillna(train['Age'].median())
print_missing(train['Age'])


                  Median Age  Count
VIP   HomePlanet                   
False Earth             24.0   4708
      Europa            32.0   1954
      Mars              28.0   1654
True  Earth             32.0      5
      Europa            35.0    131
      Mars              32.0     62
Missing values left in Age: 0


In [41]:
destinations = train.groupby('HomePlanet')['Destination'].value_counts()
print(destinations)

train['Destination'] = train['Destination'].fillna(train['Destination'].mode()[0])
print_missing(train['Destination'])

HomePlanet  Destination  
Earth       TRAPPIST-1e      3251
            PSO J318.5-22     728
            55 Cancri e       721
Europa      TRAPPIST-1e      1189
            55 Cancri e       886
            PSO J318.5-22      19
Mars        TRAPPIST-1e      1475
            55 Cancri e       193
            PSO J318.5-22      49
Name: count, dtype: int64
Missing values left in Destination: 0


In [42]:
is_data_missing(train)

                Count  Percent
PassengerId         0      0.0
Deck                0      0.0
Total spending      0      0.0
Transported         0      0.0
Name                0      0.0
VRDeck              0      0.0
Spa                 0      0.0
CabinNum            0      0.0
ShoppingMall        0      0.0
RoomService         0      0.0
VIP                 0      0.0
Age                 0      0.0
Destination         0      0.0
CryoSleep           0      0.0
HomePlanet          0      0.0
FoodCourt           0      0.0
Side                0      0.0


### Convert object features into ints

In [43]:
train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Total spending,Deck,CabinNum,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,0.0,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,736.0,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,10383.0,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,5176.0,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,1091.0,F,1,S


In [44]:
cols_to_convert = ['CryoSleep', 'VIP', 'Transported']
train[cols_to_convert] = train[cols_to_convert].astype(int)
train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Total spending,Deck,CabinNum,Side
0,0001_01,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,0.0,B,0,P
1,0002_01,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,736.0,F,0,S
2,0003_01,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,10383.0,A,0,S
3,0003_02,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,5176.0,A,0,S
4,0004_01,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,1091.0,F,1,S


## Feature engineering

In [45]:
#HomePlanet one-hot
h_planet_dummies = pd.get_dummies(train['HomePlanet'],prefix = 'From').astype(int)
train = pd.concat([train,h_planet_dummies],axis=1)

In [46]:
#Destination one-hot
destination_dummies = pd.get_dummies(train['Destination'],prefix = 'To').astype(int)
train = pd.concat([train,destination_dummies],axis=1)

In [47]:
#Deck one-hot
deck_dummies = pd.get_dummies(train['Deck'],prefix = 'Deck').astype(int)
train = pd.concat([train,deck_dummies],axis=1)

In [48]:
#Side one-hot
side_dummies = pd.get_dummies(train['Side'],prefix='Side').astype(int)
train = pd.concat([train,side_dummies],axis=1)
train.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,...,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,0001_01,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
1,0002_01,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,...,0,0,0,0,0,1,0,0,0,1
2,0003_01,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,...,1,0,0,0,0,0,0,0,0,1
3,0003_02,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,...,1,0,0,0,0,0,0,0,0,1
4,0004_01,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,...,0,0,0,0,0,1,0,0,0,1


## Who got transported?

In [49]:
train['Transported'].value_counts()

Transported
1    4378
0    4315
Name: count, dtype: int64

In [50]:
train.groupby('HomePlanet')['Transported'].count()

HomePlanet
Earth     4803
Europa    2131
Mars      1759
Name: Transported, dtype: int64

In [51]:
# Ile osób zostało transported
print(train['Transported'].value_counts())
print(f"\nTransported rate: {train['Transported'].mean()*100:.1f}%")

# Kto został transported - podstawowe statystyki
print("\n=== Age ===")
print(train.groupby('Transported')['Age'].describe())

print("\n=== HomePlanet ===")
print(pd.crosstab(train['HomePlanet'], train['Transported'], normalize='index')*100)

print("\n=== CryoSleep ===")
print(pd.crosstab(train['CryoSleep'], train['Transported'], normalize='index')*100)

print("\n=== VIP ===")
print(pd.crosstab(train['VIP'], train['Transported'], normalize='index')*100)

print("\n=== Destination ===")
print(pd.crosstab(train['Destination'], train['Transported'], normalize='index')*100)

print("\n=== Spending ===")
print(train.groupby('Transported')['Total spending'].describe())

Transported
1    4378
0    4315
Name: count, dtype: int64

Transported rate: 50.4%

=== Age ===
              count       mean        std  min   25%   50%   75%   max
Transported                                                           
0            4315.0  29.848899  13.574119  0.0  21.0  27.0  38.0  79.0
1            4378.0  27.748515  15.003359  0.0  18.0  26.0  37.0  78.0

=== HomePlanet ===
Transported          0          1
HomePlanet                       
Earth        57.235061  42.764939
Europa       34.115439  65.884561
Mars         47.697555  52.302445

=== CryoSleep ===
Transported          0          1
CryoSleep                        
0            66.407802  33.592198
1            20.215463  79.784537

=== VIP ===
Transported          0          1
VIP                              
0            49.352484  50.647516
1            61.809045  38.190955

=== Destination ===
Transported            0          1
Destination                        
55 Cancri e    39.000000  61.0000

# Prepare data for training

In [53]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


features = [
    'CryoSleep', 'VIP', 
    #HomePlanet
    'From_Europa', 'From_Mars', 'From_Earth',
    #Destination
    'To_55 Cancri e','To_PSO J318.5-22', 'To_TRAPPIST-1e',
    #Deck
    'Deck_A','Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T',
    #Side
    'Side_P', 'Side_S'
    
]
X = train[features]
Y = train['Transported']

#train/validation split
X_train, X_val, Y_train, Y_val = train_test_split(
    X,Y,
    test_size=0.2,
    random_state=42
)

#data scaling (Standard Scaler)
scaler = StandardScaler()
scaler.fit(X_train)

#update data after scaling
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Random Forrest model

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


rf_model = RandomForestClassifier(
    n_estimators=500,      # num of trees
    max_depth=10,          # max tree depth
    min_samples_split=5,   # min samples to split
    random_state=42
)

rf_model.fit(X_train,Y_train)

cv_scores = cross_val_score(rf_model, X_train, Y_train, cv=5, scoring='accuracy')
print(f"RF - CV mean: {cv_scores.mean():.2%}, standard deviation: {cv_scores.std():.2%}")


rf_predictions = rf_model.predict(X_val)

feature_importance = sorted(list(zip(features, rf_model.feature_importances_)),
                           key=lambda x: x[1], reverse=True)
for name, importance in feature_importance:
    print(f'{name:<15}: {importance*100:.3f}')

RF - CV mean: 71.58%, standard deviation: 1.02%
CryoSleep      : 61.323
From_Earth     : 5.279
From_Europa    : 3.993
Deck_F         : 3.776
Deck_E         : 3.553
Deck_G         : 2.591
Side_P         : 2.432
Side_S         : 2.351
To_TRAPPIST-1e : 2.263
Deck_C         : 2.065
From_Mars      : 2.018
Deck_B         : 1.995
To_55 Cancri e : 1.924
VIP            : 1.733
To_PSO J318.5-22: 1.069
Deck_D         : 0.982
Deck_A         : 0.495
Deck_T         : 0.160
