In [19]:
import pandas as pd  # For handling data
import numpy as np   # For numerical operations (if needed)
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For better visualizations

In [20]:
df = pd.read_csv("space_titanic.csv")  # Load the dataset

In [21]:
df.head()  # Show the first 5 rows

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [22]:
df.info()  # Overview of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [23]:
df.isnull().sum()  # Count missing values in each column

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [24]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [25]:
# Fill missing values
df["Age"].fillna(df["Age"].median(), inplace=True)
df["HomePlanet"].fillna(df["HomePlanet"].mode()[0], inplace=True)
df["CryoSleep"].fillna(False, inplace=True)
df["VIP"].fillna(False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["HomePlanet"].fillna(df["HomePlanet"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whic

In [26]:
# Drop rows where Transported is missing (as it's our target)
df.dropna(subset=["Transported"], inplace=True)
print(df.isnull().sum())

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination     182
Age               0
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [27]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [28]:
# 1. One-Hot Encoding for categorical columns
df = pd.get_dummies(df, columns=["HomePlanet", "Destination"], drop_first=True)

In [29]:
# 2. Extract meaningful info from 'Cabin' (if not missing)
df["Deck"] = df["Cabin"].apply(lambda x: str(x).split("/")[0] if pd.notna(x) else "Unknown")
df["CabinNum"] = df["Cabin"].apply(lambda x: str(x).split("/")[1] if (pd.notna(x) and len(str(x).split("/")) > 1 ) else "0").astype(int)
df.drop("Cabin", axis=1, inplace=True)  # Drop the original column

In [30]:
# 3. Normalize numerical columns
scaler = StandardScaler()
num_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "CabinNum"]
df[num_cols] = df[num_cols].fillna(0)
df[num_cols] = scaler.fit_transform(df[num_cols])

In [31]:
# Convert True/False columns to 1/0
df["CryoSleep"] = df["CryoSleep"].astype(int)
df["VIP"] = df["VIP"].astype(int)
df["Transported"] = df["Transported"].astype(int)  # Target column

In [32]:
# Check final dataset
print(df.head())

  PassengerId  CryoSleep       Age  VIP  RoomService  FoodCourt  ShoppingMall  \
0     0001_01          0  0.711945    0    -0.333105  -0.281027     -0.283579   
1     0002_01          0 -0.334037    0    -0.168073  -0.275387     -0.241771   
2     0003_01          0  2.036857    1    -0.268001   1.959998     -0.283579   
3     0003_02          0  0.293552    0    -0.333105   0.523010      0.336851   
4     0004_01          0 -0.891895    0     0.125652  -0.237159     -0.031059   

        Spa    VRDeck               Name  Transported  HomePlanet_Europa  \
0 -0.270626 -0.263003    Maham Ofracculy            0               True   
1  0.217158 -0.224205       Juanna Vines            1              False   
2  5.695623 -0.219796      Altark Susent            0               True   
3  2.687176 -0.092818       Solam Susent            0               True   
4  0.231374 -0.261240  Willy Santantines            1              False   

   HomePlanet_Mars  Destination_PSO J318.5-22  Destinati

In [33]:
# Step 4: Train a Machine Learning Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [42]:
# Define feature columns (excluding PassengerId, Name, and target variable)
X = df.drop(columns=['PassengerId', 'Name', 'Transported'])

# Define target variable
y = df['Transported']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# One-hot encode the Deck column
X_train = pd.get_dummies(X_train, columns=['Deck'])
X_test = pd.get_dummies(X_test, columns=['Deck'])

# Align train and test in case they end up with different columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Check data types in X_train
print(X_train.dtypes)

CryoSleep                      int64
Age                          float64
VIP                            int64
RoomService                  float64
FoodCourt                    float64
ShoppingMall                 float64
Spa                          float64
VRDeck                       float64
HomePlanet_Europa               bool
HomePlanet_Mars                 bool
Destination_PSO J318.5-22       bool
Destination_TRAPPIST-1e         bool
CabinNum                     float64
Deck_A                          bool
Deck_B                          bool
Deck_C                          bool
Deck_D                          bool
Deck_E                          bool
Deck_F                          bool
Deck_G                          bool
Deck_T                          bool
Deck_Unknown                    bool
dtype: object


In [45]:
# Initialize model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7751581368602645

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.80      0.78       861
           1       0.79      0.75      0.77       878

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739

