In [None]:
import pandas as pd

# Load both competition files from the working directory:
# `df` is bound to train.csv and `test` to test.csv.
df, test = map(pd.read_csv, ("train.csv", "test.csv"))

In [None]:
# Count missing values per column of the training data.
# The frame read from train.csv is bound to `df`; no variable named `train`
# exists, so the previous reference raised NameError on a fresh kernel.
print(df.isnull().sum())

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Share of each Destination within every HomePlanet of the `test` frame
# (rows sum to 1 because of normalize='index'), rounded to two decimals.
print(
    "HomePlanet vs Destination",
    pd.crosstab(
        index=test['HomePlanet'],
        columns=test['Destination'],
        normalize='index',
    ).round(2),
    sep="\n",
)

HomePlanet vs Destination
Destination  55 Cancri e  PSO J318.5-22  TRAPPIST-1e
HomePlanet                                          
Earth               0.13           0.16         0.71
Europa              0.43           0.01         0.56
Mars                0.11           0.02         0.87


In [None]:
# Share of CryoSleep False/True within every HomePlanet of the `test` frame
# (row-normalised), rounded to two decimals.
print(
    "HomePlanet vs CryoSleep",
    pd.crosstab(
        index=test['HomePlanet'],
        columns=test['CryoSleep'],
        normalize='index',
    ).round(2),
    sep="\n",
)

HomePlanet vs CryoSleep
CryoSleep   False  True 
HomePlanet              
Earth        0.69   0.31
Europa       0.53   0.47
Mars         0.60   0.40


In [None]:
# Share of VIP False/True within every HomePlanet of the `test` frame
# (row-normalised), rounded to two decimals.
print(
    "HomePlanet vs VIP",
    pd.crosstab(
        index=test['HomePlanet'],
        columns=test['VIP'],
        normalize='index',
    ).round(2),
    sep="\n",
)

HomePlanet vs VIP
VIP         False  True 
HomePlanet              
Earth        1.00   0.00
Europa       0.95   0.05
Mars         0.97   0.03


In [None]:

# Impute missing HomePlanet values in two passes.
#
# Pass 1: look up the most frequent HomePlanet among rows that share the
# passenger's Destination. Series.map + fillna does this column-wise instead of
# calling a Python lambda once per row via df.apply(axis=1); rows whose
# Destination is missing map to NaN and are left for pass 2.
most_common_planet_per_dest = df.groupby('Destination')['HomePlanet'].agg(lambda x: x.mode()[0])
df['HomePlanet'] = df['HomePlanet'].fillna(df['Destination'].map(most_common_planet_per_dest))

# Pass 2: anything still missing gets the overall most frequent HomePlanet.
df['HomePlanet'] = df['HomePlanet'].fillna(df['HomePlanet'].mode()[0])



In [None]:

# Impute missing CryoSleep values.
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# True for rows where every spending column is either 0 or missing.
no_spending = df[spending_cols].fillna(0).eq(0).all(axis=1)

# Work in the nullable boolean dtype so that filling the gaps does not rely on
# pandas silently downcasting an object column (deprecated behaviour that emits
# a FutureWarning).
cryo_sleep = df['CryoSleep'].astype('boolean')

# Rule 1: a missing CryoSleep with no recorded spending is set to True.
cryo_sleep = cryo_sleep.mask(cryo_sleep.isna() & no_spending, True)

# Rule 2: whatever is still missing gets the most frequent value; the column
# ends up as a plain bool column, as before.
df['CryoSleep'] = cryo_sleep.fillna(cryo_sleep.mode()[0]).astype(bool)



  df['CryoSleep'] = df['CryoSleep'].fillna(df['CryoSleep'].mode()[0]).astype(bool)


In [None]:
# Split the Cabin string on '/' into Cabin_Deck / Cabin_Num / Cabin_Side.
# pop() hands back the Cabin column and removes it from df in the same step.
df[['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']] = df.pop('Cabin').str.split('/', expand=True)

# Missing cabins: label deck and side 'Unknown'; use -1 as the number sentinel
# so the column can be cast to int.
df[['Cabin_Deck', 'Cabin_Side']] = df[['Cabin_Deck', 'Cabin_Side']].fillna('Unknown')
df['Cabin_Num'] = df['Cabin_Num'].fillna(-1).astype(int)


In [None]:
# Impute missing Destination values in two passes.
#
# Pass 1: look up the most frequent Destination among rows that share the
# passenger's HomePlanet. Series.map + fillna does this column-wise instead of
# calling a Python lambda once per row via df.apply(axis=1); rows whose
# HomePlanet is missing map to NaN and are left for pass 2.
most_common_dest_per_planet = df.groupby('HomePlanet')['Destination'].agg(lambda x: x.mode()[0])
df['Destination'] = df['Destination'].fillna(df['HomePlanet'].map(most_common_dest_per_planet))

# Pass 2: anything still missing gets the overall most frequent Destination.
df['Destination'] = df['Destination'].fillna(df['Destination'].mode()[0])


In [None]:
# Replace missing ages with the median age; known ages are kept as they are.
df['Age'] = df['Age'].where(df['Age'].notna(), df['Age'].median())


In [None]:
# Fill missing VIP flags with the most frequent value.
# The column is converted to the nullable boolean dtype first so the fill does
# not depend on pandas downcasting an object column (deprecated, emits a
# FutureWarning); the explicit cast leaves a plain bool column, matching how
# CryoSleep is handled.
vip_flags = df['VIP'].astype('boolean')
df['VIP'] = vip_flags.fillna(vip_flags.mode()[0]).astype(bool)


  df['VIP'] = df['VIP'].fillna(df['VIP'].mode()[0])


In [None]:
# Treat a missing amount in any of the five spending columns as zero spend.
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df[spending_cols] = df[spending_cols].fillna(0)


In [None]:
# Remove the Name column from df in place.
del df['Name']


In [None]:
# Encode the two flag columns as integers: cast to bool first, then to 0/1.
for col in ('CryoSleep', 'VIP'):
    df[col] = df[col].astype(bool).astype(int)


In [None]:
# Per-column count of values that are still missing after imputation.
df.isna().sum()

Unnamed: 0,0
PassengerId,0
HomePlanet,0
CryoSleep,0
Destination,0
Age,0
VIP,0
RoomService,0
FoodCourt,0
ShoppingMall,0
Spa,0


In [None]:
# Display the dtype of every column in df.
df.dtypes


Unnamed: 0,0
PassengerId,object
HomePlanet,object
CryoSleep,int64
Destination,object
Age,float64
VIP,int64
RoomService,float64
FoodCourt,float64
ShoppingMall,float64
Spa,float64


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Target: the Transported column cast to integers.
y = df['Transported'].astype(int)

# Features: every remaining column except the PassengerId identifier and the target.
X = df.drop(['PassengerId', 'Transported'], axis=1)

In [None]:
# One-hot encode the categorical columns of the feature frame.
X = pd.get_dummies(X)

# Hold out 20% of the rows for validation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest with a fixed seed (fit() returns the estimator itself),
# then predict the held-out rows.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report accuracy, confusion matrix and per-class precision/recall on the hold-out set.
print(" Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\n Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))
print("\n Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

 Accuracy: 0.7981598619896493

 Confusion Matrix:
 [[704 157]
 [194 684]]

 Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.82      0.80       861
           1       0.81      0.78      0.80       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



In [None]:
import pandas as pd


# Score the competition test set with the fitted random forest and write the
# submission file.
#
# The test frame must go through the same feature engineering as the training
# frame `df`. A blanket fillna(0) followed by get_dummies leaves CryoSleep, VIP
# and Cabin_Num as object columns, so they get one-hot encoded under different
# names; reindexing to the training columns then silently fills those three
# features with 0 for every row.
test_df = pd.read_csv('test.csv')

# Cabin -> deck / number / side, with the same sentinels used for training.
test_df[['Cabin_Deck', 'Cabin_Num', 'Cabin_Side']] = test_df['Cabin'].str.split('/', expand=True)
test_df = test_df.drop(columns=['Cabin', 'Name'])
test_df['Cabin_Deck'] = test_df['Cabin_Deck'].fillna('Unknown')
test_df['Cabin_Side'] = test_df['Cabin_Side'].fillna('Unknown')
test_df['Cabin_Num'] = test_df['Cabin_Num'].fillna(-1).astype(int)

# HomePlanet / Destination: fill from the most frequent partner value observed
# in the training frame, then fall back to the training mode.
planet_by_dest = df.groupby('Destination')['HomePlanet'].agg(lambda s: s.mode()[0])
test_df['HomePlanet'] = (
    test_df['HomePlanet']
    .fillna(test_df['Destination'].map(planet_by_dest))
    .fillna(df['HomePlanet'].mode()[0])
)
dest_by_planet = df.groupby('HomePlanet')['Destination'].agg(lambda s: s.mode()[0])
test_df['Destination'] = (
    test_df['Destination']
    .fillna(test_df['HomePlanet'].map(dest_by_planet))
    .fillna(df['Destination'].mode()[0])
)

# CryoSleep: a missing value with no recorded spending becomes 1 (True); the
# rest take the training mode. Mapping True/False to 1/0 up front keeps the
# column numeric, so it lines up with the integer CryoSleep training feature.
spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
no_spending = test_df[spending_cols].fillna(0).eq(0).all(axis=1)
cryo_sleep = test_df['CryoSleep'].map({True: 1, False: 0})
cryo_sleep = cryo_sleep.mask(cryo_sleep.isna() & no_spending, 1)
test_df['CryoSleep'] = cryo_sleep.fillna(int(df['CryoSleep'].mode()[0])).astype(int)

# VIP: training mode for missing values, encoded as 0/1.
test_df['VIP'] = (
    test_df['VIP']
    .map({True: 1, False: 0})
    .fillna(int(df['VIP'].mode()[0]))
    .astype(int)
)

# Age: training median. Spending columns: missing means no spend.
test_df['Age'] = test_df['Age'].fillna(df['Age'].median())
test_df[spending_cols] = test_df[spending_cols].fillna(0)

# One-hot encode without the identifier (it is not a model feature), then align
# to the exact column set the model was trained on. Categories unseen in
# training are dropped; categories absent from the test set become all-zero columns.
X_test = pd.get_dummies(test_df.drop(columns=['PassengerId']))
X_test = X_test.loc[:, ~X_test.columns.duplicated()]
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Predict with the estimator fitted earlier; it is bound to `model`
# (no `rf_model` variable is defined anywhere in the notebook).
y_pred = model.predict(X_test)

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': y_pred.astype(bool)
})

submission.to_csv('submission.csv', index=False)

print(submission.head())


  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
