In [1]:
import numpy as np
import pandas as pd
import os

# List input files to confirm the correct paths
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load training and test datasets
train_df = pd.read_csv('/kaggle/input/hits-2024-spaceship-titanic-2nd-module/train.csv')
test_df = pd.read_csv('/kaggle/input/hits-2024-spaceship-titanic-2nd-module/test.csv')

# Display the first few rows of the training dataset
train_df.head()


/kaggle/input/hits-2024-spaceship-titanic-2nd-module/sample_submission.csv
/kaggle/input/hits-2024-spaceship-titanic-2nd-module/train.csv
/kaggle/input/hits-2024-spaceship-titanic-2nd-module/test.csv


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [2]:
# Check for missing values
print(train_df.isnull().sum())

# Summarize numerical features
print(train_df.describe())

# Display data types of each column
print(train_df.dtypes)



PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64
               Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  8514.000000   8512.000000   8510.000000   8485.000000   8510.000000   
mean     28.827930    224.687617    458.077203    173.729169    311.138778   
std      14.489021    666.717663   1611.489240    604.696458   1136.705535   
min       0.000000      0.000000      0.000000      0.000000      0.000000   
25%      19.000000      0.000000      0.000000      0.000000      0.000000   
50%      27.000000      0.000000      0.000000      0.000000      0.000000   
75%      38.000000     47.000000     76.000000     27.000000     59.000000   
max      79.000000  14327.000000  29813.000000  23492.000000  22408.000000   

    

In [3]:
# Define categorical and numerical columns
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Fill missing categorical values with mode
for col in categorical_cols:
    if col in train_df.columns: 
        train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
        test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

# Fill missing numerical values with median
for col in numerical_cols:
    if col in train_df.columns: 
        train_df[col] = train_df[col].fillna(train_df[col].median())
        test_df[col] = test_df[col].fillna(test_df[col].median())

# Encode categorical variables with one-hot encoding
train_df = pd.get_dummies(train_df, columns=[col for col in categorical_cols if col in train_df.columns], drop_first=True)
test_df = pd.get_dummies(test_df, columns=[col for col in categorical_cols if col in test_df.columns], drop_first=True)

# Ensure train and test sets have the same columns
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)


  train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
  test_df[col] = test_df[col].fillna(test_df[col].mode()[0])


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define features and target
X = train_df.drop(columns=['PassengerId', 'Name', 'Transported'], errors='ignore')
y = train_df['Transported']

# Train/test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_pred = model.predict(X_val)

# Evaluate performance
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(classification_report(y_val, y_pred))


Accuracy: 0.7930
Classification Report:
              precision    recall  f1-score   support

       False       0.79      0.79      0.79       861
        True       0.79      0.80      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [5]:
# Prepare test data for predictions
X_test = test_df.drop(columns=['PassengerId', 'Name', 'Transported'], errors='ignore')

# Generate predictions
test_df['Transported'] = model.predict(X_test)

# Create a submission file
submission = test_df[['PassengerId', 'Transported']]
submission.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file created successfully!")


Submission file created successfully!
