<a href="https://colab.research.google.com/github/priyavanshjn/mini-project-titanic-ml-from-disaster/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Load training and test datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Save PassengerId for submission
passenger_ids = test_df['PassengerId']

# Drop irrelevant columns
train_df = train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Handle missing values
imputer = SimpleImputer(strategy='median')
train_df['Age'] = imputer.fit_transform(train_df[['Age']])
test_df['Age'] = imputer.transform(test_df[['Age']])
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
test_df['Embarked'] = test_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Feature engineering
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# One-hot encoding
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'], drop_first=True)

# Ensure both datasets have same columns
missing_cols = set(train_df.columns) - set(test_df.columns)
missing_cols.discard('Survived')
for col in missing_cols:
    test_df[col] = 0
test_df = test_df[train_df.drop('Survived', axis=1).columns]

# Split data for evaluation
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on validation set and evaluate
y_val_pred = model.predict(X_val)
y_val_pred_binary = (y_val_pred >= 0.5).astype(int)
accuracy = accuracy_score(y_val, y_val_pred_binary)
print(f"Validation Accuracy (thresholded): {accuracy:.4f}")

# Predict on test set
test_predictions = model.predict(test_df)
test_predictions_binary = (test_predictions >= 0.5).astype(int)

# Create submission DataFrame
submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': test_predictions_binary
})

# Save to CSV with required format
submission.to_csv("PRIYAVANSH_JAIN.csv", index=False)
print("Submission file 'PRIYAVANSH_JAIN' created successfully.")


Validation Accuracy (thresholded): 0.8268
Submission file 'PRIYAVANSH_JAIN' created successfully.
