In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score

# Titanic survival pipeline: load the CSVs, engineer features, cross-validate a
# random forest, fit it on the full training set and write the submission file.

# Load the data
train_data = pd.read_csv("/content/drive/MyDrive/train_titanic.csv")
test_data = pd.read_csv("/content/drive/MyDrive/test_titanic.csv")

# Map titles to integer codes. Codes 0-3 cover the common titles
# (Mr / Miss-like / Mrs-like / Master); codes 4 and 5 group the rarer ones.
title_mapping = {
    "Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Dr": 4, "Rev": 4, "Col": 4, "Major": 4,
    "Mlle": 1, "Mme": 2, "Don": 4, "Lady": 5, "Countess": 5, "Jonkheer": 5, "Sir": 5,
    "Capt": 4, "Ms": 1
}

# Code given to any title that is absent from title_mapping (for example a
# title that only occurs in the test file). Without it such rows become NaN
# and would later be replaced by a meaningless fractional "mean title".
UNMAPPED_TITLE_CODE = 5

# Raw string so that "\." is a regex escape rather than an (invalid) Python
# string escape. Captures the word that directly precedes the first period.
TITLE_PATTERN = r' ([A-Za-z]+)\.'

# All fill values are computed from the training data only, *before* any
# filling happens, so that no test-set statistic influences preprocessing.
overall_median_age = train_data['Age'].median()
train_median_fare = train_data['Fare'].median()

# Feature engineering, applied identically to both frames. Columns are
# reassigned rather than filled with ``inplace=True`` on a selected column:
# that chained form does not update the frame under pandas Copy-on-Write.
for frame in (train_data, test_data):
    # Combine "SibSp" and "Parch" into "FamilySize" (+1 counts the passenger)
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # Extract the title from the name and convert it to its integer code
    frame['Title'] = (
        frame['Name'].str.extract(TITLE_PATTERN, expand=False)
        .map(title_mapping)
        .fillna(UNMAPPED_TITLE_CODE)
        .astype(int)
    )

    # Fill missing ages with the training median
    frame['Age'] = frame['Age'].fillna(overall_median_age)

    # Map gender to numerical values
    frame['Sex'] = frame['Sex'].map({"male": 0, "female": 1})

# Fill missing fare values in the test data with the training median fare
test_data['Fare'] = test_data['Fare'].fillna(train_median_fare)

# Select relevant features. Every one of them is numeric at this point, so no
# dummy encoding is required (pd.get_dummies would return them unchanged).
features = ["Pclass", "Sex", "FamilySize", "Fare", "Title"]
X = train_data[features]
X_test = test_data[features]

# Safety net for any value still missing in X_test. The imputer learns its
# column means from the training features, never from the test set.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X_test_imputed = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Build a random forest model
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

# Evaluate the model using 5-fold cross-validation and report the result
# (a bare expression in the middle of a cell is silently discarded).
cv_scores = cross_val_score(model, X, train_data["Survived"], cv=5, scoring="accuracy")
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Fit the model on the full training set
model.fit(X, train_data["Survived"])

# Make predictions
predictions = model.predict(X_test_imputed)

# Save predictions to a CSV file
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!
