In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
os.listdir('/kaggle/input')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

['titanic-dataset', 'd', 'titanic']

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load the preprocessed DataFrame (X is features, y is the target variable)
df = pd.read_csv("/kaggle/input/titanic/train.csv")

# Drop unnecessary columns
dropped_columns = ["PassengerId", "Name", "Ticket", "Cabin", "Embarked"]
df.drop(dropped_columns, inplace=True, axis=1)

# Map 'Sex' to 0 and 1
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Separate features and target variable
X = df.drop("Survived", axis=1)  # Features
y = df["Survived"]  # Target variable

# Fill missing values in the 'Age' column with the mean age
mean_age = X.loc[X['Age'].notnull(), 'Age'].mean()
X['Age'] = X['Age'].fillna(mean_age)

# Apply RobustScaler to features
robust_scaler = RobustScaler()
X_scaled = robust_scaler.fit_transform(X)

# Apply MinMaxScaler to features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=5, random_state=42)

# Fit the model on the training set
rf_classifier.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf_classifier.predict(X_train)

# Generate a classification report for the training set
train_report = classification_report(y_train, y_train_pred)
print("Training Set Classification Report:\n", train_report)

# Load the test dataset provided by Kaggle
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")

# Drop unnecessary columns from the test dataset
dropped_columns_test = ["Name", "Ticket", "Cabin", "Embarked"]
test_df.drop(dropped_columns_test, inplace=True, axis=1)

# Map 'Sex' to 0 and 1
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})

# Fill missing values in the 'Age' column with the mean age
mean_age_test = test_df['Age'].mean()
test_df['Age'] = test_df['Age'].fillna(mean_age_test)

# Separate features from the test dataset
X_test_final = test_df.drop("PassengerId", axis=1)  # Features

# Fill missing values in the test set with mean values
X_test_final = X_test_final.fillna(X_test_final.mean())

# Apply RobustScaler to features
X_test_final_scaled = robust_scaler.transform(X_test_final)

# Apply MinMaxScaler to features
X_test_final_scaled = scaler.transform(X_test_final_scaled)

# Make predictions on the final test set
final_test_predictions = rf_classifier.predict(X_test_final_scaled)

# Create a submission DataFrame with 'PassengerId' and 'Survived'
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': final_test_predictions})
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

Training Set Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95       444
           1       0.95      0.90      0.92       268

    accuracy                           0.94       712
   macro avg       0.94      0.93      0.94       712
weighted avg       0.94      0.94      0.94       712

