In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Load the Dataset
file_path = '/content/Dataset.csv'
titanic_data = pd.read_csv(file_path)

In [3]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [2]:
# Step 2: Preprocess the Data
# Dropping non-essential features
titanic_data_cleaned = titanic_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Handling missing values
titanic_data_cleaned['Age'].fillna(titanic_data_cleaned['Age'].median(), inplace=True)
titanic_data_cleaned['Fare'].fillna(titanic_data_cleaned['Fare'].median(), inplace=True)

# Encoding categorical variables
titanic_data_cleaned['Sex'] = titanic_data_cleaned['Sex'].map({'male': 0, 'female': 1})
titanic_data_cleaned = pd.get_dummies(titanic_data_cleaned, columns=['Embarked'], drop_first=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data_cleaned['Age'].fillna(titanic_data_cleaned['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data_cleaned['Fare'].fillna(titanic_data_cleaned['Fare'].median(), inplace=True)


In [4]:
# Step 3: Split Data into Features and Target
X = titanic_data_cleaned.drop(columns=['Survived'])
y = titanic_data_cleaned['Survived']

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Step 4: Train Random Forest Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 5: Make Predictions
y_pred = model.predict(X_test)

# Step 6: Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

In [6]:
# Output results
print("Accuracy on Test Set:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", confusion_mat)

Accuracy on Test Set: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84


Confusion Matrix:
 [[50  0]
 [ 0 34]]


In [7]:
# Step 7: Cross-Validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

# Output Cross-Validation Results
print("\nCross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_mean)
print("CV Accuracy Std Dev:", cv_std)


Cross-Validation Scores: [1. 1. 1. 1. 1.]
Mean CV Accuracy: 1.0
CV Accuracy Std Dev: 0.0
