<a href="https://colab.research.google.com/github/ravitejaadapa12/python-assignment/blob/main/ml_titanic_dataset_assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [3]:
# Load the dataset
file_path ='/content/titanic_dataset-1 (1) (1).csv'
titanic_data = pd.read_csv(file_path)

In [4]:
# Feature Engineering
# Fill missing 'Age' with median
age_imputer = SimpleImputer(strategy='median')
titanic_data['Age'] = age_imputer.fit_transform(titanic_data[['Age']])

In [5]:
# Fill missing 'Embarked' with the most frequent value
embarked_imputer = SimpleImputer(strategy='most_frequent')

# The following line is modified to extract the first column (index 0) of the transformed array
titanic_data['Embarked'] = embarked_imputer.fit_transform(titanic_data[['Embarked']])[:, 0]

# Drop irrelevant columns
titanic_data.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'], inplace=True)

# Create a new feature 'FamilySize'
titanic_data['FamilySize'] = titanic_data['SibSp'] + titanic_data['Parch']

# Encode categorical variables
titanic_data['Sex'] = LabelEncoder().fit_transform(titanic_data['Sex'])
titanic_data = pd.get_dummies(titanic_data, columns=['Embarked'], drop_first=True)

In [6]:
# Scaling numeric features
scaler = StandardScaler()
numeric_features = ['Age', 'Fare', 'FamilySize']
titanic_data[numeric_features] = scaler.fit_transform(titanic_data[numeric_features])

# Split dataset into features and target
X = titanic_data.drop(columns=['Survived'])
y = titanic_data['Survived']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

In [7]:
# Hyperparameter tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=100,
    cv=StratifiedKFold(n_splits=5),
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=2
)

In [9]:
# Fit the model
random_search.fit(X_train, y_train)

# Best parameters and model
best_model = random_search.best_estimator_
best_params = random_search.best_params_
best_accuracy = random_search.best_score_

# Predict on test set
y_pred = best_model.predict(X_test)

# Final accuracy and classification report
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


160 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [10]:
print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_accuracy)
print("Test Accuracy:", test_accuracy)
print("Classification Report:\n", report)

Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10}
Best Cross-Validation Accuracy: 0.8287402738106963
Test Accuracy: 0.8100558659217877
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.90      0.85       110
           1       0.81      0.67      0.73        69

    accuracy                           0.81       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.81      0.81       179

