# TASK : 1 (TITANIC_SURVIVAL_PREDICTION)

In [1]:
# Import all the libraries, then load and clean the data.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix
# Loading the dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere unless the CSV is placed at the same location.
titanic_data = pd.read_csv('C:/Users/91707/OneDrive/Desktop/CodSoft/Tasks/TITANIC_SURVIVAL_PREDICTION/Titanic-Dataset.csv')
# Filling missing values: median for 'Age', most frequent value for 'Embarked'.
# The fill is done on the frame and assigned back. Calling
# `df[col].fillna(..., inplace=True)` acts on a column selection, which emits a
# FutureWarning on pandas 2.x and does not update the frame under Copy-on-Write.
# NOTE(review): both statistics are computed on the full dataset, i.e. before
# the train/test split performed later — confirm this is acceptable.
titanic_data = titanic_data.fillna({
    'Age': titanic_data['Age'].median(),
    'Embarked': titanic_data['Embarked'].mode()[0],
})
# Dropping columns that are not used as features
titanic_data = titanic_data.drop(columns=['PassengerId', 'Cabin', 'Ticket', 'Name'])
# Converting categorical variables into numerical indicator columns;
# drop_first=True omits the first level of each variable.
titanic_data = pd.get_dummies(titanic_data, columns=['Sex', 'Embarked'], drop_first=True)

In [2]:
# Split the data
# Target is the 'Survived' column; every remaining column is a feature.
y = titanic_data['Survived']
X = titanic_data.drop(columns=['Survived'])
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Standardizing the data
# The scaler is fitted on the training features only and then applied to both
# splits, so no statistics from the test set are used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [4]:
# Training and fine-tuning the Random Forest model
# Base estimator; the fixed seed keeps the forest reproducible.
rf = RandomForestClassifier(random_state=42)
# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation, using all available cores.
grid_search = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
# Run the search on the training split.
grid_search.fit(X_train, y_train)
# Keep the estimator built with the best-scoring combination.
best_rf = grid_search.best_estimator_
# Report the winning hyperparameters.
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}


In [5]:
# Making predictions and evaluating the model
# Predict labels for the held-out test split.
y_pred = best_rf.predict(X_test)
# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Per-class precision / recall / F1, printed under its heading.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Accuracy: 0.8156424581005587
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       105
           1       0.83      0.70      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.80       179
weighted avg       0.82      0.82      0.81       179



# Printing the first few predictions and actual values


In [6]:
# Show the first ten predicted labels next to the first ten true labels.
first_predicted = y_pred[:10]
first_actual = y_test.to_numpy()[:10]
print("First few predictions values: ", first_predicted)
print("First few actual values: ", first_actual)

First few predictions values:  [0 0 0 1 1 1 1 0 1 1]
First few actual values:  [1 0 0 1 1 1 1 0 1 1]
