In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import tkinter as tk
from tkinter import filedialog, messagebox
import os


In [3]:
# Step 1: Load the Dataset
data = pd.read_csv('train.csv')  # Replace with your dataset path
print("Dataset loaded successfully!")

# Preview the dataset
print(data.head())


Dataset loaded successfully!
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450

In [6]:
# Fill missing values for 'Age' with its median
data['Age'] = data['Age'].fillna(data['Age'].median())

# Fill missing values for 'Embarked' with its mode
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Fill missing values for 'Fare' with its median
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

print("Missing values handled successfully!")


KeyError: 'Embarked'

In [7]:
# Drop columns that are not useful for the model
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

print("Irrelevant columns dropped!")


KeyError: "['PassengerId', 'Name', 'Ticket', 'Cabin'] not found in axis"

In [8]:
# Perform one-hot encoding for 'Sex' and 'Embarked'
data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

print("Categorical variables encoded successfully!")


KeyError: "None of [Index(['Sex', 'Embarked'], dtype='object')] are in the [columns]"

In [9]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Normalize 'Age' and 'Fare'
data[['Age', 'Fare']] = scaler.fit_transform(data[['Age', 'Fare']])

print("Numerical features normalized!")


Numerical features normalized!


In [10]:
# Define feature variables (X) and target variable (y)
X = data.drop('Survived', axis=1)  # Features
y = data['Survived']               # Target

print("Features and target variables prepared!")


Features and target variables prepared!


In [11]:
data.to_csv('processed_titanic_data.csv', index=False)
print("Processed data saved to 'processed_titanic_data.csv'.")


Processed data saved to 'processed_titanic_data.csv'.


In [12]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing sets successfully!")


Data split into training and testing sets successfully!


In [13]:
#TRAIN MODELS WITH DEFAULT PARAMETERS 
# LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
logistic_model = LogisticRegression(random_state=42)

# Train the model
logistic_model.fit(X_train, y_train)

# Make predictions
logistic_pred = logistic_model.predict(X_test)

# Evaluate the model
logistic_acc = accuracy_score(y_test, logistic_pred)
print(f"Logistic Regression Accuracy: {logistic_acc:.2f}")
print("Classification Report:")
print(classification_report(y_test, logistic_pred))


Logistic Regression Accuracy: 0.80
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [14]:
#DECISION TREE CLASSIFIER
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree model
tree_model = DecisionTreeClassifier(random_state=42)

# Train the model
tree_model.fit(X_train, y_train)

# Make predictions
tree_pred = tree_model.predict(X_test)

# Evaluate the model
tree_acc = accuracy_score(y_test, tree_pred)
print(f"Decision Tree Accuracy: {tree_acc:.2f}")
print("Classification Report:")
print(classification_report(y_test, tree_pred))


Decision Tree Accuracy: 0.78
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       105
           1       0.73      0.76      0.74        74

    accuracy                           0.78       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.78      0.78      0.78       179



In [15]:
#PERFORMING HYPERPARAMETER TUNING
# GRID SEARCH FOR LOGISTIC REGRESSION
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
logistic_params = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

# Initialize Grid Search
grid_logistic = GridSearchCV(estimator=LogisticRegression(random_state=42),
                             param_grid=logistic_params,
                             scoring='accuracy',
                             cv=5)

# Perform the search
grid_logistic.fit(X_train, y_train)

# Best parameters and score
print("Best Logistic Regression Parameters:", grid_logistic.best_params_)
print("Best Logistic Regression Score:", grid_logistic.best_score_)




Best Logistic Regression Parameters: {'C': 1, 'solver': 'saga'}
Best Logistic Regression Score: 0.7977149610952428




In [16]:
#RANDOMIZED SEARCH FOR DECISION TREE
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter distribution
tree_params = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Randomized Search
random_tree = RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                                 param_distributions=tree_params,
                                 n_iter=10,
                                 scoring='accuracy',
                                 cv=5,
                                 random_state=42)

# Perform the search
random_tree.fit(X_train, y_train)

# Best parameters and score
print("Best Decision Tree Parameters:", random_tree.best_params_)
print("Best Decision Tree Score:", random_tree.best_score_)


Best Decision Tree Parameters: {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 3}
Best Decision Tree Score: 0.8230079779375554


In [17]:
#VALIDATE THE MODEL WITH CROSS-VALIDATION
from sklearn.model_selection import cross_val_score

# Perform cross-validation for the tuned Logistic Regression model
logistic_cv_scores = cross_val_score(grid_logistic.best_estimator_, X, y, cv=5)
print(f"Logistic Regression Cross-Validation Accuracy: {logistic_cv_scores.mean():.2f}")

# Perform cross-validation for the tuned Decision Tree model
tree_cv_scores = cross_val_score(random_tree.best_estimator_, X, y, cv=5)
print(f"Decision Tree Cross-Validation Accuracy: {tree_cv_scores.mean():.2f}")




Logistic Regression Cross-Validation Accuracy: 0.79
Decision Tree Cross-Validation Accuracy: 0.81


In [18]:
#
# Print a summary of model performances
print("Model Performance Summary:")
print(f"Logistic Regression - Default Accuracy: {logistic_acc:.2f}")
print(f"Decision Tree - Default Accuracy: {tree_acc:.2f}")
print(f"Logistic Regression - Tuned Accuracy: {grid_logistic.best_score_:.2f}")
print(f"Decision Tree - Tuned Accuracy: {random_tree.best_score_:.2f}")


Model Performance Summary:
Logistic Regression - Default Accuracy: 0.80
Decision Tree - Default Accuracy: 0.78
Logistic Regression - Tuned Accuracy: 0.80
Decision Tree - Tuned Accuracy: 0.82
