In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
import random

Loading the Dataset

In [18]:
# Load the Cleveland Heart Disease dataset
cleveland_data = pd.read_csv('Heart_disease_cleveland_new.csv', names=[
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak",
    "slope", "ca", "thal", "target"
], na_values='?')

# Load the Statlog (Heart) dataset
statlog_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat"
statlog_column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak",
    "slope", "ca", "thal", "target"
]
statlog_data = pd.read_csv(statlog_url, names=statlog_column_names, delim_whitespace=True)
# Adjust 'slope' and 'thal' values in Statlog dataset to match Cleveland dataset
statlog_data['slope'] = statlog_data['slope'].replace({1: 1, 2: 2, 3: 0})
statlog_data['thal'] = statlog_data['thal'].replace({3: 1, 6: 2, 7: 3})
statlog_data['target'] = statlog_data['target'].replace({1: 0, 2: 1})
print(statlog_data.head())

# Combine the datasets
data = pd.concat([cleveland_data, statlog_data])
# Display the rows where the target value is 2
# target_2_data = cleveland_data[cleveland_data['target'] == 2]
# print(target_2_data.head())
# print(len(target_2_data.index))
# Display the first few rows of the combined dataset
print(data.head())
print(len(data.index))


  statlog_data = pd.read_csv(statlog_url, names=statlog_column_names, delim_whitespace=True)


    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  70.0  1.0  4.0     130.0  322.0  0.0      2.0    109.0    0.0      2.4   
1  67.0  0.0  3.0     115.0  564.0  0.0      2.0    160.0    0.0      1.6   
2  57.0  1.0  2.0     124.0  261.0  0.0      0.0    141.0    0.0      0.3   
3  64.0  1.0  4.0     128.0  263.0  0.0      0.0    105.0    1.0      0.2   
4  74.0  0.0  2.0     120.0  269.0  0.0      2.0    121.0    1.0      0.2   

   slope   ca  thal  target  
0    2.0  3.0   1.0       1  
1    2.0  0.0   3.0       0  
2    1.0  0.0   3.0       1  
3    2.0  1.0   3.0       0  
4    1.0  1.0   1.0       0  
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0  age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope   
1   63    1   0       145   233    1        2      150      0      2.3      2   
2   67    1   3       160   286    0        2      108      1      1.5      1   
3   67    1   3       120   229  

Data Pre-Processing

In [19]:
data['target'] = pd.to_numeric(data['target'], errors='coerce')
# Handle missing values by dropping rows with missing values
data.dropna(inplace=True)

# Convert target column to integer type
data['target'] = data['target'].astype(int)

# Split the data into features and target
X = data.drop('target', axis=1)
y = data['target'].apply(lambda x: 1 if x > 0 else 0)  # Convert target to binary

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


JellyFish Optimization Algorithm

In [20]:
class JellyfishOptimizationAlgorithm:
    def __init__(self, population_size, dimensions, lower_bound, upper_bound, max_iter, b=0.5, alpha=1.5):
        self.population_size = population_size
        self.dimensions = dimensions
        self.lower_bound = np.array(lower_bound)
        self.upper_bound = np.array(upper_bound)
        self.max_iter = max_iter
        self.b = b  # Distribution coefficient
        self.alpha = alpha  # Alpha controls the movement step size
        self.population = self.initialize_population()
        self.best_solution = None
        self.best_fitness = -1

    def initialize_population(self):
        return np.random.uniform(self.lower_bound, self.upper_bound, (self.population_size, self.dimensions))

    def ocean_current(self, individual, mean_location):
        r = random.random()
        trend = self.best_solution - self.b * mean_location * r
        new_position = individual + r * trend
        return np.clip(new_position, self.lower_bound, self.upper_bound)

    def passive_motion(self, individual):
        new_position = individual + (self.upper_bound - self.lower_bound) * np.random.random(self.dimensions)
        return np.clip(new_position, self.lower_bound, self.upper_bound)

    def active_motion(self, individual):
        r = random.random()
        new_position = individual + self.alpha * r * (self.best_solution - individual)
        return np.clip(new_position, self.lower_bound, self.upper_bound)

    def optimize(self, fitness_function):
        self.best_solution = None
        self.best_fitness = -1

        for iteration in range(self.max_iter):
            c_t = iteration / self.max_iter
            mean_location = np.mean(self.population, axis=0)

            for i in range(self.population_size):
                individual = self.population[i]
                fitness_value = fitness_function(individual)
                if fitness_value > self.best_fitness:
                    self.best_fitness = fitness_value
                    self.best_solution = individual

            new_population = np.zeros_like(self.population)
            for i in range(self.population_size):
                if c_t < 0.5:
                    new_population[i] = self.ocean_current(self.population[i], mean_location)
                else:
                    if 1 - c_t < random.random():
                        new_population[i] = self.passive_motion(self.population[i])
                    else:
                        new_population[i] = self.active_motion(self.population[i])

            self.population = new_population

            # print(f"Iteration {iteration + 1}/{self.max_iter}, Best Fitness: {self.best_fitness}")

        return self.best_solution, self.best_fitness

Logistic Regression Model

In [21]:
# Define the bounds for C parameter of the Logistic Regression
lower_bound = [0.001]
upper_bound = [100]

# Instantiate the Jellyfish Optimization Algorithm
joa = JellyfishOptimizationAlgorithm(
    population_size=30, dimensions=1, lower_bound=lower_bound, upper_bound=upper_bound, max_iter=50
)

def logistic_regression_fitness(solution):
    C = solution[0]
    model = LogisticRegression(C=C, solver='liblinear')
    model.fit(X_train, y_train)
    predictions = model.predict(X_train)
    accuracy = accuracy_score(y_train, predictions)
    return accuracy

# Optimize the Logistic Regression parameters using JOA
best_solution, best_fitness = joa.optimize(logistic_regression_fitness)
print(f"Best solution (C): {best_solution[0]}, Best fitness: {best_fitness}")

# Train the Logistic Regression model with the best parameter
model = LogisticRegression(C=best_solution[0], solver='liblinear')
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")

Best solution (C): 29.167800629847573, Best fitness: 0.8646288209606987
Accuracy: 0.8434782608695652
Precision: 0.8636363636363636
Recall: 0.76
F1 Score: 0.8085106382978723
ROC AUC Score: 0.8338461538461539
