In [1]:
import random

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the Cleveland Heart Disease dataset
cleveland_data = pd.read_csv('Heart_disease_cleveland_new.csv', names=[
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak",
    "slope", "ca", "thal", "target"
], na_values='?')

# Load the Statlog (Heart) dataset from a local file
statlog_data = pd.read_csv('heart.dat', names=[
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak",
    "slope", "ca", "thal", "target"
], delim_whitespace=True)

# Adjust 'slope' and 'thal' values in Statlog dataset to match Cleveland dataset
statlog_data['slope'] = statlog_data['slope'].replace({1: 1, 2: 2, 3: 0})
statlog_data['thal'] = statlog_data['thal'].replace({3: 1, 6: 2, 7: 3})
statlog_data['target'] = statlog_data['target'].replace({1: 0, 2: 1})

# Combine the datasets
data = pd.concat([cleveland_data, statlog_data])

# Handle missing values by dropping rows with missing values
data.dropna(inplace=True)

# Check for any non-numeric values in 'target' column
data['target'] = pd.to_numeric(data['target'], errors='coerce')
data.dropna(subset=['target'], inplace=True)

# Convert target column to integer type
data['target'] = data['target'].astype(int)

# Split the data into features and target
X = data.drop('target', axis=1)
y = data['target'].apply(lambda x: 1 if x > 0 else 0)  # Convert target to binary

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


  statlog_data = pd.read_csv('heart.dat', names=[


In [3]:
# Define the objective function (accuracy)
def objective_function(params, X_train, y_train, X_test, y_test):
    criterion, max_depth, min_samples_split, min_samples_leaf = params
    dt_model = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42
    )
    dt_model.fit(X_train, y_train)
    y_pred = dt_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


In [4]:
class JellyfishOptimizationAlgorithm:
    def __init__(self, population_size, dimensions, lower_bound, upper_bound, max_iter, X_train, y_train, b=0.5, alpha=1.5):
        self.population_size = population_size
        self.dimensions = dimensions
        self.lower_bound = np.array(lower_bound)
        self.upper_bound = np.array(upper_bound)
        self.max_iter = max_iter
        self.b = b  # Distribution coefficient
        self.alpha = alpha  # Alpha controls the movement step size
        self.population = self.initialize_population()
        self.best_solution = None
        self.best_fitness = -1
        self.X_train = X_train
        self.y_train = y_train

    def initialize_population(self):
        return np.random.uniform(self.lower_bound, self.upper_bound, (self.population_size, self.dimensions))

    def fitness(self, solution):
        criterion = 'gini' if solution[0] < 0.5 else 'entropy'
        max_depth = int(solution[1])
        min_samples_split = int(solution[2])
        min_samples_leaf = int(solution[3])
        
        model = DecisionTreeClassifier(
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf
        )
        model.fit(self.X_train, self.y_train)
        predictions = model.predict(self.X_train)
        accuracy = accuracy_score(self.y_train, predictions)
        return accuracy

    def ocean_current(self, individual, mean_location):
        r = random.random()
        trend = self.best_solution - self.b * mean_location * r
        new_position = individual + r * trend
        return np.clip(new_position, self.lower_bound, self.upper_bound)

    def passive_motion(self, individual):
        new_position = individual + (self.upper_bound - self.lower_bound) * np.random.random(self.dimensions)
        return np.clip(new_position, self.lower_bound, self.upper_bound)

    def active_motion(self, individual):
        r = random.random()
        new_position = individual + self.alpha * r * (self.best_solution - individual)
        return np.clip(new_position, self.lower_bound, self.upper_bound)

    def optimize(self):
        self.best_solution = None
        self.best_fitness = -1

        for iteration in range(self.max_iter):
            c_t = iteration / self.max_iter
            mean_location = np.mean(self.population, axis=0)

            for i in range(self.population_size):
                individual = self.population[i]
                fitness_value = self.fitness(individual)
                if fitness_value > self.best_fitness:
                    self.best_fitness = fitness_value
                    self.best_solution = individual

            new_population = np.zeros_like(self.population)
            for i in range(self.population_size):
                if c_t < 0.5:
                    new_population[i] = self.ocean_current(self.population[i], mean_location)
                else:
                    if 1 - c_t < random.random():
                        new_population[i] = self.passive_motion(self.population[i])
                    else:
                        new_population[i] = self.active_motion(self.population[i])

            self.population = new_population

            print(f"Iteration {iteration + 1}/{self.max_iter}, Best Fitness: {self.best_fitness}")

        return self.best_solution, self.best_fitness

# Example usage
param_bounds = {
    'criterion': (0, 1),  # 0 for 'gini', 1 for 'entropy'
    'max_depth': (1, 50),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 10)
}

In [5]:
lower_bound = [param_bounds['criterion'][0], param_bounds['max_depth'][0], param_bounds['min_samples_split'][0], param_bounds['min_samples_leaf'][0]]
upper_bound = [param_bounds['criterion'][1], param_bounds['max_depth'][1], param_bounds['min_samples_split'][1], param_bounds['min_samples_leaf'][1]]

jellyfish_optimizer = JellyfishOptimizationAlgorithm(
    population_size=30,
    dimensions=4,
    lower_bound=lower_bound,
    upper_bound=upper_bound,
    max_iter=50,
    X_train=X_train,
    y_train=y_train
)

best_solution, best_fitness = jellyfish_optimizer.optimize()
print(f"Best Solution: {best_solution}, Best Fitness: {best_fitness}")

# Train the Decision Tree model with the best parameters
best_criterion = 'gini' if best_solution[0] < 0.5 else 'entropy'
best_max_depth = int(best_solution[1])
best_min_samples_split = int(best_solution[2])
best_min_samples_leaf = int(best_solution[3])

best_model = DecisionTreeClassifier(
    criterion=best_criterion,
    max_depth=best_max_depth,
    min_samples_split=best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
    random_state=42
)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Print the evaluation metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted')}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Append the results to the CSV file
results = pd.DataFrame({
    'Model': ['Decision Tree'],
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1-Score': [f1]
})

results.to_csv('model_results.csv', mode='a', header=False, index=False)

Iteration 1/50, Best Fitness: 0.9585152838427947
Iteration 2/50, Best Fitness: 0.9759825327510917
Iteration 3/50, Best Fitness: 0.980349344978166
Iteration 4/50, Best Fitness: 0.980349344978166
Iteration 5/50, Best Fitness: 0.980349344978166
Iteration 6/50, Best Fitness: 0.980349344978166
Iteration 7/50, Best Fitness: 0.980349344978166
Iteration 8/50, Best Fitness: 0.980349344978166
Iteration 9/50, Best Fitness: 0.980349344978166
Iteration 10/50, Best Fitness: 0.9912663755458515
Iteration 11/50, Best Fitness: 0.9934497816593887
Iteration 12/50, Best Fitness: 1.0
Iteration 13/50, Best Fitness: 1.0
Iteration 14/50, Best Fitness: 1.0
Iteration 15/50, Best Fitness: 1.0
Iteration 16/50, Best Fitness: 1.0
Iteration 17/50, Best Fitness: 1.0
Iteration 18/50, Best Fitness: 1.0
Iteration 19/50, Best Fitness: 1.0
Iteration 20/50, Best Fitness: 1.0
Iteration 21/50, Best Fitness: 1.0
Iteration 22/50, Best Fitness: 1.0
Iteration 23/50, Best Fitness: 1.0
Iteration 24/50, Best Fitness: 1.0
Iteration 2