In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score

In [None]:
# Load the encoded dataset; 'Arrhythmia' is the target, every other column a feature.
df = pd.read_csv('encoded_data.csv')
y = np.array(df['Arrhythmia'])
x = np.array(df.drop(columns=['Arrhythmia']))

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Bayesian Optimization

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold

In [None]:
def stratified_kfold_score(clf, x, y, n_fold):
    """Return the mean accuracy of ``clf`` over a shuffled stratified k-fold split.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted on the training part of every fold.
    x : array-like
        Samples along the first axis.
    y : array-like
        Labels aligned with ``x``.
    n_fold : int
        Number of folds.

    Returns
    -------
    numpy.float64
        Arithmetic mean of the per-fold accuracies.
    """
    # The fold indices are positional, so coerce to ndarrays; a DataFrame or
    # list passed directly would fail (or be indexed by label) below.
    x, y = np.asarray(x), np.asarray(y)
    # Fixed seed: every call is scored on the same folds.
    strat_kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1)
    accuracy_list = []

    for train_index, test_index in strat_kfold.split(x, y):
        clf.fit(x[train_index], y[train_index])
        preds = clf.predict(x[test_index])
        # scikit-learn metrics take (y_true, y_pred) in that order.
        accuracy_list.append(accuracy_score(y[test_index], preds))

    return np.array(accuracy_list).mean()

In [None]:
def bo_params_rf(max_samples, n_estimators, max_features):
    """Objective for the Bayesian optimiser: 10-fold CV accuracy of a random forest.

    Parameters
    ----------
    max_samples : float
        Passed through as the forest's ``max_samples``.
    n_estimators : float
        Proposed as a float by the optimiser; truncated to ``int``.
    max_features : float
        Passed through as the forest's ``max_features``.

    Returns
    -------
    Mean stratified 10-fold accuracy on ``x_train`` / ``y_train``.
    """
    clf = RandomForestClassifier(
        max_samples=max_samples,
        max_features=max_features,
        n_estimators=int(n_estimators),
        # Seed the forest so the same hyperparameter point always yields the
        # same score; an unseeded forest makes the objective noisy.
        random_state=42,
        n_jobs=-1,
    )
    return stratified_kfold_score(clf, x_train, y_train, 10)

In [None]:
# Optimiser over the random-forest search space (lower, upper bound per parameter).
# random_state makes the sequence of probed points reproducible across runs.
rf_bo = BayesianOptimization(
    bo_params_rf,
    {
        'max_samples': (0.5, 1),
        'max_features': (0.5, 1),
        'n_estimators': (1, 100),
    },
    random_state=42,
)

In [None]:
# Run the search: 20 initial points followed by 100 optimisation iterations,
# with the acquisition setting 'ei'.
results = rf_bo.maximize(
    init_points=20,
    n_iter=100,
    acq='ei',
)

In [None]:
# Best parameter set found by the optimiser (n_estimators is cast to int in the next cell).
rf_bo.max['params']

In [None]:
# Take the best parameter set and make n_estimators an integer, as the forest expects.
params = dict(rf_bo.max['params'])
params.update(n_estimators=int(params['n_estimators']))

In [None]:
# Refit on the whole training split with the tuned hyperparameters and score
# on the hold-out split.
clf = RandomForestClassifier(
    max_samples=params['max_samples'],
    max_features=params['max_features'],
    n_estimators=params['n_estimators'],
    n_jobs=-1,
    random_state=42,  # make the reported hold-out metrics reproducible
)
clf.fit(x_train, y_train)
preds = clf.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy_score(y_test, preds)

In [None]:
# Hold-out accuracy of `preds` against `y_test`.
accuracy_score(y_test, preds)

In [None]:
# Hold-out precision with scikit-learn's default averaging
# (NOTE(review): the default assumes a binary target — confirm).
precision_score(y_test, preds)

In [None]:
# Hold-out recall with scikit-learn's default averaging
# (NOTE(review): the default assumes a binary target — confirm).
recall_score(y_test, preds)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
# ROC AUC must be computed from a continuous score; hard class labels reduce
# it to a single operating point. Column 1 is the probability of the positive class.
roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])

In [None]:
from matplotlib import pyplot as plt

In [None]:
# ROC curve on the hold-out split.
# Style and figure size are set before drawing so they apply to this figure
# on the first run as well as on re-runs.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [11, 5]

# Use the predicted probability of the positive class: with hard 0/1
# predictions the curve has only one threshold and the AUC is understated.
y_score = clf.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_score)
auc = roc_auc_score(y_test, y_score)

plt.plot(fpr, tpr, label="AUC="+str(auc), color = 'teal')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend()
plt.tight_layout()

# Cuckoo Search

In [None]:
from mealpy.swarm_based.CSA import OriginalCSA
from math import floor
from mealpy.tuner import Tuner

In [None]:
def fitness_function(params):
    """CSA fitness: validation accuracy of a random forest built from ``params``.

    Parameters
    ----------
    params : sequence of float
        Position vector ``[max_depth, n_estimators, min_samples_leaf,
        max_features]``; each entry is floored to an integer.

    Returns
    -------
    Accuracy on a validation split carved out of the training data.

    Notes
    -----
    The candidate is scored on a validation subset of ``x_train`` rather than
    on ``x_test``: selecting hyperparameters on the test set leaks it into the
    model choice and makes any later test-set metric optimistic.
    """
    # Fixed seed so every candidate sees the same fit/validation partition.
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train, y_train, test_size=0.2, stratify=y_train, random_state=42
    )
    rfc = RandomForestClassifier(
        random_state=2,
        max_depth=floor(params[0]),
        n_estimators=floor(params[1]),
        min_samples_leaf=floor(params[2]),
        max_features=floor(params[3]),
    )
    rfc.fit(x_fit, y_fit)
    return accuracy_score(y_val, rfc.predict(x_val))

In [None]:
# (rows, columns) of the loaded frame.
# NOTE(review): presumably checked to pick the max_features upper bound (43) used below — confirm.
df.shape

In [None]:
# Search bounds per dimension, in the order fitness_function reads them:
# [max_depth, n_estimators, min_samples_leaf, max_features]. Fitness is maximised.
problem_dict = dict(
    fit_func=fitness_function,
    lb=[1, 1, 1, 1],
    ub=[100, 100, 100, 43],
    minmax='max',
)

# Cuckoo Search settings, passed positionally to OriginalCSA.
epoch, pop_size, p_a = 100, 100, 0.25
model = OriginalCSA(epoch, pop_size, p_a)
best_position, best_fitness = model.solve(problem_dict)
print(f'Best position: {best_position}, Best Fitness: {best_fitness}')

# Cuckoo Search (CSA) with Cross-Validated Fitness

In [None]:
def stratified_kfold_score(clf, x, y, n_fold):
    """Return the mean accuracy of ``clf`` over a shuffled stratified k-fold split.

    NOTE(review): this redefines the helper of the same name declared earlier
    in the notebook; keep a single definition once the notebook is tidied.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted on the training part of every fold.
    x : array-like
        Samples along the first axis.
    y : array-like
        Labels aligned with ``x``.
    n_fold : int
        Number of folds.

    Returns
    -------
    numpy.float64
        Arithmetic mean of the per-fold accuracies.
    """
    # The fold indices are positional, so coerce to ndarrays; a DataFrame or
    # list passed directly would fail (or be indexed by label) below.
    x, y = np.asarray(x), np.asarray(y)
    # Fixed seed: every call is scored on the same folds.
    strat_kfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1)
    accuracy_list = []

    for train_index, test_index in strat_kfold.split(x, y):
        clf.fit(x[train_index], y[train_index])
        preds = clf.predict(x[test_index])
        # scikit-learn metrics take (y_true, y_pred) in that order.
        accuracy_list.append(accuracy_score(y[test_index], preds))

    return np.array(accuracy_list).mean()

In [None]:
def fitness_function(params):
    """CSA fitness: stratified 10-fold CV accuracy of a random forest on the training split.

    ``params`` is read as ``[max_depth, n_estimators, min_samples_leaf,
    max_features]``; each entry is floored to an integer before use.
    """
    depth = floor(params[0])
    n_trees = floor(params[1])
    leaf_size = floor(params[2])
    n_features = floor(params[3])

    forest = RandomForestClassifier(
        criterion='gini',
        max_depth=depth,
        n_estimators=n_trees,
        min_samples_leaf=leaf_size,
        max_features=n_features,
        random_state=2,
        n_jobs=-1,
    )
    return stratified_kfold_score(forest, x_train, y_train, 10)

In [None]:
# Search bounds per dimension, in the order fitness_function reads them:
# [max_depth, n_estimators, min_samples_leaf, max_features]. Fitness is maximised.
problem_dict1 = dict(
    fit_func=fitness_function,
    lb=[1, 1, 1, 1],
    ub=[100, 500, 200, 43],
    minmax='max',
)

# Cuckoo Search settings, passed positionally to OriginalCSA.
epoch, pop_size, p_a = 100, 100, 0.25
model = OriginalCSA(epoch, pop_size, p_a)
best_position, best_fitness = model.solve(problem_dict1)
print(f'Best position: {best_position}, Best Fitness: {best_fitness}')

In [None]:
# Print the floored (integer) value of every dimension of the best position.
# A named loop variable is used because `_` conventionally marks an unused
# value and, in IPython, also holds the previous cell's output.
for value in best_position:
    print(floor(value))

In [None]:
# Print the floored (integer) value of every dimension of the best position.
# A named loop variable is used because `_` conventionally marks an unused
# value and, in IPython, also holds the previous cell's output.
for value in best_position:
    print(floor(value))

In [None]:
from mealpy.utils.visualize import *

In [None]:
# Convergence chart: global best fitness found up to each generation.
export_convergence_chart(
    model.history.list_global_best_fit,
    title='Global Best Fitness',
    x_label='epoch',
    y_label="Accuracy",
)

In [None]:
# Convergence chart: best fitness within each individual generation.
export_convergence_chart(
    model.history.list_current_best_fit,
    title='Local Best Fitness',
    x_label='epoch',
    y_label="Accuracy",
)

In [None]:
# Convergence chart: time spent in each generation.
export_convergence_chart(
    model.history.list_epoch_time,
    title='Runtime',
    x_label='epoch',
    y_label="Second",
)

In [None]:
# The exploration/exploitation chart should be drawn for a single algorithm and a single fitness function.
# Draw the exploration and exploitation chart from the model's recorded history.
export_explore_exploit_chart([model.history.list_exploration, model.history.list_exploitation], x_label='epoch')

In [None]:
# Collect agent[1][1] from every recorded global-best agent into a 2-D array
# (presumably the per-objective values in mealpy's agent layout — confirm).
global_obj_list = np.array(
    [agent[1][1] for agent in model.history.list_global_best]
)
# Split column-wise so that each objective becomes its own series for drawing.
global_obj_list = [
    global_obj_list[:, idx] for idx in range(global_obj_list.shape[1])
]
export_objectives_chart(global_obj_list, title='Global Objectives Chart')

In [None]:
# Collect agent[1][1] from every recorded current-best agent into a 2-D array
# (presumably the per-objective values in mealpy's agent layout — confirm).
current_obj_list = np.array(
    [agent[1][1] for agent in model.history.list_current_best]
)
# Split column-wise so that each objective becomes its own series for drawing.
current_obj_list = [
    current_obj_list[:, idx] for idx in range(current_obj_list.shape[1])
]
export_objectives_chart(current_obj_list, title='Local Objectives Chart')

In [None]:
# Random forest with hard-coded hyperparameters
# (NOTE(review): presumably taken from a Cuckoo Search run — record which one).
# Seeded with the same random_state the CSA fitness functions use, so the
# cross-validation results below are reproducible.
rfc = RandomForestClassifier(
    n_estimators=46,
    max_depth=30,
    min_samples_leaf=13,
    max_features=21,
    n_jobs=-1,
    criterion='gini',
    random_state=2,
)

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score

In [None]:
# Reload the encoded dataset; 'Arrhythmia' is the target, every other column a feature.
df = pd.read_csv('encoded_data.csv')
y = np.array(df['Arrhythmia'])
x = np.array(df.drop(columns=['Arrhythmia']))

In [None]:
# 10-fold cross-validation of rfc on the full dataset, scored by macro precision,
# keeping the training scores as well.
scores = cross_validate(
    rfc,
    x,
    y,
    cv=10,
    scoring='precision_macro',
    return_train_score=True,
)
scores

In [None]:
# Tabulate the cross_validate output (one row per fold).
df_scores = pd.DataFrame(scores)
df_scores

In [None]:
# Persist the per-fold scores to the working directory, without the index column.
df_scores.to_csv('CV_scores_all_features.csv', index=False)

In [None]:
# Mean test score across folds (macro precision, per the scoring passed to cross_validate).
scores['test_score'].mean()

In [None]:
# NOTE(review): despite the name, this is the mean of 'test_score', which was
# computed with scoring='precision_macro' — it is macro precision, not accuracy.
best_acc = df_scores['test_score'].mean()

In [None]:
# Display the mean cross-validated test score computed above.
best_acc

In [None]:
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve, f1_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from matplotlib import pyplot as plt

In [None]:
import os

# Per-fold evaluation of `rfc` on the full dataset with a seeded, shuffled
# stratified 10-fold split. Collected for every fold: fit/predict time (ms),
# test and train accuracy, weighted F1/precision/recall, confusion matrix,
# ROC AUC, and a ROC curve saved under Graphs/.
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
test_acc = []
train_acc = []
fit_time = []
score_time = []
p_score = []
r_score = []
f_score = []
conf_mat = []
roc_score = []

# savefig does not create missing directories.
os.makedirs('Graphs', exist_ok=True)
# Style and size are set before any figure is created so they apply to all folds.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [11, 5]

for i, (train_index, test_index) in enumerate(strat_kfold.split(x, y)):
    x_train_fold, x_test_fold = x[train_index], x[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Fit time in milliseconds.
    start_fit = time.perf_counter()
    rfc.fit(x_train_fold, y_train_fold)
    end_fit = time.perf_counter()
    fit_time.append((end_fit - start_fit) * 10**3)

    # Prediction time in milliseconds.
    start_score = time.perf_counter()
    y_predict = rfc.predict(x_test_fold)
    end_score = time.perf_counter()
    score_time.append((end_score - start_score) * 10**3)

    # Label-based metrics on the test fold.
    accuracy_test = accuracy_score(y_test_fold, y_predict)
    test_acc.append(accuracy_test)
    f_score.append(f1_score(y_test_fold, y_predict, average='weighted'))
    p_score.append(precision_score(y_test_fold, y_predict, average='weighted'))
    r_score.append(recall_score(y_test_fold, y_predict, average='weighted'))
    conf_mat.append(confusion_matrix(y_test_fold, y_predict))

    # ROC/AUC need a continuous score: use the probability of the positive
    # class (column 1). Hard labels give a single-threshold curve.
    y_score = rfc.predict_proba(x_test_fold)[:, 1]
    fpr, tpr, _ = roc_curve(y_test_fold, y_score)
    auc = roc_auc_score(y_test_fold, y_score)
    roc_score.append(auc)

    # One figure per fold, closed after saving so figures do not accumulate.
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label="AUC="+str(auc), color='teal')
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'Graphs/roc_curve{i}.png')
    plt.close(fig)

    # Train accuracy (the original appended the test accuracy here by mistake).
    y_train_predict = rfc.predict(x_train_fold)
    accuracy_train = accuracy_score(y_train_fold, y_train_predict)
    train_acc.append(accuracy_train)

print(np.array(test_acc).mean())

In [None]:
# Per-fold test accuracies collected by the loop above.
test_acc

In [None]:
# Gather every per-fold result list under its column name.
scorings = {
    'test_acc': test_acc,
    'train_acc': train_acc,
    'fit_time': fit_time,
    'score_time': score_time,
    'f_score': f_score,
    'p_score': p_score,
    'r_score': r_score,
    'conf_mat': conf_mat,
    'roc_score': roc_score,
}
scorings

In [None]:
# Tabulate the per-fold results.
# NOTE: this rebinds df_scores, which earlier held the cross_validate output.
df_scores = pd.DataFrame(scorings)
df_scores

In [None]:
# Persist the per-fold results under Graphs/, without the index column.
df_scores.to_csv('Graphs/All_features_results.csv', index=False)

In [None]:
# Mean test accuracy across the folds.
something = df_scores['test_acc']
np.mean(np.array(something))