In [None]:
import sys
sys.path.insert(0, './src')

In [None]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from src.utils.metric_util import Evaluator
from src.config import Config
from src.utils.data_util import get_dataset
from mealpy.swarm_based.CSA import OriginalCSA
import pickle
from sklearn.model_selection import StratifiedKFold
import time
from matplotlib import pyplot as plt

In [None]:
# Random forest classifier that is fitted and scored in the cross-validation
# loop further down; random_state is fixed so repeated fits are deterministic.
rfc = RandomForestClassifier(
    n_estimators=46,
    criterion='gini',
    max_depth=30,
    min_samples_leaf=13,
    random_state=2,
    n_jobs=-1,
)

In [None]:
# Read 'scaled.csv' and separate the 'Arrhythmia' target column from the
# remaining (feature) columns.
df = pd.read_csv('scaled.csv')
y = df['Arrhythmia']
x = df.drop(columns=['Arrhythmia'])

In [None]:
def amend_position(position, lower, upper):
    """Repair a candidate position into an integer selection mask.

    The position is clipped element-wise to ``[lower, upper]`` and then
    truncated to integers. If the resulting mask is all zeros, one randomly
    chosen entry is set to 1 so the mask never selects nothing.

    Args:
        position: array-like candidate position.
        lower: lower bound(s) passed to ``np.clip``.
        upper: upper bound(s) passed to ``np.clip``.

    Returns:
        numpy integer array of the same length as ``position``.
    """
    mask = np.clip(position, lower, upper).astype(int)
    # An empty selection is not allowed: switch on a single random entry.
    if not mask.any():
        mask[np.random.randint(0, len(mask))] = 1
    return mask

In [None]:
def fitness_function(solution):
    """Objective function handed to the optimiser as ``fit_func``.

    Builds an ``Evaluator`` from the notebook-level train/test split, the
    candidate ``solution`` and the settings in ``Config``, then returns the
    values of the metrics it reports.

    Args:
        solution: candidate position produced by the optimiser
            (presumably the feature mask from ``amend_position`` - confirm
            against ``Evaluator``).

    Returns:
        list of the values of the mapping returned by
        ``Evaluator.get_metrics()``, in that mapping's order.
    """
    scores = Evaluator(
        x_train.values,
        x_test.values,
        y_train,
        y_test,
        solution,
        Config.CLASSIFIER,
        Config.DRAW_CONFUSION_MATRIX,
        Config.AVERAGE_METRIC,
    ).get_metrics()
    if Config.PRINT_ALL:
        print(scores)
    return [*scores.values()]

In [None]:
# One decision variable per feature column. Derived from the feature frame so
# the mask length always matches the data instead of relying on a hard-coded 43.
n_features = x.shape[1]
LOWER_BOUND = [0, ] * n_features
# The upper bound sits slightly above 1 because amend_position truncates with
# astype(int): only values >= 1 end up as a selected (1) feature.
UPPER_BOUND = [1.02, ] * n_features

# Problem definition consumed by the optimiser's solve() call.
problem_dict1 = {
    "fit_func": fitness_function,
    "lb": LOWER_BOUND,
    "ub": UPPER_BOUND,
    "minmax": Config.MIN_MAX_PROBLEM,
    "obj_weights": Config.OBJ_WEIGHTS,
    "amend_position": amend_position
}

In [None]:
# Stratified 70/30 hold-out split used by fitness_function. The seed is fixed
# so every run of the notebook evaluates candidate solutions on the same split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=True, stratify=y, random_state=1,
)

In [None]:
# Run the CSA optimiser (100 epochs, population of 100, p_a=0.21) on the
# problem defined in problem_dict1 and keep the best position and its fitness.
md_csa = OriginalCSA(
    epoch=100,
    pop_size=100,
    p_a=0.21,
)
best_position, best_fitness = md_csa.solve(problem=problem_dict1)

In [None]:
def feature_name(features, names=None):
    """Translate a 0/1 selection mask into the names of the selected features.

    Args:
        features: sequence (list or numpy array) with one entry per feature;
            an entry equal to 1 marks the feature as selected.
        names: optional sequence of feature names aligned with ``features``.
            Defaults to the columns of the notebook-level feature frame ``x``
            (the data without the 'Arrhythmia' target), so the target column
            can never shift the alignment or end up in the selection.

    Returns:
        list of the names whose mask entry equals 1, in column order.

    Raises:
        ValueError: if ``features`` and ``names`` differ in length.
    """
    if names is None:
        names = x.columns
    names = list(names)
    if len(features) != len(names):
        raise ValueError(
            f"mask has {len(features)} entries but there are {len(names)} feature names"
        )
    return [name for name, flag in zip(names, features) if flag == 1]

In [None]:
# Display the feature names that the optimiser's best position marks as selected.
feature_name(best_position)

In [None]:
# Keep only the selected feature columns plus the target and write them to disk.
selected = [*feature_name(best_position), 'Arrhythmia']
df_new = df.loc[:, selected]
df_new.to_csv('pa21.csv', index=False)

In [None]:
# Chart helpers called by the plotting cells below; imported by name rather
# than with a star import so it is clear where they come from.
from mealpy.utils.visualize import export_convergence_chart, export_explore_exploit_chart

In [None]:
# Plot the optimiser's recorded global-best fitness history, one point per epoch.
export_convergence_chart(
    md_csa.history.list_global_best_fit,
    title='Global Best Fitness',
    x_label='epoch',
    y_label="Accuracy",
)

In [None]:
# Plot the optimiser's recorded current-best fitness history, one point per epoch.
export_convergence_chart(
    md_csa.history.list_current_best_fit,
    title='Local Best Fitness',
    x_label='epoch',
    y_label="Accuracy",
)

In [None]:
# Plot the recorded runtime of each epoch.
export_convergence_chart(
    md_csa.history.list_epoch_time,
    title='Runtime',
    x_label='epoch',
    y_label="Second",
)

In [None]:
# The exploration/exploitation chart should be drawn for a single algorithm
# and a single fitness function.
exploration_history = md_csa.history.list_exploration
exploitation_history = md_csa.history.list_exploitation
export_explore_exploit_chart([exploration_history, exploitation_history], x_label='epoch')

In [None]:
# Load the encoded dataset and keep only the selected features for the final
# evaluation. The arrays are built from df_fs (the frame loaded here); the
# original read them from the earlier `df`, leaving df_fs unused.
df_fs = pd.read_csv('encoded_data.csv')
x_fs = np.array(df_fs[['BBB', 'PR', 'RARL']])
y_fs = np.array(df_fs['Arrhythmia'])
# Preview the first rows instead of dumping the whole array.
x_fs[:5]

In [None]:
# 10-fold stratified cross-validation of `rfc` on the selected features.
strat_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

# Per-fold results; every list receives exactly one entry per fold.
test_acc = []
train_acc = []
fit_time = []    # milliseconds
score_time = []  # milliseconds
p_score = []
r_score = []
f_score = []
conf_mat = []
roc_score = []

# Apply the plot style once, before any figure is created, so the ROC chart of
# every fold is rendered with the same style.
plt.style.use('fivethirtyeight')

for fold_idx, (train_index, test_index) in enumerate(strat_kfold.split(x_fs, y_fs)):
    x_train_fold, x_test_fold = x_fs[train_index], x_fs[test_index]
    y_train_fold, y_test_fold = y_fs[train_index], y_fs[test_index]

    # Fit time in ms; perf_counter is the monotonic, high-resolution timer.
    start_fit = time.perf_counter()
    rfc.fit(x_train_fold, y_train_fold)
    fit_time.append((time.perf_counter() - start_fit) * 10**3)

    # Prediction (scoring) time in ms.
    start_score = time.perf_counter()
    y_predict = rfc.predict(x_test_fold)
    score_time.append((time.perf_counter() - start_score) * 10**3)

    # Label-based metrics on the held-out fold.
    test_acc.append(accuracy_score(y_test_fold, y_predict))
    f_score.append(f1_score(y_test_fold, y_predict, average='weighted'))
    p_score.append(precision_score(y_test_fold, y_predict, average='weighted'))
    r_score.append(recall_score(y_test_fold, y_predict, average='weighted'))
    conf_mat.append(confusion_matrix(y_test_fold, y_predict))

    # ROC/AUC need continuous scores, not hard labels: use the predicted
    # probability of the positive class (second column = rfc.classes_[1]).
    # NOTE(review): assumes a binary target, as the original roc_curve call did.
    y_score = rfc.predict_proba(x_test_fold)[:, 1]
    auc = roc_auc_score(y_test_fold, y_score)
    roc_score.append(auc)
    fpr, tpr, _ = roc_curve(y_test_fold, y_score)

    # One fresh figure per fold, closed after saving so figures do not pile up.
    fig, ax = plt.subplots(figsize=(11, 5))
    ax.plot(fpr, tpr, label="AUC="+str(auc), color='teal')
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    ax.legend()
    fig.tight_layout()
    fig.savefig(f'FS results/Statistical Results/roc_curve{fold_idx}')
    plt.close(fig)

    # Training accuracy on the same fold (the original appended the test
    # accuracy here by mistake).
    y_train_predict = rfc.predict(x_train_fold)
    train_acc.append(accuracy_score(y_train_fold, y_train_predict))

print(np.array(test_acc).mean())

In [None]:
# Average fit time across the folds, in milliseconds.
print(np.mean(fit_time))

In [None]:
# Per-fold fit durations in milliseconds, one entry per cross-validation fold.
fit_time

In [None]:
# Per-fold accuracy on the held-out test fold.
test_acc

In [None]:
# Collect the per-fold results into one table (one row per fold) and save it.
scorings = {
    'test_acc': test_acc,
    'train_acc': train_acc,
    'fit_time': fit_time,
    'score_time': score_time,
    'f_score': f_score,
    'p_score': p_score,
    'r_score': r_score,
    'conf_mat': conf_mat,
    'roc_score': roc_score,
}
df_scores = pd.DataFrame(data=scorings)
df_scores.to_csv('FS results/statistical_results.csv', index=False)

In [None]:
# Mean test accuracy over all folds.
df_scores.loc[:, 'test_acc'].mean()