# BioHEL vs HEROS Comparison

In [None]:
# imports
import pandas as pd
import numpy as np
import os
import pickle
import time
import subprocess
import re
import json
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows

# load HEROS
# Try the in-repo layout (src/skheros) first, then fall back to an importable
# `skheros` package.
try:
    from src.skheros.heros import HEROS
    print("HEROS loaded")
except ImportError:
    try:
        from skheros.heros import HEROS
        print("HEROS loaded")
    except ImportError:
        # NOTE(review): execution continues with the name HEROS unbound, so the
        # first call to train_heros() will fail with NameError rather than here.
        print("HEROS not found")

# Everything this notebook generates (ARFF files, configs, pickles, plots,
# spreadsheets) is written under ./biohel_data relative to the current working
# directory; the directory is created on first run.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'biohel_data'
DATA_DIR.mkdir(exist_ok=True)

In [None]:
# dataset paths
# One entry per benchmark dataset, keyed by the short name used for output
# directories, result dictionaries and plot labels.
#   train_path / test_path : tab-separated files read by load_dataset()
#                            (paths are relative to the working directory)
#   excluded_columns       : columns that are NOT used as features
#   description            : human-readable label
#   category               : dataset family, copied into the comparison table
#   features               : declared feature count, copied into the comparison
#                            table; it is not checked against the loaded data
# NOTE(review): GAM_A and GAM_C exclude only 'Class' while GAM_E also excludes
# 'Model' and 'InstanceID'. If the A/C files carry those columns too they would
# be treated as features by load_dataset() — confirm against the files.
DATASET_CONFIGS = {
    'MUX6': {
        'train_path': 'datasets/multiplexer/A_multiplexer_6_bit_500_inst_CV_Train_1.txt',
        'test_path': 'datasets/multiplexer/A_multiplexer_6_bit_500_inst_CV_Test_1.txt',
        'excluded_columns': ['Group', 'InstanceID', 'Class'],
        'description': '6-bit Multiplexer',
        'category': 'Multiplexer',
        'features': 6
    },
    'MUX11': {
        'train_path': 'datasets/multiplexer/B_multiplexer_11_bit_5000_inst_CV_Train_1.txt',
        'test_path': 'datasets/multiplexer/B_multiplexer_11_bit_5000_inst_CV_Test_1.txt',
        'excluded_columns': ['Group', 'InstanceID', 'Class'],
        'description': '11-bit Multiplexer',
        'category': 'Multiplexer',
        'features': 11
    },
    'MUX20': {
        'train_path': 'datasets/multiplexer/C_multiplexer_20_bit_10000_inst_CV_Train_1.txt',
        'test_path': 'datasets/multiplexer/C_multiplexer_20_bit_10000_inst_CV_Test_1.txt',
        'excluded_columns': ['Group', 'InstanceID', 'Class'],
        'description': '20-bit Multiplexer',
        'category': 'Multiplexer',
        'features': 20
    },
    'GAM_A': {
        'train_path': 'datasets/gametes/A_uni_4add_CV_Train_1.txt',
        'test_path': 'datasets/gametes/A_uni_4add_CV_Test_1.txt',
        'excluded_columns': ['Class'],
        'description': 'GAMETES 4 Additive Univariate',
        'category': 'GAMETES',
        'features': 100
    },
    'GAM_C': {
        'train_path': 'datasets/gametes/C_2way_epistasis_CV_Train_1.txt',
        'test_path': 'datasets/gametes/C_2way_epistasis_CV_Test_1.txt',
        'excluded_columns': ['Class'],
        'description': 'GAMETES 2-way Epistasis',
        'category': 'GAMETES',
        'features': 100
    },
    'GAM_E': {
        'train_path': 'datasets/gametes/E_uni_4het_CV_Train_1.txt',
        'test_path': 'datasets/gametes/E_uni_4het_CV_Test_1.txt',
        'excluded_columns': ['Model', 'InstanceID', 'Class'],
        'description': 'GAMETES 4 Heterogeneous Univariate',
        'category': 'GAMETES',
        'features': 100
    }
}

# Dataset names in declaration order; used wherever a stable ordering is needed.
DATASETS = list(DATASET_CONFIGS.keys())

In [None]:
# helper functions

def load_dataset(config):
    """Read one train/test pair of tab-separated files and split it into arrays.

    Args:
        config: mapping with 'train_path', 'test_path' and 'excluded_columns'.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, row_id, feature_names,
        train_df, test_df)``. Features are every training column not listed in
        ``excluded_columns``; the target is the 'Class' column. ``row_id`` is the
        training 'InstanceID' column when present, otherwise 0..n-1.
    """
    train_df, test_df = (
        pd.read_csv(config[path_key], sep="\t")
        for path_key in ('train_path', 'test_path')
    )

    # The feature list is derived from the training file only and then applied
    # to both splits, so the test file must contain the same columns.
    skipped = config['excluded_columns']
    feature_names = [name for name in train_df.columns if name not in skipped]

    X_train, y_train = train_df[feature_names].values, train_df['Class'].values
    X_test, y_test = test_df[feature_names].values, test_df['Class'].values

    if 'InstanceID' in train_df.columns:
        row_id = train_df['InstanceID'].values
    else:
        row_id = np.arange(len(train_df))

    return X_train, y_train, X_test, y_test, row_id, feature_names, train_df, test_df


def convert_to_arff(df, feature_names, filename, relation_name):
    """Write ``df`` as an ARFF file whose attributes are all integer nominals.

    Args:
        df: DataFrame holding the feature columns and a 'Class' column; every
            value must be convertible to ``int``.
        feature_names: feature columns to export, in output order.
        filename: destination path (str or Path); overwritten if it exists.
        relation_name: text placed after ``@RELATION``.

    Raises:
        ValueError: if a value cannot be converted to an integer.

    Each attribute's nominal domain is the sorted set of values found in
    ``df`` itself.
    NOTE(review): train and test files written by separate calls can therefore
    declare different domains for the same attribute — confirm that the
    consumer tolerates this.
    """
    def nominal_domain(column):
        # e.g. values {1.0, 0.0} -> "0,1"
        return ','.join(str(int(v)) for v in sorted(df[column].unique()))

    header = [f"@RELATION {relation_name}\n\n"]
    header.extend(
        f"@ATTRIBUTE {feat} {{{nominal_domain(feat)}}}\n" for feat in feature_names
    )
    header.append(f"@ATTRIBUTE Class {{{nominal_domain('Class')}}}\n\n")
    header.append("@DATA\n")

    # One vectorised cast instead of DataFrame.iterrows() plus a per-cell int().
    # The output is identical, but large tables are no longer walked row by row.
    columns = [*feature_names, 'Class']
    rows = df[columns].astype(int).to_numpy().tolist()

    # The file is opened only after all content has been built, so a conversion
    # error no longer leaves a truncated ARFF file behind.
    with open(filename, 'w') as f:
        f.writelines(header)
        f.writelines(','.join(map(str, row)) + '\n' for row in rows)


def create_biohel_config(output_path):
    """Write the fixed BioHEL settings used by this notebook to ``output_path``.

    The file holds one setting per line and ends with a newline; an existing
    file at that path is overwritten.
    """
    settings = (
        # evolutionary search
        "crossover operator 1px",
        "default class major",
        "fitness function mdl",
        "initialization min classifiers 20",
        "initialization max classifiers 20",
        "iterations 50",
        "mdl initial tl ratio 0.25",
        "mdl iteration 10",
        "mdl weight relax factor 0.90",
        "pop size 500",
        "prob crossover 0.6",
        "prob individual mutation 0.6",
        "prob one 0.75",
        "selection algorithm tournamentwor",
        "tournament size 4",
        "windowing ilas 1",
        "dump evolution stats",
        "smart init",
        "class wise init",
        # rule learning / coverage
        "coverage breakpoint 0.01",
        "repetitions of rule learning 2",
        "coverage ratio 0.90",
        # knowledge representation
        "kr hyperrect",
        "num expressed attributes init 15",
        "hyperrectangle uses list of attributes",
        "prob generalize list 0.10",
        "prob specialize list 0.10",
        "expected number of attributes 10",
        "random seed 42",
    )
    config_text = "\n".join(settings) + "\n"
    with open(output_path, 'w') as handle:
        handle.write(config_text)

In [None]:
# HEROS training

def train_heros(X_train, y_train, row_id, cat_feat_indexes):
    """Fit a HEROS classifier with this notebook's fixed hyperparameters.

    Args:
        X_train: training feature matrix, passed to ``HEROS.fit``.
        y_train: training labels, passed to ``HEROS.fit``.
        row_id: per-row identifiers, passed to ``HEROS.fit``.
        cat_feat_indexes: indexes of the columns HEROS should treat as
            categorical.

    Returns:
        ``(heros, training_time)``: the fitted estimator and the seconds spent
        inside ``fit``.
    """
    heros = HEROS(
        outcome_type='class',
        iterations=50000,
        pop_size=500,
        model_iterations=100,
        model_pop_size=100,
        nu=1,
        beta=0.2,
        theta_sel=0.5,
        cross_prob=0.8,
        mut_prob=0.04,
        merge_prob=0.1,
        subsumption='both',
        compaction='sub',
        random_state=42,
        track_performance=1000,
        model_tracking=True,
        verbose=True
    )
    # perf_counter() is monotonic and high-resolution, so the measured duration
    # cannot be skewed (or go negative) if the system clock is adjusted while
    # a long training run is in progress, unlike time.time().
    start_time = time.perf_counter()
    heros.fit(X_train, y_train, row_id, cat_feat_indexes=cat_feat_indexes)
    training_time = time.perf_counter() - start_time
    return heros, training_time


def evaluate_heros(heros, X_test, y_test, X_train, y_train):
    """Score the auto-selected HEROS model on the test and training data.

    Args:
        heros: fitted HEROS estimator.
        X_test, y_test: held-out features and labels.
        X_train, y_train: training features and labels.

    Returns:
        Dict with 'test_accuracy', 'train_accuracy' (fraction of exact label
        matches), 'num_rules' (size of the selected model's rule set) and
        'best_model_idx'.

    NOTE(review): the model is selected with ``X_test``/``y_test``, the same
    data 'test_accuracy' is computed on, so the reported test accuracy is not
    independent of model selection.
    """
    best_model_idx = heros.auto_select_top_model(X_test, y_test, verbose=False)
    predict_options = {'whole_rule_pop': False, 'target_model': best_model_idx}

    test_hits = heros.predict(X_test, **predict_options) == y_test
    train_hits = heros.predict(X_train, **predict_options) == y_train
    selected_rules = heros.get_model_rules(best_model_idx)

    summary = {}
    summary['test_accuracy'] = np.mean(test_hits)
    summary['train_accuracy'] = np.mean(train_hits)
    summary['num_rules'] = len(selected_rules)
    summary['best_model_idx'] = best_model_idx
    return summary

In [None]:
# run HEROS on all datasets
# For each configured dataset: load it, write the BioHEL input files next to
# it, train and evaluate HEROS, then pickle the fitted model.
heros_results = {}

for dataset_name, config in DATASET_CONFIGS.items():
    print(f"\nProcessing {dataset_name}")

    # per-dataset output directory under DATA_DIR
    dataset_dir = DATA_DIR / dataset_name
    dataset_dir.mkdir(exist_ok=True)

    (X_train, y_train, X_test, y_test,
     row_id, feature_names, train_df, test_df) = load_dataset(config)
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")

    # BioHEL inputs are produced here so the later BioHEL cells only need the
    # dataset directory.
    train_arff = dataset_dir / 'train.arff'
    test_arff = dataset_dir / 'test.arff'
    convert_to_arff(train_df, feature_names, train_arff, f"{dataset_name}_Train")
    convert_to_arff(test_df, feature_names, test_arff, f"{dataset_name}_Test")
    create_biohel_config(dataset_dir / 'config.conf')

    # every feature column is declared categorical
    cat_feat_indexes = list(range(X_train.shape[1]))
    heros_model, training_time = train_heros(X_train, y_train, row_id, cat_feat_indexes)

    # evaluation metrics plus the measured fit time
    metrics = {
        **evaluate_heros(heros_model, X_test, y_test, X_train, y_train),
        'training_time': training_time,
    }
    heros_results[dataset_name] = metrics

    # persist the fitted model
    with (dataset_dir / 'heros_model.pickle').open('wb') as f:
        pickle.dump(heros_model, f)

    print(f"Accuracy: {metrics['test_accuracy']:.4f}, Rules: {metrics['num_rules']}, Time: {training_time:.1f}s")

print("\nHEROS done")

In [None]:
# biohel docker wrapper

class BioHELDocker:
    """Drive a BioHEL Docker image through the ``docker`` command-line client."""

    def __init__(self, base_dir, data_dir):
        """Remember the Docker build context and the per-dataset data root."""
        self.base_dir = Path(base_dir)
        self.data_dir = Path(data_dir)
        self.docker_image = 'biohel'

    def check_docker(self):
        """Return True when ``docker info`` runs and exits with status 0."""
        try:
            probe = subprocess.run(['docker', 'info'], capture_output=True, text=True)
        except FileNotFoundError:
            # the docker executable is not on PATH
            return False
        return probe.returncode == 0

    def build_image(self):
        """Build the image from ``base_dir``; return True on success."""
        print("Building docker image...")
        build = subprocess.run(
            ['docker', 'build', '--platform', 'linux/amd64',
             '-t', self.docker_image, str(self.base_dir)],
            capture_output=True,
            text=True,
        )
        if build.returncode == 0:
            print("Image built")
            return True
        print(f"Build failed: {build.stderr}")
        return False

    def image_exists(self):
        """Return True when ``docker images -q`` lists an id for the image."""
        listing = subprocess.run(
            ['docker', 'images', '-q', self.docker_image],
            capture_output=True,
            text=True,
        )
        return listing.stdout.strip() != ''

    def run_biohel(self, dataset_name):
        """Run BioHEL on one dataset directory and return the parsed results.

        Returns None (after printing the reason) when a required input file is
        missing or the container exits with a non-zero status. On success the
        raw stdout is saved as ``biohel_output.txt`` in the dataset directory.
        """
        dataset_dir = self.data_dir / dataset_name

        required = ('config.conf', 'train.arff', 'test.arff')
        missing = next(
            (name for name in required if not (dataset_dir / name).exists()),
            None,
        )
        if missing is not None:
            print(f"Missing {missing}")
            return None

        print(f"Running BioHEL on {dataset_name}...")
        started = time.time()

        # The dataset directory is mounted at /data inside the container.
        docker_part = [
            'docker', 'run', '--platform', 'linux/amd64', '--rm',
            '-v', f'{dataset_dir.absolute()}:/data',
            self.docker_image,
        ]
        biohel_part = ['./biohel', '/data/config.conf', '/data/train.arff', '/data/test.arff']
        completed = subprocess.run(docker_part + biohel_part, capture_output=True, text=True)
        elapsed = time.time() - started

        if completed.returncode != 0:
            print(f"Failed: {completed.stderr}")
            return None

        with open(dataset_dir / 'biohel_output.txt', 'w') as f:
            f.write(completed.stdout)

        return self.parse_output(completed.stdout, elapsed)

    def parse_output(self, output, wall_time):
        """Extract accuracies, runtime and the rule list from BioHEL stdout.

        A metric is read only from lines containing its exact marker text; when
        a marker occurs several times the last occurrence wins. Rules are the
        non-blank lines after the first line starting with 'Phenotype:', up to
        the first blank line or line starting with 'Train'.
        """
        lines = output.split('\n')
        results = {'wall_time': wall_time, 'test_accuracy': None, 'train_accuracy': None,
                   'num_rules': 0, 'runtime': None, 'rules': []}

        # (result key, literal marker that must be present, extraction pattern)
        metric_specs = (
            ('train_accuracy', 'Train accuracy :', r'Train accuracy\s*:\s*([\d.]+)'),
            ('test_accuracy', 'Test accuracy :', r'Test accuracy\s*:\s*([\d.]+)'),
            ('runtime', 'Total time:', r'Total time:\s*([\d.]+)'),
        )
        for line in lines:
            for key, marker, pattern in metric_specs:
                if marker not in line:
                    continue
                found = re.search(pattern, line)
                if found:
                    results[key] = float(found.group(1))

        # count rules
        remaining = iter(lines)
        for line in remaining:
            if line.startswith('Phenotype:'):
                break
        for line in remaining:
            if line.startswith('Phenotype:'):
                # a repeated header inside the listing is skipped, not counted
                continue
            if not line.strip() or line.startswith('Train'):
                break
            results['rules'].append(line.strip())
            results['num_rules'] += 1

        return results

In [None]:
# setup biohel
# Create the Docker wrapper and make sure the image is available; the image is
# built only when the daemon is reachable and no image is listed yet.
biohel_runner = BioHELDocker(BASE_DIR, DATA_DIR)

if biohel_runner.check_docker():
    print("Docker running")
    if biohel_runner.image_exists():
        print("Image exists")
    else:
        biohel_runner.build_image()
else:
    print("Docker not running")

In [None]:
# run biohel on all datasets
# Results are stored per dataset in biohel_results; datasets whose run failed
# (missing inputs or non-zero container exit) are skipped.
biohel_results = {}

for dataset_name in DATASETS:
    print(f"\n{dataset_name}")
    result = biohel_runner.run_biohel(dataset_name)

    if not result:
        print("Failed")
        continue

    biohel_results[dataset_name] = result

    # parse_output() leaves 'test_accuracy' / 'runtime' as None when the
    # expected line is absent from BioHEL's output. Formatting None with a
    # float spec raises TypeError, so render unparsed values as "N/A" instead
    # of aborting the whole loop.
    test_accuracy = result['test_accuracy']
    runtime = result['runtime']
    acc_text = f"{test_accuracy:.4f}" if test_accuracy is not None else "N/A"
    time_text = f"{runtime:.1f}s" if runtime is not None else "N/A"
    print(f"Acc: {acc_text}, Rules: {result['num_rules']}, Time: {time_text}")

print("\nBioHEL done")

In [None]:
# build comparison dataframe
# Long format: one row per (dataset, algorithm) that produced a result.


def _comparison_row(dataset_name, config, algorithm, test_acc, train_acc, num_rules, seconds):
    # Column order here fixes the column order of comparison_df.
    return {
        'Dataset': dataset_name,
        'Category': config['category'],
        'Features': config['features'],
        'Algorithm': algorithm,
        'Test Accuracy': test_acc,
        'Train Accuracy': train_acc,
        'Num Rules': num_rules,
        'Training Time (s)': seconds,
    }


comparison_rows = []

for dataset_name, config in DATASET_CONFIGS.items():
    if dataset_name in heros_results:
        h = heros_results[dataset_name]
        comparison_rows.append(_comparison_row(
            dataset_name, config, 'HEROS',
            h['test_accuracy'], h.get('train_accuracy'), h['num_rules'], h['training_time'],
        ))

    if dataset_name in biohel_results:
        b = biohel_results[dataset_name]
        # BioHEL's time is the 'runtime' value parsed from its own output,
        # not the wall time measured around the container run.
        comparison_rows.append(_comparison_row(
            dataset_name, config, 'BioHEL',
            b['test_accuracy'], b['train_accuracy'], b['num_rules'], b['runtime'],
        ))

comparison_df = pd.DataFrame(comparison_rows)
comparison_df

In [None]:
# summary table
# Fixed-width text table: one line per dataset, "N/A" where an algorithm has
# no row in comparison_df.


def _result_rows(dataset, algorithm):
    # rows of comparison_df for one dataset/algorithm pair (zero or one row)
    same_dataset = comparison_df['Dataset'] == dataset
    same_algorithm = comparison_df['Algorithm'] == algorithm
    return comparison_df[same_dataset & same_algorithm]


separator = "-" * 90
print("\nComparison Results")
print(separator)
print(f"{'Dataset':<10} {'Category':<12} {'HEROS Acc':<12} {'BioHEL Acc':<12} {'HEROS Rules':<12} {'BioHEL Rules':<12}")
print(separator)

for dataset_name in DATASETS:
    config = DATASET_CONFIGS[dataset_name]
    h_row = _result_rows(dataset_name, 'HEROS')
    b_row = _result_rows(dataset_name, 'BioHEL')

    h_acc = "N/A"
    if len(h_row):
        h_acc = f"{h_row['Test Accuracy'].values[0]:.1%}"
    b_acc = "N/A"
    if len(b_row):
        b_acc = f"{b_row['Test Accuracy'].values[0]:.1%}"
    h_rules = "N/A"
    if len(h_row):
        h_rules = int(h_row['Num Rules'].values[0])
    b_rules = "N/A"
    if len(b_row):
        b_rules = int(b_row['Num Rules'].values[0])

    print(f"{dataset_name:<10} {config['category']:<12} {h_acc:<12} {b_acc:<12} {h_rules:<12} {b_rules:<12}")

In [None]:
# plots
# Output directory and colour scheme shared by the comparison figures.
plot_dir = DATA_DIR / 'comparison_plots'
plot_dir.mkdir(exist_ok=True)

colors = {'HEROS': '#4472C4', 'BioHEL': '#ED7D31'}

# accuracy plot
# Grouped bars, one group per dataset; a missing result is drawn as a 0 bar.
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(DATASETS))
width = 0.35

for i, algo in enumerate(['HEROS', 'BioHEL']):
    accs = []
    for d in DATASETS:
        selected = comparison_df[(comparison_df['Dataset'] == d) & (comparison_df['Algorithm'] == algo)]
        if len(selected):
            # stored as a fraction; plotted as a percentage
            accs.append(selected['Test Accuracy'].values[0] * 100)
        else:
            accs.append(0)
    # HEROS bars sit left of the tick, BioHEL bars right of it
    ax.bar(x + width * (i - 0.5), accs, width, label=algo, color=colors[algo])

ax.set_xlabel('Dataset')
ax.set_ylabel('Test Accuracy (%)')
ax.set_title('HEROS vs BioHEL Accuracy')
ax.set_xticks(x)
ax.set_xticklabels(DATASETS)
ax.legend()
ax.set_ylim(0, 110)
fig.tight_layout()
fig.savefig(plot_dir / 'accuracy_comparison.png', dpi=300)
plt.show()

In [None]:
# rules plot
# Same layout as the accuracy figure; reuses x, width, colors and plot_dir
# defined by the accuracy-plot cell, so that cell must have run first.
fig, ax = plt.subplots(figsize=(12, 6))

for i, algo in enumerate(['HEROS', 'BioHEL']):
    rules = []
    for d in DATASETS:
        selected = comparison_df[(comparison_df['Dataset'] == d) & (comparison_df['Algorithm'] == algo)]
        if len(selected):
            rules.append(selected['Num Rules'].values[0])
        else:
            # a missing result is drawn as a 0 bar
            rules.append(0)
    ax.bar(x + width * (i - 0.5), rules, width, label=algo, color=colors[algo])

ax.set_xlabel('Dataset')
ax.set_ylabel('Number of Rules')
ax.set_title('HEROS vs BioHEL Rules')
ax.set_xticks(x)
ax.set_xticklabels(DATASETS)
ax.legend()
fig.tight_layout()
fig.savefig(plot_dir / 'rules_comparison.png', dpi=300)
plt.show()

In [None]:
# save to excel
# Workbook layout:
#   'Comparison'        - comparison_df as-is (one row per dataset/algorithm)
#   'SideBySide'        - the same metrics pivoted so the algorithms are columns
#   '<dataset>_BioHEL'  - the rule lines parsed from BioHEL's output, one sheet
#                         per dataset that has any
excel_path = DATA_DIR / 'BIOHEL_HEROS_Comparison.xlsx'

with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    comparison_df.to_excel(writer, sheet_name='Comparison', index=False)
    
    # Each (dataset, algorithm) pair contributes one row, so 'first' simply
    # picks that row's value for each metric.
    pivot = comparison_df.pivot_table(
        index=['Dataset', 'Category', 'Features'],
        columns='Algorithm',
        values=['Test Accuracy', 'Num Rules', 'Training Time (s)'],
        aggfunc='first'
    )
    pivot.to_excel(writer, sheet_name='SideBySide')
    
    # Datasets whose BioHEL run produced no parsed rules get no sheet.
    for dataset_name, results in biohel_results.items():
        if results.get('rules'):
            rules_df = pd.DataFrame({'Rule': results['rules']})
            rules_df.to_excel(writer, sheet_name=f'{dataset_name}_BioHEL', index=False)

print(f"Saved: {excel_path}")

# csv
# Flat copy of the long-format comparison table.
csv_path = DATA_DIR / 'comparison.csv'
comparison_df.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")