# In [None]:

import math
import os
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


def calculate_entropy_features(file_bytes, alpha=2):
    """
    Calculate Shannon and Rényi entropy (in bits) from byte data.

    Args:
        file_bytes: Iterable of byte values, typically a ``bytes`` or
            ``bytearray`` object.
        alpha: Order of the Rényi entropy; must be non-negative. Defaults to
            2 (collision entropy). ``alpha == 1`` is the limiting case in
            which Rényi entropy equals Shannon entropy, so the Shannon value
            is returned for it.

    Returns:
        Tuple ``(shannon_entropy, renyi_entropy)`` of non-negative floats.
        Empty input yields ``(0.0, 0.0)``.

    Raises:
        ValueError: If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    byte_counts = Counter(file_bytes)
    total_bytes = sum(byte_counts.values())
    if total_bytes == 0:
        # No data means no distribution; log2(0) would raise otherwise.
        return 0.0, 0.0

    # Iterate in byte-value order so floating-point summation is reproducible
    # regardless of the order in which values first appear in the input.
    probabilities = [count / total_bytes for _, count in sorted(byte_counts.items())]

    # Shannon Entropy (F1); max() also normalizes -0.0 to 0.0.
    shannon_entropy = max(0.0, -sum(p * math.log2(p) for p in probabilities))

    # Rényi Entropy (F2) of order alpha
    if alpha == 1:
        renyi_entropy = shannon_entropy
    else:
        power_sum = sum(p ** alpha for p in probabilities)
        renyi_entropy = max(0.0, math.log2(power_sum) / (1 - alpha))

    return shannon_entropy, renyi_entropy


def preprocess_data(df):
    """
    Preprocess dataset by removing entropy outliers (IQR method) and normalizing features.

    Rows whose 'shannon' or 'renyi' value lies outside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are dropped, one column after the other
    (the 'renyi' bounds are computed on the rows that survived the 'shannon'
    filter). Both columns are then min-max scaled to [0, 1].

    Args:
        df: DataFrame containing at least the 'shannon' and 'renyi' columns.
            It is not modified.

    Returns:
        A new DataFrame with outlier rows removed and both entropy columns
        scaled. A column whose remaining values are all equal is mapped to
        0.0 instead of NaN.
    """
    feature_cols = ['shannon', 'renyi']

    for col in feature_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        df = df[(df[col] >= q1 - 1.5 * iqr) & (df[col] <= q3 + 1.5 * iqr)]

    # Work on an explicit copy so the column assignment below neither touches
    # the caller's frame nor triggers pandas' SettingWithCopyWarning.
    df = df.copy()

    col_min = df[feature_cols].min()
    col_range = df[feature_cols].max() - col_min
    # A constant column has zero range; dividing by 1 instead maps it to 0.0
    # rather than producing 0/0 = NaN, which downstream estimators reject.
    col_range = col_range.where(col_range != 0, 1.0)

    df[feature_cols] = (df[feature_cols] - col_min) / col_range
    return df


def evaluate_models(df):
    """
    Evaluate ML models using Shannon and Rényi entropy with 5-fold stratified CV.

    Each classifier (Gaussian Naive Bayes, Random Forest, Decision Tree) is
    trained on a single entropy feature at a time ('shannon' as F1, 'renyi'
    as F2). Confusion-matrix counts are summed over the five test folds.

    Args:
        df: DataFrame with 'shannon', 'renyi' and 'label' columns. The
            confusion matrix is computed with labels=[0, 1], so labels are
            expected to be encoded as 0 and 1, with 1 as the positive class.

    Returns:
        Tuple ``(df_confusion, df_passing)``:
        - df_confusion: one row per feature/model pair with columns
          'Entropy', 'Model', 'TP', 'TN', 'FP', 'FN'.
        - df_passing: one row per feature/model pair with total decisions
          ('TD'), correct decisions ('TA') and 'Passing Rate', i.e.
          TA / TD as a percentage string with two decimals.
    """
    results_confusion = []
    results_passing = []

    y = df['label'].values
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    models = {
        'NB': GaussianNB(),
        'RF': RandomForestClassifier(n_estimators=100, random_state=42),
        'DT': DecisionTreeClassifier(random_state=42)
    }

    for entropy_name, feature in [('F1', 'shannon'), ('F2', 'renyi')]:
        X = df[[feature]].values

        for model_name, model in models.items():
            fold_matrices = []

            for train_idx, test_idx in skf.split(X, y):
                # The same estimator instance is refit on every fold.
                model.fit(X[train_idx], y[train_idx])
                y_pred = model.predict(X[test_idx])

                # labels=[0, 1] pins the result to a 2x2 matrix even when a
                # fold's predictions contain a single class, so no shape
                # check is needed.
                fold_matrices.append(
                    confusion_matrix(y[test_idx], y_pred, labels=[0, 1])
                )

            # Element-wise sum over folds; ravel order is TN, FP, FN, TP.
            tn_total, fp_total, fn_total, tp_total = np.sum(fold_matrices, axis=0).ravel()

            results_confusion.append({
                'Entropy': entropy_name,
                'Model': model_name,
                'TP': tp_total,
                'TN': tn_total,
                'FP': fp_total,
                'FN': fn_total
            })

            total_decisions = tp_total + tn_total + fp_total + fn_total
            true_accepts = tp_total + tn_total
            passing_rate = (true_accepts / total_decisions) * 100

            results_passing.append({
                'Feature + Model Configuration': f"{entropy_name} + {model_name}",
                'TD': total_decisions,
                'TA': true_accepts,
                'Passing Rate': f"{passing_rate:.2f}%"
            })

    df_confusion = pd.DataFrame(results_confusion)
    df_passing = pd.DataFrame(results_passing)
    return df_confusion, df_passing

# ------------------------ Main Execution ------------------------
if __name__ == "__main__":
    # Load the dataset (it must provide the shannon, renyi and label columns)
    # and clean/scale the entropy features.
    df = pd.read_csv("your_entropy_dataset.csv")  # Replace with real file path
    df = preprocess_data(df)

    # Cross-validate every feature/model combination.
    confusion_df, passing_df = evaluate_models(df)

    # Each report: console heading, result table, CSV destination.
    reports = (
        ("=== Confusion Matrix Results ===\n", confusion_df, "confusion_matrix_results.csv"),
        ("\n=== Passing Rate Analysis ===\n", passing_df, "passing_rate_results.csv"),
    )

    # Print all tables first ...
    for heading, table, _ in reports:
        print(heading)
        print(table.to_string(index=False))

    # ... then (optionally) persist them as CSV for manuscript integration.
    for _, table, out_path in reports:
        table.to_csv(out_path, index=False)

    # Note: Our test suite generation for edge-case files was performed by mapping
    # actual predictions to expected outcomes using the confusion matrix based on our methodology.

    # Note: This code represents experiments for document-based files.
    # Identical methodology was applied independently for image-based, compressed,
    # and other file formats using their respective entropy features and labels.
