In [None]:
pip install pandas openpyxl scikit-learn xgboost lightgbm shap matplotlib seaborn scikit-posthocs joblib

In [4]:

# ==============================================================================
# کد 1: اسکریپت اصلی تحلیل دیتاست پسته
# ==============================================================================

# Step 0: Setup and Initial Loading
# ==============================================================================

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import joblib
import shap
from scipy.stats import shapiro, f_oneway, kruskal
import scikit_posthocs as sp

# Machine Learning models and metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    cohen_kappa_score, matthews_corrcoef, confusion_matrix,
    roc_curve, auc, roc_auc_score,
    precision_recall_curve, average_precision_score
)
from sklearn.preprocessing import label_binarize

# --- Configuration ---
sns.set_theme(style="whitegrid")

# --- Define Paths ---
BASE_PATH = '/home/sajad/Sajad_test/Pistachio Dataset/'
DATA_PATH = os.path.join(BASE_PATH, 'dataset/Pistachio_28_Features_Dataset.xlsx')
EDA_PATH = os.path.join(BASE_PATH, 'EDA')
STATS_PATH = os.path.join(BASE_PATH, 'Statistical_Analysis')
LR_PATH = os.path.join(BASE_PATH, 'Logistic_Regression')
RF_PATH = os.path.join(BASE_PATH, 'Random_Forest')
XGB_PATH = os.path.join(BASE_PATH, 'XGBoost')
LGBM_PATH = os.path.join(BASE_PATH, 'LightGBM')
COMPARE_PATH = os.path.join(BASE_PATH, 'compairation')
DATA_RESULTS_PATH = os.path.join(BASE_PATH, 'data_results')

# Create directories
# SVM_PATH has been removed
paths = [EDA_PATH, STATS_PATH, LR_PATH, RF_PATH, XGB_PATH, LGBM_PATH, COMPARE_PATH, DATA_RESULTS_PATH]
for path in paths:
    os.makedirs(path, exist_ok=True)
print("All directories are created or already exist.")

# --- Helper Function for Saving Plots ---
def save_plot(fig, path, filename, width_cm=13):
    """Saves a matplotlib figure in multiple formats with specified width."""
    # Convert width from cm to inches
    width_in = width_cm / 2.54
    # Adjust height to maintain aspect ratio
    fig.set_size_inches(width_in, fig.get_figheight() * (width_in / fig.get_figwidth()))
    # Save the figure
    fig.savefig(os.path.join(path, f"{filename}.svg"), format='svg', bbox_inches='tight')
    fig.savefig(os.path.join(path, f"{filename}.pdf"), format='pdf', bbox_inches='tight')
    plt.close(fig)
    print(f"Plot saved: {filename}.svg and {filename}.pdf in {path}")

# --- Load and Prepare Data ---
try:
    # Changed to read_excel for .xlsx files
    df = pd.read_excel(DATA_PATH)
except FileNotFoundError:
    print(f"Error: The file was not found at {DATA_PATH}")
    exit()

def clean_col_names(df):
    """Cleans column names to be Python-friendly."""
    cols = df.columns
    new_cols = []
    for col in cols:
        new_col = col.strip()
        new_col = re.sub(r'[\s\(\)-]+', '_', new_col) # Replace special characters with underscore
        new_col = new_col.lower().rstrip('_')
        new_cols.append(new_col)
    df.columns = new_cols
    return df

df = clean_col_names(df)
df.to_excel(os.path.join(DATA_RESULTS_PATH, 'source_data_for_eda_plots.xlsx'), index=False)
print(f"Source data for EDA saved to: {DATA_RESULTS_PATH}")
print("\nDataset loaded successfully. Column names cleaned.")
print(df.head())

# ==============================================================================
# Step 1: Exploratory Data Analysis (EDA)
# ==============================================================================
print("\n" + "="*60)
print("Step 1: Starting Exploratory Data Analysis (EDA)")
print("="*60)

# Descriptive statistics
desc_stats = df.groupby('class').describe().transpose()
desc_stats.to_excel(os.path.join(EDA_PATH, 'descriptive_statistics_by_class.xlsx'))
desc_stats.to_excel(os.path.join(DATA_RESULTS_PATH, 'eda_descriptive_statistics.xlsx'))
print("\nDescriptive statistics saved to EDA and data_results folders.")

# Identify features to plot
numeric_features = df.select_dtypes(include=np.number).columns.tolist()
target_col = 'class'
features_to_plot = [col for col in df.columns if col != target_col]

print("\nGenerating Box Plots...")
for feature in features_to_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=target_col, y=feature, data=df, ax=ax)
    ax.set_title(f'Distribution of {feature} by Class', fontsize=10, fontweight='bold')
    ax.set_xlabel('Class', fontsize=8)
    ax.set_ylabel(feature, fontsize=8)
    ax.tick_params(axis='x', rotation=0, labelsize=8)
    ax.tick_params(axis='y', labelsize=8)
    ax.grid(True, which='major', axis='y', linestyle='--', linewidth=0.5, alpha=0.7)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    save_plot(fig, EDA_PATH, f'boxplot_{feature}')

print("\nGenerating Correlation Matrix...")
correlation_matrix = df[features_to_plot].corr()
correlation_matrix.to_excel(os.path.join(DATA_RESULTS_PATH, 'eda_correlation_matrix_data.xlsx'))
print(f"Correlation matrix data saved to: {DATA_RESULTS_PATH}")

fig, ax = plt.subplots(figsize=(24, 20)) # Increased size for 28 features
sns.heatmap(correlation_matrix, annot=True, fmt='.1f', cmap='coolwarm', ax=ax, annot_kws={"size": 7})
ax.set_title('Correlation Matrix of Numeric Features', fontsize=12, fontweight='bold', pad=20)
ax.tick_params(axis='x', labelsize=8, labelrotation=90)
ax.tick_params(axis='y', labelsize=8, labelrotation=0)
save_plot(fig, EDA_PATH, 'correlation_heatmap', width_cm=30)

print("\nGenerating Distribution Plots...")
for feature in features_to_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(data=df, x=feature, hue=target_col, kde=True, element="step", ax=ax, palette='viridis')
    ax.set_title(f'Distribution of {feature} for each Class', fontsize=10, fontweight='bold')
    ax.set_xlabel(feature, fontsize=8)
    ax.set_ylabel('Count', fontsize=8)
    ax.tick_params(labelsize=8)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    legend = ax.get_legend()
    if legend:
        legend.set_title('Class')
        plt.setp(legend.get_texts(), fontsize='8')
        plt.setp(legend.get_title(), fontsize='8')
    save_plot(fig, EDA_PATH, f'distribution_{feature}')

print("\nEDA Step Completed.")

# ==============================================================================
# Step 2: Statistical Analysis
# ==============================================================================
print("\n" + "="*60)
print("Step 2: Starting Statistical Analysis")
print("="*60)

classes = df[target_col].unique()
statistical_results = []
for feature in features_to_plot:
    grouped_data = [df[df[target_col] == c][feature].dropna() for c in classes]
    
    # Shapiro-Wilk test for normality
    is_normal_all_groups = True
    for g in grouped_data:
        if len(g) > 3:
            sample_size = min(len(g), 4999) # Shapiro test limit
            _, p_val = shapiro(g.sample(sample_size, random_state=42))
            if p_val < 0.05:
                is_normal_all_groups = False
                break
        else:
            is_normal_all_groups = False
            break

    # Perform ANOVA or Kruskal-Wallis test
    if is_normal_all_groups:
        test_name = 'ANOVA'
        stat, p_value = f_oneway(*grouped_data)
    else:
        test_name = 'Kruskal-Wallis'
        stat, p_value = kruskal(*grouped_data)
        
    result = {'Feature': feature, 'Test Used': test_name, 'Statistic': stat, 'P-Value': p_value}
    statistical_results.append(result)
    
    # Post-hoc test if significant
    if p_value < 0.05 and len(classes) > 2: # Post-hoc is for more than 2 groups
        posthoc_df = sp.posthoc_dunn(df, val_col=feature, group_col=target_col, p_adjust='bonferroni')
        posthoc_df.to_excel(os.path.join(STATS_PATH, f'posthoc_dunn_{feature}.xlsx'))
        posthoc_df.to_excel(os.path.join(DATA_RESULTS_PATH, f'stats_posthoc_dunn_{feature}.xlsx'))

stats_df = pd.DataFrame(statistical_results)
stats_df.to_excel(os.path.join(STATS_PATH, 'main_statistical_test_results.xlsx'), index=False)
stats_df.to_excel(os.path.join(DATA_RESULTS_PATH, 'stats_main_test_results.xlsx'), index=False)
print("\nStatistical analysis completed. All result data saved to data_results folder.")

# ==============================================================================
# Step 3: Machine Learning Modeling
# ==============================================================================
print("\n" + "="*60)
print("Step 3: Starting Machine Learning Modeling")
print("="*60)

X = df[features_to_plot]
y_raw = df[target_col]

# Encode target labels
le = LabelEncoder()
y = le.fit_transform(y_raw)
n_classes = len(le.classes_)
np.savetxt(os.path.join(DATA_RESULTS_PATH, 'ml_label_encoder_classes.txt'), le.classes_, fmt='%s')
joblib.dump(le, os.path.join(DATA_RESULTS_PATH, 'ml_label_encoder.joblib'))
print(f"\nTarget classes encoded and encoder saved: {list(le.classes_)} -> {list(range(n_classes))}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, os.path.join(DATA_RESULTS_PATH, 'ml_standard_scaler.joblib'))

# Save unscaled test data for re-plotting
X_test_df = pd.DataFrame(X_test, columns=features_to_plot)
y_test_df = pd.DataFrame({'y_test_encoded': y_test, 'y_test_class_name': le.inverse_transform(y_test)}, index=X_test_df.index)
test_data_to_save = pd.concat([X_test_df, y_test_df], axis=1)
test_data_to_save.to_excel(os.path.join(DATA_RESULTS_PATH, 'ml_test_data_unscaled.xlsx'), index=False)
print("\nUnscaled test data and scaler saved for re-plotting purposes.")

# Define models (SVM removed)
models = {
    'Logistic_Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random_Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42)
}
model_paths = {'Logistic_Regression': LR_PATH, 'Random_Forest': RF_PATH, 'XGBoost': XGB_PATH, 'LightGBM': LGBM_PATH}
all_metrics = []

for name, model in models.items():
    print(f"\n--- Processing Model: {name} ---")
    output_path = model_paths[name]
    
    # Fit model and make predictions
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)
    
    # Calculate metrics for binary classification
    metrics = {
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='binary'),
        'Recall': recall_score(y_test, y_pred, average='binary'),
        'F1-score': f1_score(y_test, y_pred, average='binary'),
        'Cohen\'s Kappa': cohen_kappa_score(y_test, y_pred),
        'Matthews Corr Coef': matthews_corrcoef(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_pred_proba[:, 1]),
    }
    all_metrics.append(metrics)
    
    # Save metrics and model
    pd.DataFrame([metrics]).to_excel(os.path.join(output_path, f'{name}_evaluation_metrics.xlsx'), index=False)
    joblib.dump(model, os.path.join(output_path, f'{name}_model.joblib'))
    print(f"Evaluation metrics and model saved for {name}.")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
    cm_df.to_excel(os.path.join(DATA_RESULTS_PATH, f'ml_confusion_matrix_data_{name}.xlsx'))
    
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', ax=ax, annot_kws={"size": 10})
    ax.set_title(f'Confusion Matrix - {name}', fontsize=12, fontweight='bold', pad=20)
    ax.set_xlabel('Predicted Label', fontsize=10)
    ax.set_ylabel('True Label', fontsize=10)
    ax.tick_params(axis='x', labelsize=8)
    ax.tick_params(axis='y', labelsize=8, rotation=0)
    save_plot(fig, output_path, f'{name}_confusion_matrix', width_cm=15)
    
    # SHAP Analysis
    print(f"Performing SHAP analysis for {name}...")
    if name in ['Random_Forest', 'XGBoost', 'LightGBM']:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_scaled)
    else: # Logistic Regression
        explainer = shap.LinearExplainer(model, X_train_scaled)
        shap_values = explainer.shap_values(X_test_scaled)

    # For binary classification, shap_values can be a single array or a list of two.
    # We plot for the positive class (class 1).
    shap_values_positive_class = shap_values[1] if isinstance(shap_values, list) else shap_values

    X_test_df_for_shap = pd.DataFrame(X_test_scaled, columns=features_to_plot)

    shap.summary_plot(
        shap_values_positive_class, 
        X_test_df_for_shap, 
        show=False,
        plot_size=(15, 12)
    )
    fig = plt.gcf()
    fig.suptitle(f'SHAP Feature Importance for Class "{le.classes_[1]}" - {name}', fontsize=12, fontweight='bold', y=1.0)
    fig.tight_layout(pad=2.0)
    save_plot(fig, output_path, f'{name}_shap_summary_plot', width_cm=25)

print("\nMachine Learning modeling step completed.")

# ==============================================================================
# Step 4: Model Comparison
# ==============================================================================
print("\n" + "="*60)
print("Step 4: Starting Model Comparison")
print("="*60)

all_metrics_df = pd.DataFrame(all_metrics).set_index('Model')
all_metrics_df.to_excel(os.path.join(COMPARE_PATH, 'all_models_comparison_metrics.xlsx'))
all_metrics_df.to_excel(os.path.join(DATA_RESULTS_PATH, 'comparison_all_models_metrics.xlsx'))
print("Comparison metrics data saved.")
print(all_metrics_df)

# Bar chart for key metrics
metrics_to_compare = ['Accuracy', 'F1-score', 'ROC AUC']
fig, ax = plt.subplots(figsize=(10, 6))
all_metrics_df[metrics_to_compare].plot(kind='bar', ax=ax, rot=0, width=0.7)
ax.set_title('Comparison of Key Metrics Across Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=10)
ax.set_xlabel('Model', fontsize=10)
ax.set_ylim(0.8, 1.0)
ax.tick_params(axis='x', labelsize=9)
ax.tick_params(axis='y', labelsize=9)
ax.legend(title='Metric', fontsize=9, title_fontsize='9')
ax.grid(axis='y', linestyle='--', alpha=0.7)
save_plot(fig, COMPARE_PATH, 'models_metrics_comparison_bar_chart')

# --- Comparative ROC Curves ---
fig_roc, ax_roc = plt.subplots(figsize=(8, 8))
for name, model in models.items():
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    
    roc_data_df = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
    roc_data_df.to_excel(os.path.join(DATA_RESULTS_PATH, f'comparison_roc_curve_data_{name}.xlsx'), index=False)
    
    ax_roc.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

ax_roc.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance')
ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate', fontsize=10)
ax_roc.set_ylabel('True Positive Rate', fontsize=10)
ax_roc.set_title('Comparative ROC Curves', fontsize=12, fontweight='bold')
ax_roc.legend(loc="lower right", fontsize=9)
ax_roc.tick_params(labelsize=9)
ax_roc.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
ax_roc.set_aspect('equal', adjustable='box')
save_plot(fig_roc, COMPARE_PATH, 'all_models_roc_curves', width_cm=15)

# --- Comparative Precision-Recall Curves ---
print("\nGenerating Comparative Precision-Recall Curves...")
fig_pr, ax_pr = plt.subplots(figsize=(8, 8))
no_skill = len(y_test[y_test==1]) / len(y_test)
ax_pr.plot([0, 1], [no_skill, no_skill], 'k--', lw=2, label=f'No-Skill (AP={no_skill:.3f})')

for name, model in models.items():
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    avg_precision = average_precision_score(y_test, y_pred_proba)

    pr_data_df = pd.DataFrame({'recall': recall, 'precision': precision})
    pr_data_df.to_excel(os.path.join(DATA_RESULTS_PATH, f'comparison_pr_curve_data_{name}.xlsx'), index=False)

    ax_pr.plot(recall, precision, lw=2, label=f'{name} (AP = {avg_precision:.3f})')

ax_pr.set_xlim([0.0, 1.0]); ax_pr.set_ylim([0.0, 1.05])
ax_pr.set_xlabel('Recall', fontsize=10)
ax_pr.set_ylabel('Precision', fontsize=10)
ax_pr.set_title('Comparative Precision-Recall Curves', fontsize=12, fontweight='bold')
ax_pr.legend(loc="lower left", fontsize=9)
ax_pr.tick_params(labelsize=9)
ax_pr.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
ax_pr.set_aspect('equal', adjustable='box')
save_plot(fig_pr, COMPARE_PATH, 'all_models_pr_curves', width_cm=15)

print("\nModel comparison step completed.")
print("\n" + "="*60)
print("SCRIPT EXECUTION FINISHED SUCCESSFULLY!")
print("="*60)

All directories are created or already exist.
Source data for EDA saved to: /home/sajad/Sajad_test/Pistachio Dataset/data_results

Dataset loaded successfully. Column names cleaned.
    area  perimeter  major_axis  minor_axis  eccentricity   eqdiasq  solidity  \
0  63391   1568.405    390.3396    236.7461        0.7951  284.0984    0.8665   
1  68358   1942.187    410.8594    234.7525        0.8207  295.0188    0.8765   
2  73589   1246.538    452.3630    220.5547        0.8731  306.0987    0.9172   
3  71106   1445.261    429.5291    216.0765        0.8643  300.8903    0.9589   
4  80087   1251.524    469.3783    220.9344        0.8823  319.3273    0.9657   

   convex_area  extent  aspect_ratio  ...  stddev_rr  stddev_rg  stddev_rb  \
0        73160  0.6394        1.6488  ...    17.7206    19.6024    21.1342   
1        77991  0.6772        1.7502  ...    26.7061    27.2112    25.1035   
2        80234  0.7127        2.0510  ...    19.0129    20.0703    20.7006   
3        74153  0.7

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluation metrics and model saved for XGBoost.
Plot saved: XGBoost_confusion_matrix.svg and XGBoost_confusion_matrix.pdf in /home/sajad/Sajad_test/Pistachio Dataset/XGBoost
Performing SHAP analysis for XGBoost...
Plot saved: XGBoost_shap_summary_plot.svg and XGBoost_shap_summary_plot.pdf in /home/sajad/Sajad_test/Pistachio Dataset/XGBoost

--- Processing Model: LightGBM ---
[LightGBM] [Info] Number of positive: 733, number of negative: 985
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6695
[LightGBM] [Info] Number of data points in the train set: 1718, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.426659 -> initscore=-0.295496
[LightGBM] [Info] Start training from score -0.295496




Evaluation metrics and model saved for LightGBM.
Plot saved: LightGBM_confusion_matrix.svg and LightGBM_confusion_matrix.pdf in /home/sajad/Sajad_test/Pistachio Dataset/LightGBM
Performing SHAP analysis for LightGBM...
Plot saved: LightGBM_shap_summary_plot.svg and LightGBM_shap_summary_plot.pdf in /home/sajad/Sajad_test/Pistachio Dataset/LightGBM

Machine Learning modeling step completed.

Step 4: Starting Model Comparison
Comparison metrics data saved.
                     Accuracy  Precision    Recall  F1-score  Cohen's Kappa  \
Model                                                                         
Logistic_Regression  0.909302   0.909091  0.874317  0.891365       0.813571   
Random_Forest        0.893023   0.882682  0.863388  0.872928       0.780578   
XGBoost              0.923256   0.916667  0.901639  0.909091       0.842700   
LightGBM             0.927907   0.922222  0.907104  0.914601       0.852234   

                     Matthews Corr Coef   ROC AUC  
Model         



Plot saved: all_models_roc_curves.svg and all_models_roc_curves.pdf in /home/sajad/Sajad_test/Pistachio Dataset/compairation

Generating Comparative Precision-Recall Curves...
Plot saved: all_models_pr_curves.svg and all_models_pr_curves.pdf in /home/sajad/Sajad_test/Pistachio Dataset/compairation

Model comparison step completed.

SCRIPT EXECUTION FINISHED SUCCESSFULLY!


Reproduce Figures 

In [None]:

# ==============================================================================
# کد 2: اسکریپت بازتولید نمودارها از نتایج ذخیره‌شده
# ==============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import joblib
import shap
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score

# --- Configuration ---
sns.set_theme(style="whitegrid")

# --- Define Paths (باید با اسکریپت اصلی یکسان باشد) ---
BASE_PATH = '/home/sajad/Sajad_test/Pistachio Dataset/'
LR_PATH = os.path.join(BASE_PATH, 'Logistic_Regression')
RF_PATH = os.path.join(BASE_PATH, 'Random_Forest')
XGB_PATH = os.path.join(BASE_PATH, 'XGBoost')
LGBM_PATH = os.path.join(BASE_PATH, 'LightGBM')
COMPARE_PATH = os.path.join(BASE_PATH, 'compairation')
DATA_RESULTS_PATH = os.path.join(BASE_PATH, 'data_results')

# --- Helper Function for Saving Plots ---
def save_plot(fig, path, filename, width_cm=13):
    """Saves a matplotlib figure in multiple formats with specified width."""
    width_in = width_cm / 2.54
    fig.set_size_inches(width_in, fig.get_figheight() * (width_in / fig.get_figwidth()))
    fig.savefig(os.path.join(path, f"{filename}.svg"), format='svg', bbox_inches='tight')
    fig.savefig(os.path.join(path, f"{filename}.pdf"), format='pdf', bbox_inches='tight')
    plt.close(fig)
    print(f"Re-plotted and saved: {filename}.svg and {filename}.pdf in {path}")

# ==============================================================================
# Step 1: Load Pre-computed Data and Saved Models
# ==============================================================================
print("\n" + "="*60)
print("Step 1: Loading all necessary pre-computed data and models...")
print("="*60)

try:
    # Load essential components
    le = joblib.load(os.path.join(DATA_RESULTS_PATH, 'ml_label_encoder.joblib'))
    scaler = joblib.load(os.path.join(DATA_RESULTS_PATH, 'ml_standard_scaler.joblib'))
    test_data = pd.read_excel(os.path.join(DATA_RESULTS_PATH, 'ml_test_data_unscaled.xlsx'))
    all_metrics_df = pd.read_excel(os.path.join(DATA_RESULTS_PATH, 'comparison_all_models_metrics.xlsx'), index_col=0)
    
    # Separate test data
    features_to_plot = [col for col in test_data.columns if col not in ['y_test_encoded', 'y_test_class_name']]
    X_test = test_data[features_to_plot]
    y_test = test_data['y_test_encoded'].values
    
    # Scale test data for models
    X_test_scaled = scaler.transform(X_test)
    
    print("All necessary files loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print("Please run the main script first to generate the results.")
    exit()

# Define models and paths (SVM removed)
models_to_plot = {
    'Logistic_Regression': LR_PATH,
    'Random_Forest': RF_PATH,
    'XGBoost': XGB_PATH,
    'LightGBM': LGBM_PATH
}

# ==============================================================================
# Step 2: Re-generate Plots for Each Model
# ==============================================================================
print("\n" + "="*60)
print("Step 2: Re-generating plots for each model...")
print("="*60)

for name, output_path in models_to_plot.items():
    print(f"\n--- Re-plotting for Model: {name} ---")
    
    # Load the saved model
    model = joblib.load(os.path.join(output_path, f'{name}_model.joblib'))
    
    # Re-generate Confusion Matrix Plot
    cm_df = pd.read_excel(os.path.join(DATA_RESULTS_PATH, f'ml_confusion_matrix_data_{name}.xlsx'), index_col=0)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', ax=ax, annot_kws={"size": 10})
    ax.set_title(f'Confusion Matrix - {name}', fontsize=12, fontweight='bold', pad=20)
    ax.set_xlabel('Predicted Label', fontsize=10)
    ax.set_ylabel('True Label', fontsize=10)
    ax.tick_params(axis='x', labelsize=8)
    ax.tick_params(axis='y', labelsize=8, rotation=0)
    save_plot(fig, output_path, f'{name}_confusion_matrix', width_cm=15)
    
    # Re-calculate and Re-generate SHAP Plots
    print(f"Re-calculating SHAP values for {name}...")
    if name in ['Random_Forest', 'XGBoost', 'LightGBM']:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_scaled)
    else: # Logistic Regression
        # Re-load training data just for SHAP background
        X_train_scaled_for_shap = scaler.transform(pd.read_excel(DATA_PATH).drop('CLASS', axis=1, errors='ignore'))
        background_data = shap.sample(X_train_scaled_for_shap, 100)
        explainer = shap.LinearExplainer(model, background_data)
        shap_values = explainer.shap_values(X_test_scaled)
        
    shap_values_positive_class = shap_values[1] if isinstance(shap_values, list) else shap_values

    X_test_df_for_shap = pd.DataFrame(X_test_scaled, columns=features_to_plot)

    shap.summary_plot(
        shap_values_positive_class, 
        X_test_df_for_shap, 
        show=False,
        plot_size=(15, 12)
    )
    fig = plt.gcf()
    fig.suptitle(f'SHAP Feature Importance for Class "{le.classes_[1]}" - {name}', fontsize=12, fontweight='bold', y=1.0)
    fig.tight_layout(pad=2.0)
    save_plot(fig, output_path, f'{name}_shap_summary_plot', width_cm=25)

# ==============================================================================
# Step 3: Re-generate Comparison Plots
# ==============================================================================
print("\n" + "="*60)
print("Step 3: Re-generating comparison plots...")
print("="*60)

# Bar chart for key metrics
metrics_to_compare = ['Accuracy', 'F1-score', 'ROC AUC']
fig, ax = plt.subplots(figsize=(10, 6))
all_metrics_df[metrics_to_compare].plot(kind='bar', ax=ax, rot=0, width=0.7)
ax.set_title('Comparison of Key Metrics Across Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=10)
ax.set_xlabel('Model', fontsize=10)
ax.set_ylim(0.8, 1.0)
ax.tick_params(axis='x', labelsize=9)
ax.tick_params(axis='y', labelsize=9)
ax.legend(title='Metric', fontsize=9, title_fontsize='9')
ax.grid(axis='y', linestyle='--', alpha=0.7)
save_plot(fig, COMPARE_PATH, 'models_metrics_comparison_bar_chart')

# --- Comparative ROC Curves ---
fig_roc, ax_roc = plt.subplots(figsize=(8, 8))
for name in models_to_plot.keys():
    roc_data_df = pd.read_excel(os.path.join(DATA_RESULTS_PATH, f'comparison_roc_curve_data_{name}.xlsx'))
    roc_auc = auc(roc_data_df['fpr'], roc_data_df['tpr'])
    ax_roc.plot(roc_data_df['fpr'], roc_data_df['tpr'], lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

ax_roc.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance')
ax_roc.set_xlim([0.0, 1.0]); ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate', fontsize=10)
ax_roc.set_ylabel('True Positive Rate', fontsize=10)
ax_roc.set_title('Comparative ROC Curves', fontsize=12, fontweight='bold')
ax_roc.legend(loc="lower right", fontsize=9)
ax_roc.tick_params(labelsize=9)
ax_roc.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
ax_roc.set_aspect('equal', adjustable='box')
save_plot(fig_roc, COMPARE_PATH, 'all_models_roc_curves', width_cm=15)

# --- Comparative Precision-Recall Curves ---
fig_pr, ax_pr = plt.subplots(figsize=(8, 8))
no_skill = len(y_test[y_test==1]) / len(y_test)
ax_pr.plot([0, 1], [no_skill, no_skill], 'k--', lw=2, label=f'No-Skill (AP={no_skill:.3f})')

for name in models_to_plot.keys():
    pr_data_df = pd.read_excel(os.path.join(DATA_RESULTS_PATH, f'comparison_pr_curve_data_{name}.xlsx'))
    avg_precision = auc(pr_data_df['recall'], pr_data_df['precision'])
    ax_pr.plot(pr_data_df['recall'], pr_data_df['precision'], lw=2, label=f'{name} (AP = {avg_precision:.3f})')

ax_pr.set_xlim([0.0, 1.0]); ax_pr.set_ylim([0.0, 1.05])
ax_pr.set_xlabel('Recall', fontsize=10)
ax_pr.set_ylabel('Precision', fontsize=10)
ax_pr.set_title('Comparative Precision-Recall Curves', fontsize=12, fontweight='bold')
ax_pr.legend(loc="lower left", fontsize=9)
ax_pr.tick_params(labelsize=9)
ax_pr.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
ax_pr.set_aspect('equal', adjustable='box')
save_plot(fig_pr, COMPARE_PATH, 'all_models_pr_curves', width_cm=15)

print("\nRe-plotting process completed successfully!")
print("\n" + "="*60)
print("RE-PLOTTING SCRIPT FINISHED!")
print("="*60)