In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Read the raw dataset into a DataFrame.
# NOTE: the path is resolved relative to the current working directory;
# replace it with the actual location of the CSV file.
file_path = "data_public.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Remove Outliers Using IQR
def remove_outliers_iqr(df, iqr_multiplier=1.5, columns=None):
    """Drop rows whose values lie outside the IQR fences of the screened columns.

    For each screened column the fences are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR`` (both inclusive). Columns are processed one
    after another, and each column's quartiles are computed on the rows that
    survived the previous columns, so the result can depend on column order.

    Rows holding NaN in a screened column are dropped as well, because NaN
    fails the bounds check.

    Args:
        df: DataFrame to filter. It is not mutated.
        iqr_multiplier: Width of the fences in units of the IQR.
        columns: Optional column label or iterable of labels to screen.
            Defaults to every numeric column of ``df``. Pass an explicit
            list to keep columns such as a numeric target or ID out of the
            screening.

    Returns:
        A DataFrame containing only the rows that passed every screened
        column, with the original index labels preserved.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]

    for col in columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = q3 - q1
        lower_bound = q1 - iqr_multiplier * spread
        upper_bound = q3 + iqr_multiplier * spread
        # ``between`` is inclusive on both ends and evaluates to False for NaN.
        df = df[df[col].between(lower_bound, upper_bound)]
    return df

# Apply outlier removal to the feature columns only.
# The target must not be screened: a numeric 'Class' label with a skewed (or
# zero-IQR) distribution could otherwise have whole classes removed as
# "outliers". Rows are selected through the surviving index labels, so the
# cleaned frame keeps the original column order, including 'Class'.
feature_columns = data.columns.drop('Class')
inlier_index = remove_outliers_iqr(data[feature_columns]).index
# Rows without a label cannot be used for supervised training; drop them explicitly.
data_cleaned = data.loc[inlier_index].dropna(subset=['Class'])

# Calculate class distribution after removing outliers
class_distribution = data_cleaned['Class'].value_counts()

# Print the class distribution
print("Class Distribution After Removing Outliers:")
print(class_distribution)


# Split features and target
X = data_cleaned.drop(columns=['Class'])
y = data_cleaned['Class']

# Balance the dataset using random oversampling (minority-class rows are
# duplicated until every class is as frequent as the majority class).
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(X, y)

# Classifiers to evaluate, keyed by the display name used in the printed
# reports and in the exported file names.
classifiers = {
    'Logistic Regression': LogisticRegression(penalty='l2', solver='liblinear', random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
}

# Perform stratified K-fold cross-validation for each classifier.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Oversampler used inside the cross-validation loop. Folds are cut from the
# ORIGINAL (not yet oversampled) data and only the training part of each fold
# is oversampled. Splitting data that was oversampled beforehand would place
# copies of the same row in both the training and the test fold and inflate
# every reported score.
fold_sampler = RandomOverSampler(random_state=42)

for clf_name, clf in classifiers.items():
    print(f"\nEvaluating {clf_name}...\n")
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
        ('scaler', RobustScaler()),                   # Scale using RobustScaler
        # Select the top 10 features, capped at the number of available
        # features so narrower datasets do not break the selector.
        ('selector', SelectKBest(score_func=f_classif, k=min(10, X.shape[1]))),
        ('classifier', clf),  # Classifier under evaluation
    ])

    all_reports = []
    all_roc_aucs = []  # One entry per fold for which probabilities are available

    for train_idx, test_idx in kfold.split(X, y):
        # Balance the training fold only; the test fold keeps the natural
        # class distribution and shares no duplicated rows with training.
        X_train, y_train = fold_sampler.fit_resample(X.iloc[train_idx], y.iloc[train_idx])
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # Train pipeline
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # Calculate ROC-AUC score when the classifier exposes probabilities
        if hasattr(clf, "predict_proba"):
            y_pred_proba = pipeline.predict_proba(X_test)
            if y_pred_proba.shape[1] == 2:  # Binary classification
                # Use the probabilities of the positive (second) class
                fold_auc = roc_auc_score(y_test, y_pred_proba[:, 1])
            else:  # Multiclass classification
                fold_auc = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")
            all_roc_aucs.append(fold_auc)

        # Generate classification report
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        all_reports.append(report)

    # Compute average metrics across all folds
    n_folds = len(all_reports)
    avg_report = {
        "accuracy": sum(r["accuracy"] for r in all_reports) / n_folds,
        "weighted avg": {
            "precision": sum(r["weighted avg"]["precision"] for r in all_reports) / n_folds,
            "recall": sum(r["weighted avg"]["recall"] for r in all_reports) / n_folds,
            "f1-score": sum(r["weighted avg"]["f1-score"] for r in all_reports) / n_folds,
            # Support is accumulated (total evaluated samples), not averaged.
            "support": sum(r["weighted avg"]["support"] for r in all_reports),
        },
    }
    # None when no fold produced a ROC-AUC (classifier without predict_proba);
    # avoids dividing by zero.
    avg_roc_auc = sum(all_roc_aucs) / len(all_roc_aucs) if all_roc_aucs else None

    # Print average metrics
    print(f"Average Classification Metrics for {clf_name}:")
    print(f"Accuracy: {avg_report['accuracy']:.4f}")
    print(f"Weighted Precision: {avg_report['weighted avg']['precision']:.4f}")
    print(f"Weighted Recall: {avg_report['weighted avg']['recall']:.4f}")
    print(f"Weighted F1-Score: {avg_report['weighted avg']['f1-score']:.4f}")
    if avg_roc_auc is not None:
        print(f"Average ROC-AUC Score: {avg_roc_auc:.3f}")

    # Fit the pipeline on the entire oversampled dataset and save as ONNX
    pipeline.fit(X_balanced, y_balanced)
    # Single float tensor input: any batch size, one column per input feature
    initial_type = [('float_input', FloatTensorType([None, X_balanced.shape[1]]))]
    onnx_model = convert_sklearn(pipeline, initial_types=initial_type)
    onnx_file = f"{clf_name.lower().replace(' ', '_')}_pipeline_balanced.onnx"
    with open(onnx_file, "wb") as f:
        f.write(onnx_model.SerializeToString())
    print(f"Model exported to {onnx_file}")


Class Distribution After Removing Outliers:
Class
3    463744
2    449694
1     78435
Name: count, dtype: int64

Evaluating Logistic Regression...

Average Classification Metrics for Logistic Regression:
Accuracy: 0.6667
Weighted Precision: 0.4507
Weighted Recall: 0.6667
Weighted F1-Score: 0.5360
Average ROC-AUC Score: 0.780
Model exported to logistic_regression_pipeline_balanced.onnx

Evaluating Decision Tree...

Average Classification Metrics for Decision Tree:
Accuracy: 0.7809
Weighted Precision: 0.7729
Weighted Recall: 0.7809
Weighted F1-Score: 0.7740
Average ROC-AUC Score: 0.836
Model exported to decision_tree_pipeline_balanced.onnx

Evaluating Gradient Boosting...

Average Classification Metrics for Gradient Boosting:
Accuracy: 0.6672
Weighted Precision: 0.7222
Weighted Recall: 0.6672
Weighted F1-Score: 0.5375
Average ROC-AUC Score: 0.786
