In [None]:
from sklearn.model_selection import train_test_split
import onnxruntime as rt
import onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import to_onnx
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from sklearn.preprocessing import StandardScaler
# XGBoost: gradient-boosted tree library that provides the classifier used below
import xgboost as xgb
import warnings
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from aif360.sklearn.datasets import fetch_compas
from aif360.sklearn.metrics import disparate_impact_ratio, consistency_score, generalized_entropy_error
from aif360.sklearn.detectors import bias_scan
from aif360.sklearn.inprocessing import AdversarialDebiasing
from aif360.datasets import BinaryLabelDataset, StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric
from xgboost import XGBClassifier
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Session-wide pandas display options (these persist until changed again).
for option_name, option_value in (
    ('display.max_rows', 100),                   # show up to 100 rows
    ('display.max_columns', None),               # no limit on displayed columns
    ('display.width', None),                     # None: let pandas auto-detect the width
    ('display.max_colwidth', None),              # never truncate cell contents
    ('display.float_format', '{:.4f}'.format),   # render floats with 4 decimals
):
    pd.set_option(option_name, option_value)


In [None]:
# Location of the training CSV. The default is the original author's local
# path; set the INVESTIGATION_DATA_PATH environment variable to point the
# notebook at the file on another machine without editing this cell.
import os

DATA_PATH = os.environ.get('INVESTIGATION_DATA_PATH') or (
    'C:\\Users\\91948\\Desktop\\SE\\Software-Testing-Project\\data\\investigation_train_large_checked_adjusted2.csv'
)
data = pd.read_csv(DATA_PATH)

In [None]:
class ZeroFeaturesByKeywords(BaseEstimator, TransformerMixin):
    """Transformer that overwrites selected DataFrame columns with zeros.

    A column is selected when its name starts with one of
    ``non_fair_keywords`` and it is not listed in ``to_remove``. The selection
    is computed in ``fit`` and applied in ``transform``; the frame passed to
    ``transform`` is copied, never modified in place.

    Parameters
    ----------
    non_fair_keywords : list of str, optional
        Column-name prefixes whose columns are zeroed. ``None`` selects the
        built-in default list; an explicitly passed empty list means
        "no prefixes" and is honoured as such.
    to_remove : list of str, optional
        Column names that are exempt from zeroing even when they match a
        prefix. ``None`` selects the built-in default list; an explicitly
        passed empty list means "no exemptions".

    Attributes
    ----------
    feature_names_in_ : list
        Column names of the frame seen by ``fit``.
    zeroed_columns_ : list
        Names of the columns that ``transform`` sets to zero.
    """

    def __init__(self, non_fair_keywords=None, to_remove=None):
        # Compare against None rather than relying on truthiness: an empty
        # list is a legitimate value and must not be replaced by the defaults
        # (and array-like arguments have no unambiguous truth value).
        self.non_fair_keywords = [
            "relatie_kind", "relatie_partner", "ontheffing", "belemmering",
            "competentie", "persoon", "persoonlijke", "adres",
            "typering_dagen_som", "Ja", "Nee"
        ] if non_fair_keywords is None else non_fair_keywords
        self.to_remove = [
            "persoon_leeftijd_bij_onderzoek", "persoon_geslacht_vrouw",
            "belemmering_financiele_problemen", "persoonlijke_eigenschappen_taaleis_voldaan",
            "relatie_kind_heeft_kinderen"
        ] if to_remove is None else to_remove
        # Starts empty, so calling transform() before fit() returns an
        # unchanged copy of its input.
        self.zeroed_columns_ = []

    def fit(self, X, y=None):
        """Record the input columns and decide which of them will be zeroed.

        Parameters
        ----------
        X : DataFrame-like
            Must expose ``.columns``; the column names must be strings.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.feature_names_in_ = list(X.columns)

        # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
        prefixes = tuple(self.non_fair_keywords)
        self.zeroed_columns_ = [
            col for col in X.columns
            if col.startswith(prefixes) and col not in self.to_remove
        ]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column in ``zeroed_columns_`` set to zero.

        Numeric (NumPy bool/int/uint/float) columns keep their dtype, so a
        float32 frame stays float32 after zeroing. Any other column falls back
        to a plain ``0`` assignment.
        """
        X_copy = X.copy()
        for col in self.zeroed_columns_:
            # .get() yields None for a missing column, and a duplicated column
            # name yields a DataFrame without .dtype; both cases fall through
            # to the plain scalar assignment below.
            col_dtype = getattr(X_copy.get(col), "dtype", None)
            if isinstance(col_dtype, np.dtype) and col_dtype.kind in "biuf":
                # Assigning the Python int 0 would turn the column into int64;
                # write zeros of the column's own dtype instead.
                X_copy[col] = np.zeros(len(X_copy), dtype=col_dtype)
            else:
                X_copy[col] = 0
        return X_copy


In [None]:
# Target / feature separation: 'checked' is taken to be the label column,
# every remaining column is a predictor and is cast to float32.
y = data['checked']
X = data.drop(columns=['checked']).astype(np.float32)

# Shuffled, stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Final estimator of the pipeline.
classifier = XGBClassifier(max_depth=25, n_estimators=200)

# Pipeline stages, in order: zero the keyword-matched columns,
# standardise the features, then classify.
pipeline = Pipeline(steps=[
    ('zero_features', ZeroFeaturesByKeywords()),
    ('scaling', StandardScaler()),
    ('classification', classifier),
])

# Fit every stage on the training partition.
pipeline.fit(X_train, y_train)

In [None]:
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
)
from skl2onnx import update_registered_converter
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
# Register the onnxmltools XGBoost converter with skl2onnx so that an
# XGBClassifier inside a scikit-learn pipeline can be converted.
# The dict lists the values accepted for each converter option.
xgb_converter_options = {
    "nocl": [True, False],
    "zipmap": [True, False, "columns"],
}
update_registered_converter(
    XGBClassifier,                              # model class being registered
    "XGBoostXGBClassifier",                     # operator alias
    calculate_linear_classifier_output_shapes,  # shape calculator
    convert_xgboost,                            # converter function
    options=xgb_converter_options,
)

In [None]:
# Export the fitted pipeline to ONNX.
# NOTE(review): the pipeline's first step is the custom ZeroFeaturesByKeywords
# transformer; no skl2onnx converter/shape calculator for it is registered in
# the code visible here — confirm one is registered before this cell runs.
input_signature = [('X', FloatTensorType((None, X.shape[1])))]  # one float input: any batch size, one column per feature
opset_versions = {'': 12, 'ai.onnx.ml': 3}                      # default-domain and ai.onnx.ml opsets
onnx_model = convert_sklearn(
    pipeline,
    initial_types=input_signature,
    target_opset=opset_versions,
)

# Write the converted model to the current working directory.
onnx.save(onnx_model, "modelXGBnormal.onnx")