In [None]:
%pip install sdv ucimlrepo qde

In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
statlog_german_credit_data = fetch_ucirepo(id=144)

# data (as pandas dataframes)
X = statlog_german_credit_data.data.features
y = statlog_german_credit_data.data.targets

In [3]:
data_frame = X.copy()
data_frame["target"] = y

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Find categorical (non-numeric) columns
categorical_cols = data_frame.select_dtypes(include=['object', 'category']).columns

# Label encode each categorical column
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data_frame[col] = le.fit_transform(data_frame[col].astype(str))  # Ensure all values are strings
    label_encoders[col] = le  # Store encoder if you need to inverse_transform later

In [5]:
print(data_frame.head())

   Attribute1  Attribute2  Attribute3  Attribute4  Attribute5  Attribute6  \
0           0           6           4           4        1169           4   
1           1          48           2           4        5951           0   
2           3          12           4           7        2096           0   
3           0          42           2           3        7882           0   
4           0          24           3           0        4870           0   

   Attribute7  Attribute8  Attribute9  Attribute10  ...  Attribute12  \
0           4           4           2            0  ...            0   
1           2           2           1            0  ...            0   
2           3           2           2            0  ...            0   
3           3           2           2            2  ...            1   
4           2           3           2            0  ...            3   

   Attribute13  Attribute14  Attribute15  Attribute16  Attribute17  \
0           67            2       

In [6]:
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer
from sdv.sampling import Condition

metadata = Metadata.detect_from_dataframe(data=data_frame, table_name='gcc')
metadata.update_column(column_name='target', sdtype='categorical')
metadata.validate()
metadata.save_to_json(f'metadata.json')

In [7]:
ctgan = CTGANSynthesizer(metadata)
ctgan.fit(data_frame)
synth_data = ctgan.sample(num_rows=7000)

In [8]:
print(synth_data.head())

   Attribute1  Attribute2  Attribute3  Attribute4  Attribute5  Attribute6  \
0           0           9           2           0        1547           1   
1           1           4           2           7        2510           0   
2           3          14           0           3        2000           0   
3           2          20           4           4        8814           0   
4           0          13           2           4        4131           0   

   Attribute7  Attribute8  Attribute9  Attribute10  ...  Attribute12  \
0           3           4           2            0  ...            2   
1           4           4           2            0  ...            2   
2           4           2           2            0  ...            0   
3           4           4           2            1  ...            3   
4           4           2           2            0  ...            1   

   Attribute13  Attribute14  Attribute15  Attribute16  Attribute17  \
0           43            2       

In [9]:
train_X = data_frame.drop(columns=["target"])
train_y = data_frame["target"]

synth_X = synth_data.drop(columns=["target"])
synth_y = synth_data["target"]

test_X = data_frame.drop(columns=["target"])
test_y = data_frame["target"]

In [10]:
from qde.qde import QDE
from sklearn.naive_bayes import GaussianNB

estimator = GaussianNB()

qde_obj = QDE(default_strategy="oes")

qde_obj.fit(
    train_X=train_X, train_y=train_y,
    syn_X=synth_X, syn_y=synth_y,
    test_X=test_X, test_y=test_y,
    strategy="oes",
    estimator=estimator,
    encode_labels=True,
)

result, X_sel, y_sel = qde_obj.extract(
    estimator=estimator,
    compute_filtered_accuracy=True,
    k_neighbors=7,
    distance_mode="cosine",
)

In [11]:
# Report
print(f"Selected indices (first 20): {result.indices[:20].tolist()}")
print(f"Accepted count: {len(result.indices)} / synth_size={len(synth_X)}")
if hasattr(result, "meta") and result.meta:
    for k, v in result.meta.items():
        print(f"  {k}: {v}")

Selected indices (first 20): [12, 18, 22, 23, 25, 30, 36, 38, 40, 50, 51, 55, 61, 62, 66, 68, 79, 87, 92, 102]
Accepted count: 1333 / synth_size=7000
  strategy: oes
  selected-samples: 1333
  original-accuracy: 0.742
  augmented-accuracy: 0.686
  filtered-accuracy: 0.751
