In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report


data = pd.read_csv('data\\br2000_mrf.csv')


def laplace_mech(v, sensitivity, epsilon):
    return v + np.random.laplace(loc=0, scale=sensitivity/epsilon)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,2,1,3,0,0,1,0,0,6,1,5,0
1,0,1,1,0,7,0,0,1,0,0,6,1,7,1
2,1,1,2,1,6,0,0,1,0,3,4,1,8,1
3,0,0,1,0,7,1,0,1,0,0,4,1,6,0
4,0,0,6,0,1,1,0,0,0,1,8,1,2,0


In [2]:
#generate Phase 1

def generate(df, numTuples=None, numClass=None, epsilon=1.0):
    print(df.shape)
    #optionale parameter abklären
    if numTuples is None:
        numTuples = df.shape[0]
    if numClass is None:
        numClass = df.shape[1] - 1

    className = df.columns[numClass]
    attributesName = [col for col in df.columns if col != className]

    numHistograms = df.shape[1] - 1
    epsilon_per_hist = epsilon/numHistograms

    histograms = {} # Dict für jedes Marginal nach Attribut sortiert
    for attributeName in attributesName:
        counts = df[[attributeName, className]].value_counts() # Series mit MultiIndex, Ebene 0 = Attribut, Ebene 1 = Klasse
        noisy_counts = laplace_mech(counts, 1, epsilon_per_hist)
        noisy_counts.clip(lower=0.0, inplace=True) # Entfernen von negativen Werten
        #noisy_counts = noisy_counts.round()
        histograms[attributeName] = noisy_counts

    print(histograms["1"])

generate(data)


(38000, 14)
1  13
0  0     20354.057133
1  0      6228.057133
   1      5161.057133
0  1      3378.057133
2  1      1580.057133
   0       795.057133
3  1       265.057133
   0       108.057133
4  1        71.057133
   0        30.057133
5  1        15.057133
6  1         6.057133
5  0         5.057133
6  0         4.057133
Name: count, dtype: float64


In [3]:
import numpy as np
import pandas as pd

# ---------- Hilfsfunktionen ----------

def _build_noisy_histograms(df, className, attributesName, epsilon):
    """
    Phase 1: Baue für jedes Attribut ein DP-noisy Histogramm P(attr, class).
    """
    numHistograms = len(attributesName)
    epsilon_per_hist = epsilon / numHistograms
    histograms = {}

    for attributeName in attributesName:
        counts = df[[attributeName, className]].value_counts()
        noisy_counts = laplace_mech(counts, sensitivity=1, epsilon=epsilon_per_hist)
        noisy_counts.clip(lower=0.0, inplace=True)
        histograms[attributeName] = noisy_counts

    return histograms


def _compute_class_distribution(histograms, numTuples):
    """
    Aus allen Histogrammen: classTotals, Klassenwahrscheinlichkeiten p(c),
    Anzahl synthetischer Tupel pro Klasse und Klassenvektoren.
    """
    classTotals = {}

    # classTotals[c] = Summe über alle Attribute und Attribute-Werte der Counts
    for attr_name, hist in histograms.items():
        for (attr_val, class_val), count in hist.items():
            classTotals[class_val] = classTotals.get(class_val, 0.0) + count

    total = sum(classTotals.values())
    if total == 0:
        # Fallback: gleichverteilt, falls durch Noise alles 0 wurde
        gleich = 1.0 / len(classTotals)
        p = {c: gleich for c in classTotals}
    else:
        p = {c: classTotals[c] / total for c in classTotals}

    # Anzahl Tupel pro Klasse
    classTuples = {c: round(numTuples * p[c]) for c in classTotals}

    # Klassenvektor für jede Klasse
    class_vector = {c: [c] * classTuples[c] for c in classTuples}

    return classTotals, p, classTuples, class_vector


def _compute_conditional_attributes(histograms, classTotals):
    """
    Berechne P(attr = a | class = c) für jedes Attribut und jede Klasse.
    """
    cond_attr = {}

    for attr_name, hist in histograms.items():
        cond_attr[attr_name] = {}
        for c in classTotals:
            attr_counts = {}

            # Zähle für fixe Klasse c die Häufigkeiten der Attribut-Werte
            for (attr_val, class_val), count in hist.items():
                if class_val == c:
                    attr_counts[attr_val] = attr_counts.get(attr_val, 0.0) + count

            sum_c = sum(attr_counts.values())

            if sum_c == 0:
                # gleichverteilte Notlösung
                if attr_counts:  # Klasseninfo vorhanden, aber alles 0
                    gleich = 1.0 / len(attr_counts)
                    probs = {a: gleich for a in attr_counts}
                else:
                    # gar keine Werte – leeres Dict, kann später geskippt werden
                    probs = {}
            else:
                probs = {a: attr_counts[a] / sum_c for a in attr_counts}

            cond_attr[attr_name][c] = probs

    return cond_attr


def _sample_attribute_vectors(cond_attr, classTuples, random_state=None):
    """
    Ziehe für jedes Attribut und jede Klasse einen Vektor von Attributwerten
    mit Länge n_c = classTuples[c] entsprechend P(attr | class).
    """
    if random_state is not None:
        np.random.seed(random_state)

    attr_vectors = {}

    for attr_name, class_dict in cond_attr.items():
        attr_vectors[attr_name] = {}
        for c, probs in class_dict.items():
            n_c = classTuples[c]

            # Falls keine Wahrscheinlichkeiten existieren (leeres Dict)
            if not probs:
                # Notlösung: Vektor komplett leer, wird später ggf. ersetzt/geskippt
                attr_vectors[attr_name][c] = [None] * n_c
                continue

            # Zielanzahl pro Attributwert
            target_count = {attr_val: round(p_val * n_c)
                            for attr_val, p_val in probs.items()}

            # Baue den Vektor mit der Zielanzahl pro Wert
            attr_vec_c = []
            for attr_val, cnt in target_count.items():
                attr_vec_c.extend([attr_val] * cnt)

            # Länge anpassen (auf n_c)
            current_len = len(attr_vec_c)
            diff = n_c - current_len

            if diff > 0:
                vals = list(probs.keys())
                extra = np.random.choice(vals, size=diff, p=list(probs.values()))
                attr_vec_c.extend(extra)
            elif diff < 0:
                remove_indices = np.random.choice(len(attr_vec_c), size=-diff, replace=False)
                for idx in sorted(remove_indices, reverse=True):
                    attr_vec_c.pop(idx)

            # Shuffle, um keine Struktur / Reihenfolge zu verraten
            np.random.shuffle(attr_vec_c)

            attr_vectors[attr_name][c] = attr_vec_c

    return attr_vectors


def _assemble_synthetic_dataframe(attr_vectors, class_vector, attributesName, className):
    """
    Setze aus Attributvektoren und Klassenvektoren die DataFrames pro Klasse
    und führe sie zu einem Gesamt-DataFrame zusammen.
    """
    blocks = {}
    for c, class_vec in class_vector.items():
        n_c = len(class_vec)
        block = {}

        for attributeName in attributesName:
            values = attr_vectors[attributeName][c]
            # Sanity-Check: auf Länge n_c trimmen/auffüllen falls nötig
            if len(values) < n_c:
                values = values + [values[-1]] * (n_c - len(values))
            elif len(values) > n_c:
                values = values[:n_c]
            block[attributeName] = values

        block[className] = class_vec
        blocks[c] = block

    df_blocks = {c: pd.DataFrame(block) for c, block in blocks.items()}
    synthetic_df = pd.concat(df_blocks.values(), ignore_index=True)

    return synthetic_df


# ---------- Hauptfunktion ----------

def generate(df, numTuples=None, numClass=None, epsilon=1.0, random_state=42):
    """
    Erzeuge differentially private synthetische Daten auf Basis von df.
    Phase 1: DP-Histogramme.
    Phase 2: Ziehen von Klassen- und Attributwerten und Zusammenbau des synthetischen Datensatzes.
    """
    print(df.shape)

    # optionale Parameter abklären
    if numTuples is None:
        numTuples = df.shape[0]
    if numClass is None:
        numClass = df.shape[1] - 1

    className = df.columns[numClass]
    attributesName = [col for col in df.columns if col != className]

    # ---- Phase 1: DP-Histogramme ----
    histograms = _build_noisy_histograms(df, className, attributesName, epsilon)

    # ---- Phase 2: Klassenverteilung, P(attr|class), Sampling, Zusammenbau ----
    classTotals, p, classTuples, class_vector = _compute_class_distribution(
        histograms, numTuples
    )

    cond_attr = _compute_conditional_attributes(histograms, classTotals)

    attr_vectors = _sample_attribute_vectors(
        cond_attr, classTuples, random_state=random_state
    )

    synthetic_df = _assemble_synthetic_dataframe(
        attr_vectors, class_vector, attributesName, className
    )

    # final shuffle des gesamten DataFrames (Zeilen)
    synthetic_df = synthetic_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return synthetic_df

# Beispiel-Aufruf:
# synthetic_data = generate(data, epsilon=1.0)


In [4]:
#GPT Zelle
def build_pipeline(clf=None):
    """
    Baut eine sklearn-Pipeline:
    - OneHotEncoding für alle Feature-Spalten (alle außer der letzten)
    - Classifier (default: RandomForest, kann aber übergeben werden)
    """
    if clf is None:
        clf = RandomForestClassifier(
            n_estimators=200,
            random_state=42,
            n_jobs=-1
        )

    # Alle Spalten außer der letzten sind Features
    # (funktioniert auch mit '0', '1', ..., '13' als Spaltennamen)
    def make_preprocessor(df):
        feature_cols = df.columns[:-1]
        preprocessor = ColumnTransformer(
            transformers=[
                ("cat", OneHotEncoder(handle_unknown="ignore"), feature_cols)
            ]
        )
        return preprocessor

    # kleine Wrapperfunktion, damit wir df nicht global brauchen
    def make_model(df):
        preprocessor = make_preprocessor(df)
        model = Pipeline(steps=[
            ("prep", preprocessor),
            ("clf", clf)
        ])
        return model

    return make_model


def evaluate_df(df, clf=None, n_splits=10, test_size=0.2, base_random_state=0):
    """
    - df: DataFrame mit letzter Spalte = Klasse
    - clf: optionaler Classifier (sonst RandomForest)
    - n_splits: wie viele verschiedene Train/Test-Splits (mit unterschiedlichen seeds)
    - test_size: Anteil Testdaten
    """
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    make_model = build_pipeline(clf)

    accuracies = []
    confusion_matrices = []

    classes = np.unique(y)

    for i in range(n_splits):
        rs = base_random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=rs,
            stratify=y  # damit Klassenverteilung im Split erhalten bleibt
        )

        model = make_model(df)   # baut Pipeline mit passenden Spalten
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        acc = (y_pred == y_test).mean()
        accuracies.append(acc)

        cm = confusion_matrix(y_test, y_pred, labels=classes)
        confusion_matrices.append(cm)

    return {
        "classes": classes,
        "accuracies": np.array(accuracies),
        "confusion_matrices": confusion_matrices,
    }


In [5]:
# Synthetische Daten
synthetic_data = generate(data)
synth_results = evaluate_df(synthetic_data, n_splits=10)

# Originaldaten (data) aus CSV
orig_results = evaluate_df(data, n_splits=10)

print("Originaldaten:")
print("  Accuracy mean:", orig_results["accuracies"].mean())
print("  Accuracy std :", orig_results["accuracies"].std())
print("  Confusion Matrix (Split 0):")
print(orig_results["confusion_matrices"][0])
print("  Klassenreihenfolge:", orig_results["classes"])

print("\nSynthetische Daten:")
print("  Accuracy mean:", synth_results["accuracies"].mean())
print("  Accuracy std :", synth_results["accuracies"].std())
print("  Confusion Matrix (Split 0):")
print(synth_results["confusion_matrices"][0])
print("  Klassenreihenfolge:", synth_results["classes"])

synthetic_data.head()

(38000, 14)
Originaldaten:
  Accuracy mean: 0.8010131578947368
  Accuracy std : 0.003641695481967089
  Confusion Matrix (Split 0):
[[4964  541]
 [ 938 1157]]
  Klassenreihenfolge: [0 1]

Synthetische Daten:
  Accuracy mean: 0.8196578947368423
  Accuracy std : 0.003070162907242606
  Confusion Matrix (Split 0):
[[5056  450]
 [ 893 1201]]
  Klassenreihenfolge: [0 1]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0,0,3,2,4,0,0,0,0,1,7,1,6,1
1,0,0,4,2,2,0,0,0,0,0,2,1,5,0
2,0,0,6,2,9,0,0,1,0,2,5,1,1,0
3,1,1,2,0,3,1,0,1,0,1,7,1,5,0
4,0,0,2,6,1,1,0,1,0,0,5,1,3,0


In [6]:
'''
#version mit alternativem clf
from sklearn.tree import DecisionTreeClassifier

# Synthetische Daten
clf = DecisionTreeClassifier(random_state=42)
synthetic_data = generate(data)   # deine Funktion
synth_results = evaluate_df(synthetic_data, clf=clf, n_splits=10)

# Originaldaten (data) aus CSV
orig_results = evaluate_df(data, clf=clf, n_splits=10)

print("Originaldaten:")
print("  Accuracy mean:", orig_results["accuracies"].mean())
print("  Accuracy std :", orig_results["accuracies"].std())
print("  Confusion Matrix (Split 0):")
print(orig_results["confusion_matrices"][0])
print("  Klassenreihenfolge:", orig_results["classes"])

print("\nSynthetische Daten:")
print("  Accuracy mean:", synth_results["accuracies"].mean())
print("  Accuracy std :", synth_results["accuracies"].std())
print("  Confusion Matrix (Split 0):")
print(synth_results["confusion_matrices"][0])
print("  Klassenreihenfolge:", synth_results["classes"])
'''

(38000, 14)
Originaldaten:
  Accuracy mean: 0.7616315789473684
  Accuracy std : 0.004743124488867803
  Confusion Matrix (Split 0):
[[4716  789]
 [1015 1080]]
  Klassenreihenfolge: [0 1]

Synthetische Daten:
  Accuracy mean: 0.7530921052631581
  Accuracy std : 0.0032727204961596275
  Confusion Matrix (Split 0):
[[4550  957]
 [ 926 1167]]
  Klassenreihenfolge: [0 1]


In [10]:
def run_epsilon_study(datasets,          # dict: {"adult": df_adult, "bank": df_bank, ...}
                      epsilons,          # Liste der Epsilon-Werte
                      n_reps=10,
                      n_splits=10):
    """
    Führt für alle Datensätze und alle Epsilon-Werte eine Experimental-Studie durch.
    Gibt ein DataFrame mit Ergebnissen zurück.
    """
    results = []

    for ds_name, df in datasets.items():
        print(f"Dataset: {ds_name}")
        for eps in epsilons:
            for rep in range(n_reps):
                # synthetische Daten generieren
                synth = generate(df, epsilon=eps, random_state=42 + rep)

                # Klassifikations-Performance auf synthetischen Daten messen
                acc_synth = evaluate_df(synth, n_splits=n_splits)

                # Optional: Performance auf Originaldaten (Baseline)
                acc_orig = evaluate_df(df, n_splits=n_splits)

                results.append({
                    "dataset": ds_name,
                    "epsilon": eps,
                    "rep": rep,
                    "accuracy_synth": acc_synth,
                    "accuracy_orig": acc_orig
                })

    return pd.DataFrame(results)

acs = pd.read_csv('data\\acs_mrf.csv')
adult = pd.read_csv('data\\adult_mrf.csv')
br2000 = pd.read_csv('data\\br2000_mrf.csv')
# Beispiel-Setup:
datasets = {
    "acs": acs,
    "adult": adult,
    "br2000": br2000,
 }
epsilons = [0.01, 0.1, 0.2, 0.4, 0.8, 1.0, 2.0, 4.0]
study_results = run_epsilon_study(datasets, epsilons, n_reps=10, n_splits=10)


Dataset: acs
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
(47461, 23)
Dataset: adult
(45222, 15)


In [15]:

import ast

# Dict-Spalten -> mean-Accuracy-Spalten

def extract_mean_accuracy(x):
    # x kann schon ein dict oder ein String sein
    if isinstance(x, dict):
        d = x
    else:
        try:
            d = ast.literal_eval(x)
        except Exception:
            return np.nan

    accs = d.get("accuracies", None)
    if accs is None or len(accs) == 0:
        return np.nan

    return float(np.mean(accs))

study_results["acc_synth_mean"] = study_results["accuracy_synth"].apply(extract_mean_accuracy)
study_results["acc_orig_mean"]  = study_results["accuracy_orig"].apply(extract_mean_accuracy)

# Zusammenfassung pro Datensatz & Epsilon

summary = (
    study_results
    .groupby(["dataset", "epsilon"])
    .agg(
        mean_acc_synth=("acc_synth_mean", "mean"),
        std_acc_synth=("acc_synth_mean", "std"),
        mean_acc_orig=("acc_orig_mean", "mean"),
        std_acc_orig=("acc_orig_mean", "std"),
        n_runs=("rep", "nunique"),
    )
    .reset_index()
)

summary["diff_mean"] = summary["mean_acc_synth"] - summary["mean_acc_orig"]
summary = summary.sort_values(["dataset", "epsilon"])

# Kurz ausgeben & (optional) Pivot-Tabellen für weitere Auswertung 

print("=== Zusammenfassung pro Dataset & Epsilon ===")
print(summary)

pivot_synth = summary.pivot(index="epsilon", columns="dataset", values="mean_acc_synth")
pivot_diff  = summary.pivot(index="epsilon", columns="dataset", values="diff_mean")

print("\n=== Mittelwerte synthetische Accuracy (Zeilen: epsilon, Spalten: Datensätze) ===")
print(pivot_synth)

print("\n=== Mittelwerte Differenz (synthetic - original) ===")
print(pivot_diff)



=== Zusammenfassung pro Dataset & Epsilon ===
   dataset  epsilon  mean_acc_synth  std_acc_synth  mean_acc_orig  \
0      acs     0.01        0.960541       0.035032       0.935626   
1      acs     0.10        0.939149       0.004430       0.935626   
2      acs     0.20        0.938933       0.003146       0.935626   
3      acs     0.40        0.937477       0.001863       0.935626   
4      acs     0.80        0.937901       0.000850       0.935626   
5      acs     1.00        0.937059       0.001245       0.935626   
6      acs     2.00        0.937372       0.000966       0.935626   
7      acs     4.00        0.937624       0.000852       0.935626   
8    adult     0.01        0.941132       0.039721       0.838132   
9    adult     0.10        0.917034       0.008488       0.838132   
10   adult     0.20        0.915504       0.003143       0.838132   
11   adult     0.40        0.915158       0.004970       0.838132   
12   adult     0.80        0.914519       0.002295       