In [None]:
# notebooks/data_exploration.ipynb
import os
import sys

# Put the project root (the parent of the current working directory) on
# sys.path so that project modules such as `src.config` can be imported.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate sys.path entries.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
from src.config import TRAIN_IMAGE_DIR

# Resolve the input and output locations relative to the project root.
data_dir, output_dir = (
    os.path.join(BASE_DIR, folder) for folder in ("data", "output")
)
profiling_output_path = os.path.join(
    output_dir, "train_data_profiling_report.html"
)
train_csv_path = os.path.join(data_dir, "train.csv")

# Read the training table into memory.
df = pd.read_csv(train_csv_path)

# First look: dimensions, column names and the leading rows.
print("Form des Datensatzes:", df.shape)
print("Spalten:", df.columns.tolist())
print(df.head())

# Count how many rows carry each label value and print the table.
label_counts = df["label"].value_counts()
print("\nLabelverteilung:", label_counts, sep="\n")

# Bar chart of the class balance; axis labels are set on the axes object
# returned by the pandas plotting call.
label_counts.plot.bar(title="Verteilung der Labels (0 = real, 1 = KI)").set(
    xlabel="Label", ylabel="Anzahl"
)
plt.tight_layout()
plt.show()

# Build the profiling report for the training table.
profile = ProfileReport(df, title="Trainingsdaten - Profiling Report", explorative=True)

# Make sure the target directory exists before writing; on a fresh checkout
# the output folder may be missing and the write would otherwise fail.
os.makedirs(os.path.dirname(profiling_output_path), exist_ok=True)
profile.to_file(profiling_output_path)

print(f"Profiling-Report gespeichert unter: {profiling_output_path}")


In [None]:
# --- Visuelle Vorschau von echten und KI-generierten Bildern ---

from PIL import Image
from src.config import TRAIN_IMAGE_DIR

def load_image(filename):
    """Load an image from TRAIN_IMAGE_DIR and return it as an RGB PIL image.

    Only the base name of ``filename`` is used; any directory component is
    discarded and the file is looked up directly inside ``TRAIN_IMAGE_DIR``.

    Args:
        filename: File name or path of the image; only its base name matters.

    Returns:
        A ``PIL.Image.Image`` in mode "RGB".
    """
    path = os.path.join(TRAIN_IMAGE_DIR, os.path.basename(filename))
    # Open inside a context manager so the underlying file handle is closed
    # deterministically; convert() returns an independent image object that
    # remains valid after the source file has been closed.
    with Image.open(path) as img:
        return img.convert("RGB")

# Up to `n_preview` examples per class (label 0 = real, label 1 = AI-generated).
# `df` must provide the `label` and `file_name` columns.
n_preview = 5
real_samples = df[df['label'] == 0].head(n_preview)
fake_samples = df[df['label'] == 1].head(n_preview)

fig, axes = plt.subplots(2, n_preview, figsize=(15, 6))
fig.suptitle("Oben: Reale Bilder | Unten: KI-generierte Bilder", fontsize=14)

# Top row: real images, bottom row: AI-generated images. zip() stops at the
# shorter sequence, so a class with fewer than `n_preview` rows no longer
# raises an IndexError; its remaining panels simply stay empty.
for row_axes, samples in zip(axes, (real_samples, fake_samples)):
    for ax, file_name in zip(row_axes, samples["file_name"]):
        ax.imshow(load_image(file_name))

# Hide ticks and frames on every panel, including unused ones.
for ax in axes.flat:
    ax.axis("off")

plt.tight_layout()
plt.subplots_adjust(top=0.85)
plt.show()


In [None]:
import pandas as pd
from src.config import SUBSET_CSV

# Load the subset table. NOTE: this rebinds the notebook-global `df`; cells
# that expect a different table in `df` must be re-run after their own load.
df = pd.read_csv(SUBSET_CSV)

print("Gesamte Zeilen:", len(df))

# Check the expected layout: rows come in consecutive pairs where the first
# row has label == 1 and the second has label == 0.
# The label column is extracted once instead of materialising a full row
# Series twice per iteration.
labels = df["label"].to_numpy()
valid_pairs = 0
for i in range(0, len(labels) - 1, 2):
    l1, l2 = labels[i], labels[i + 1]
    if l1 == 1 and l2 == 0:
        valid_pairs += 1
    else:
        print(f"❌ Ungültiges Paar an Zeile {i}: ({l1}, {l2})")

# With an odd number of rows the last row has no partner; report it instead
# of silently skipping it.
if len(labels) % 2:
    print(f"❌ Zeile {len(labels) - 1} hat keinen Partner (ungerade Zeilenanzahl)")

print("✅ Gültige Paare:", valid_pairs)
