In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the read-only input mount and print its
# full path, one per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install scikit-image pywavelets umap-learn

In [None]:
import os
import re

import cv2
import numpy as np
import pandas as pd
import pywt
import umap
from scipy.stats import skew, kurtosis
from skimage.feature import graycomatrix, graycoprops
from skimage.measure import regionprops, label
from sklearn.covariance import EmpiricalCovariance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, accuracy_score, precision_score, recall_score
)
from sklearn.metrics.pairwise import cosine_distances
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [None]:
# Dataset locations under the Kaggle input mount. The two roots are factored
# out so each directory constant only spells its own sub-folder.
_TRAIN_ROOT = "/kaggle/input/gan-train-data/ImageCLEF25_GAN_Detect_Training_Data_Usage_training-dataset"
_TEST_ROOT = "/kaggle/input/gan-test-data"

TRAIN_GEN_DIR = f"{_TRAIN_ROOT}/generated"
TRAIN_REAL_USED = f"{_TRAIN_ROOT}/real_used"
TRAIN_REAL_UNUSED = f"{_TRAIN_ROOT}/real_not_used"
TEST_REAL_UNKNOWN = f"{_TEST_ROOT}/real_unknown"
TEST_GEN_DIR = f"{_TEST_ROOT}/generated"

# Passed as `target_size` to load_img, so every image is resized to this on load.
IMG_SIZE = (128, 128)

def load_and_preprocess(folder):
    """Load every file in ``folder`` as an image and stack them into one array.

    Files are read in natural filename order (digit runs compared as
    integers, text compared case-insensitively) instead of raw ``os.listdir``
    order, which is arbitrary. Row ``i`` of the result therefore belongs to
    the ``i``-th naturally sorted filename. This is the same ordering the
    submission cell uses when it pairs file names with predictions, so the
    two stay aligned.

    Parameters
    ----------
    folder : str
        Directory to read. Every entry is passed to ``load_img``, so it is
        expected to contain image files only.

    Returns
    -------
    np.ndarray
        float32 array with one image per row along axis 0. Each image is
        resized to ``IMG_SIZE`` and divided by 255.

    Raises
    ------
    ValueError
        Raised by ``np.stack`` when ``folder`` contains no files.
    """
    def _natural_key(fname):
        # Must stay identical to the key used when the submission file is
        # written; otherwise predictions and file names drift apart.
        return [int(p) if p.isdigit() else p.lower()
                for p in re.split(r'(\d+)', fname)]

    imgs = []
    for fn in sorted(os.listdir(folder), key=_natural_key):
        im = load_img(os.path.join(folder, fn), target_size=IMG_SIZE)
        im = img_to_array(im) / 255.0
        imgs.append(im.astype(np.float32))
    return np.stack(imgs)


In [None]:
# Load the four image sets (synthetic train, real-used, real-unused, and the
# unlabeled real test images) in one pass over their source directories.
X_synth_train, X_real_used, X_real_unused, X_real_unknown = [
    load_and_preprocess(src_dir)
    for src_dir in (TRAIN_GEN_DIR, TRAIN_REAL_USED, TRAIN_REAL_UNUSED, TEST_REAL_UNKNOWN)
]

# Restore the saved autoencoder and expose its 'latent' layer as a standalone
# encoder model that maps an input image to its latent representation.
autoencoder = load_model('/kaggle/input/resnet_encoder_2/keras/default/1/autoencoder_model_test_train.keras')
latent_layer = autoencoder.get_layer('latent')
encoder = Model(inputs=autoencoder.input, outputs=latent_layer.output)

In [None]:
def extract_radiomic_features(img):
    """Compute first-order intensity statistics and GLCM texture descriptors.

    Parameters
    ----------
    img : np.ndarray
        Colour image that is multiplied by 255 and cast to uint8 here, so it
        is expected to hold values in [0, 1]. Channels are treated as RGB,
        which is the order ``load_img`` produces.

    Returns
    -------
    dict
        ``mean``, ``var``, ``skew`` and ``kurtosis`` of the grayscale pixel
        values, plus ``glcm_<prop>_<i>`` for each of contrast, correlation,
        energy and homogeneity, where ``i`` indexes the four GLCM angles
        (0, 45, 90 and 135 degrees) at pixel distance 1.
    """
    # The image comes from load_img, i.e. RGB channel order. COLOR_BGR2GRAY
    # would swap the red/blue luminance weights, so convert with RGB2GRAY.
    gray = cv2.cvtColor((img * 255).astype(np.uint8), cv2.COLOR_RGB2GRAY)
    pixels = gray.ravel()
    feats = {
        'mean': np.mean(pixels), 'var': np.var(pixels),
        'skew': skew(pixels), 'kurtosis': kurtosis(pixels)
    }
    glcm = graycomatrix(gray, distances=[1],
                        angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
                        levels=256, symmetric=True, normed=True)
    for prop in ('contrast', 'correlation', 'energy', 'homogeneity'):
        # One distance x four angles, flattened to four values per property.
        for i, v in enumerate(graycoprops(glcm, prop).ravel()):
            feats[f'glcm_{prop}_{i}'] = v
    return feats

def extract_gabor_features(img, frequencies=(0.1, 0.3, 0.5), thetas=(0, np.pi/4, np.pi/2)):
    """Apply a bank of Gabor filters and return mean & variance of each response.

    Parameters
    ----------
    img : np.ndarray
        Colour image that is multiplied by 255 and cast to uint8 here, so it
        is expected to hold values in [0, 1]. Channels are treated as RGB,
        which is the order ``load_img`` produces.
    frequencies : sequence of float
        Spatial frequencies; each kernel uses wavelength ``1 / freq``, so a
        frequency of 0 is not valid.
    thetas : sequence of float
        Kernel orientations in radians.

    Returns
    -------
    dict
        ``gabor_mean_f<freq>_t<theta>`` and ``gabor_var_f<freq>_t<theta>``
        for every (frequency, orientation) pair, both formatted to two
        decimals.
    """
    # The image comes from load_img, i.e. RGB channel order. COLOR_BGR2GRAY
    # would swap the red/blue luminance weights, so convert with RGB2GRAY.
    gray = cv2.cvtColor((img * 255).astype(np.uint8), cv2.COLOR_RGB2GRAY)
    feats = {}
    for freq in frequencies:
        for theta in thetas:
            kern = cv2.getGaborKernel((21, 21), sigma=4.0, theta=theta,
                                      lambd=1/freq, gamma=0.5, psi=0)
            # Filter in float32 so negative responses are not clipped.
            resp = cv2.filter2D(gray, cv2.CV_32F, kern)
            feats[f'gabor_mean_f{freq:.2f}_t{theta:.2f}'] = np.mean(resp)
            feats[f'gabor_var_f{freq:.2f}_t{theta:.2f}'] = np.var(resp)
    return feats

def extract_wavelet_features(img, wavelet='db1', level=2):
    """Decompose the grayscale image into wavelet sub-bands and return their energies.

    Parameters
    ----------
    img : np.ndarray
        Colour image that is multiplied by 255 and cast to uint8 here, so it
        is expected to hold values in [0, 1]. Channels are treated as RGB,
        which is the order ``load_img`` produces.
    wavelet : str
        Wavelet name passed to ``pywt.wavedec2``.
    level : int
        Number of decomposition levels.

    Returns
    -------
    dict
        ``wavelet_energy_approx`` (sum of squared approximation coefficients)
        and, for each detail tuple ``i`` (1-based, in the order ``wavedec2``
        returns them), ``wavelet_energy_level<i>_h``, ``_v`` and ``_d``.
    """
    # The image comes from load_img, i.e. RGB channel order. COLOR_BGR2GRAY
    # would swap the red/blue luminance weights, so convert with RGB2GRAY.
    gray = cv2.cvtColor((img * 255).astype(np.uint8), cv2.COLOR_RGB2GRAY)
    coeffs = pywt.wavedec2(gray, wavelet=wavelet, level=level)
    # coeffs[0] is the approximation band; coeffs[1:] are (cH, cV, cD) tuples.
    feats = {'wavelet_energy_approx': np.sum(coeffs[0]**2)}
    for i, (cH, cV, cD) in enumerate(coeffs[1:], 1):
        feats[f'wavelet_energy_level{i}_h'] = np.sum(cH**2)
        feats[f'wavelet_energy_level{i}_v'] = np.sum(cV**2)
        feats[f'wavelet_energy_level{i}_d'] = np.sum(cD**2)
    return feats

def extract_morphological_features(img, thresh=0.5):
    """Otsu-binarise the image and summarise its connected foreground components.

    Parameters
    ----------
    img : np.ndarray
        Colour image that is multiplied by 255 and cast to uint8 here, so it
        is expected to hold values in [0, 1]. Channels are treated as RGB,
        which is the order ``load_img`` produces.
    thresh : float
        Unused. The binarisation threshold is chosen automatically by Otsu's
        method; the parameter is kept only so existing call sites keep working.

    Returns
    -------
    dict
        ``comp_count`` (number of labelled components), ``area_mean`` and
        ``area_std`` of the component areas. Both area statistics are 0 when
        no component is found.
    """
    # The image comes from load_img, i.e. RGB channel order. COLOR_BGR2GRAY
    # would swap the red/blue luminance weights, so convert with RGB2GRAY.
    gray = cv2.cvtColor((img * 255).astype(np.uint8), cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold argument (0) is ignored; foreground
    # pixels are set to 1.
    _, bw = cv2.threshold(gray, 0, 1, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    areas = [region.area for region in regionprops(label(bw))]
    return {
        'comp_count': len(areas),
        'area_mean':  np.mean(areas) if areas else 0,
        'area_std':   np.std(areas)  if areas else 0
    }

In [None]:
def extract_all_features(img):
    """Run every handcrafted extractor on ``img`` and merge the results.

    The extractors run in a fixed order (radiomic, Gabor, wavelet,
    morphological), which also fixes the key order of the returned dict.
    """
    extractors = (
        extract_radiomic_features,
        extract_gabor_features,
        extract_wavelet_features,
        extract_morphological_features,
    )
    merged = {}
    for extractor in extractors:
        merged.update(extractor(img))
    return merged

In [None]:
# Labelled real images: "used" rows first (label 1), then "not used" (label 0).
X_real = np.concatenate((X_real_used, X_real_unused), axis=0)
y_real = np.repeat([1, 0], [len(X_real_used), len(X_real_unused)])

# Latent representation of every real image, one named column per dimension.
latents = encoder.predict(X_real, batch_size=32)
latent_names = ['latent_%d' % dim for dim in range(latents.shape[1])]
df_latent = pd.DataFrame(latents, columns=latent_names)

# Handcrafted descriptors: one dict per image, turned into one row per image.
hc_feats = list(map(extract_all_features, X_real))
df_hc = pd.DataFrame(hc_feats)

# Side-by-side feature table: latent columns followed by handcrafted columns.
F_real = pd.concat([df_latent, df_hc], axis=1)

In [None]:
# Fit a random forest on all labelled real images and rank every feature by
# the forest's impurity-based importance; keep the 20 strongest.
clf = RandomForestClassifier(random_state=42, n_estimators=200).fit(F_real, y_real)

importances = pd.Series(data=clf.feature_importances_, index=F_real.columns)
ranked = importances.sort_values(ascending=False)
top20 = ranked.head(20)
print("Top-20 features:\n", top20)

In [None]:
# Violin plots of the five most important features, one panel per feature,
# each comparing the "used" and "not used" groups.
import matplotlib.pyplot as plt

used_mask = (y_real == 1)
unused_mask = (y_real == 0)
for slot, feat in enumerate(top20.index[:5], start=1):
    ax = plt.subplot(1, 5, slot)
    ax.violinplot([F_real.loc[used_mask, feat], F_real.loc[unused_mask, feat]])
    ax.set_title(feat)
    ax.set_xticks([1, 2])
    ax.set_xticklabels(['used', 'not'])
plt.tight_layout()
plt.show()

# 2-D UMAP embedding of the ten most important features, coloured by label.
um = umap.UMAP(n_components=2, random_state=42)
top10_cols = top20.index[:10]
emb = um.fit_transform(F_real[top10_cols])
plt.scatter(emb[:, 0], emb[:, 1], c=y_real, cmap='coolwarm', alpha=0.7)
plt.show()

In [None]:
# --- Reference distribution: every synthetic image (training + test) --------
all_syn = np.concatenate([
    X_synth_train,
    load_and_preprocess(TEST_GEN_DIR)
], axis=0)
syn_feats = [extract_all_features(img) for img in all_syn]
syn_lat = encoder.predict(all_syn, batch_size=32)
df_syn = pd.concat([
    pd.DataFrame(syn_lat, columns=df_latent.columns),
    pd.DataFrame(syn_feats)
], axis=1)

# Covariance model of the synthetic images, restricted to the top-20 features.
cov = EmpiricalCovariance().fit(df_syn[top20.index])


def maha(x):
    """Mahalanobis score of a single feature row against the synthetic reference.

    Kept for single-row use; the batch computation below goes through
    ``_meta_features``. The value is whatever ``EmpiricalCovariance.mahalanobis``
    returns (scikit-learn documents it as the squared distance).
    """
    return cov.mahalanobis(x[top20.index].values.reshape(1, -1))[0]


def _meta_features(images, latent_df, feature_df):
    """Build the three meta-features for a batch of images.

    * ``recon``       - per-image mean squared autoencoder reconstruction error
    * ``latent_dist`` - cosine distance to the nearest synthetic latent vector
    * ``maha``        - Mahalanobis score on the top-20 features

    Using one code path for labelled and unknown images guarantees the
    classifier is trained and applied on identically computed columns.
    """
    recon = np.mean((images - autoencoder.predict(images)) ** 2, axis=(1, 2, 3))
    latent_dist = cosine_distances(latent_df, df_syn[df_latent.columns]).min(axis=1)
    # One vectorised call for the whole batch instead of one call per row;
    # passing the named columns also keeps them matched to the ones seen in fit.
    maha_score = cov.mahalanobis(feature_df[top20.index])
    return pd.DataFrame({'recon': recon, 'latent_dist': latent_dist, 'maha': maha_score})


# --- Unknown real images -----------------------------------------------------
df_lat_u = pd.DataFrame(encoder.predict(X_real_unknown, batch_size=32),
                        columns=df_latent.columns)
hc_u = pd.DataFrame([extract_all_features(img) for img in X_real_unknown])
F_unknown = pd.concat([df_lat_u, hc_u], axis=1)

meta_u = _meta_features(X_real_unknown, df_lat_u, F_unknown)
# Individual columns kept under their original names.
recon_u, ld_u, ma_u = (meta_u[col].to_numpy() for col in ('recon', 'latent_dist', 'maha'))

# --- Labelled real images: train the meta-classifier -------------------------
meta_real = _meta_features(X_real, df_latent, F_real)
meta_clf = RandomForestClassifier(n_estimators=100, random_state=42)
meta_clf.fit(meta_real, y_real)

# predict on unknowns
y_pred_unknown = meta_clf.predict(meta_u)
print("Predicted labels for real_unknown:", y_pred_unknown)

Predicted labels for real_unknown: [0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1
 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0
 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0]

In [None]:
import re

def natural_key(fname):
    """Sort key that orders embedded numbers numerically ("a2" before "a10").

    The name is split on runs of digits; digit runs become ints and all other
    pieces are lower-cased, so text is compared case-insensitively.
    """
    key = []
    for chunk in re.split(r'(\d+)', fname):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key

# File names of the unknown real images in natural order.
# NOTE(review): this assumes y_pred_unknown is in the same natural filename
# order - confirm against the order in which the images were loaded.
fnames = os.listdir(TEST_REAL_UNKNOWN)
fnames.sort(key=natural_key)

# One (file name, predicted label) pair per row; written without index or header.
rows = [(name, pred) for name, pred in zip(fnames, y_pred_unknown)]
df_sub = pd.DataFrame(rows)
df_sub.to_csv('run.csv', index=False, header=False)

# Model Evaluation
* Because the final classifier is fit on the entire labelled training set (200 real images), scoring it on that same data would not give a valid estimate of how well it generalizes.
* I will use the following to validate it:
    * Cross-validation
    * Hold-out validation

# 1. Hold-out

In [None]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, cohen_kappa_score

# 1. Split the labelled examples: 70% train / 30% hold-out, stratified so both
#    parts keep the same used/not-used ratio.
X_tr, X_va, y_tr, y_va = train_test_split(
    meta_real, y_real,
    test_size=0.3,
    stratify=y_real,
    random_state=42
)

# 2. Train an unfitted copy with the same hyper-parameters. Refitting meta_clf
#    itself would silently replace the model trained on all labelled data, so
#    any later meta_clf.predict(...) would use the weaker 70% model.
holdout_clf = clone(meta_clf)
holdout_clf.fit(X_tr, y_tr)

# 3. Evaluate on the held-out examples.
# NOTE(review): the 'maha' column is built from features ranked by a forest
# that saw every labelled image, including these hold-out rows, so the scores
# below may be optimistic.
y_pred_va = holdout_clf.predict(X_va)
print("Hold-out validation:")
print(" F1 Score      :", f1_score(y_va, y_pred_va))
print(" Cohen's Kappa :", cohen_kappa_score(y_va, y_pred_va))


# 2. Cross-validation (5-fold)

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, cohen_kappa_score

# Fixed, shuffled, stratified 5-fold split shared by both metrics.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# cross_val_score has no built-in name for Cohen's kappa, so wrap it as a scorer.
kappa_scorer = make_scorer(cohen_kappa_score)


def _cv_scores(metric):
    """Per-fold scores of the meta-classifier for one scoring metric."""
    return cross_val_score(meta_clf, meta_real, y_real, cv=cv, scoring=metric)


f1_scores = _cv_scores('f1')
kappa_scores = _cv_scores(kappa_scorer)

print("5-Fold CV results:")
print(f" F1 Score mean      : {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")
print(f" Cohen's Kappa mean : {kappa_scores.mean():.4f} ± {kappa_scores.std():.4f}")
