In [16]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
    --------------------------------------- 1.3/56.8 MB 13.4 MB/s eta 0:00:05
   ---- ----------------------------------- 6.8/56.8 MB 26.2 MB/s eta 0:00:02
   -------- ------------------------------- 12.6/56.8 MB 25.5 MB/s eta 0:00:02
   ------------ --------------------------- 18.4/56.8 MB 28.3 MB/s eta 0:00:02
   ------------- -------------------------- 19.1/56.8 MB 21.6 MB/s eta 0:00:02
   -------------- ------------------------- 20.2/56.8 MB 18.5 MB/s eta 0:00:02
   -------------- ------------------------- 21.0/56.8 MB 16.2 MB/s eta 0:00:03
   --------------- ------------------------ 21.8/56.8 MB 14.2 MB/s eta 0:00:03
   ------------------ --------------------- 26.7/56.8 MB 15.3 MB/s eta 0:00:02
   -------------------- ------------------- 29.6/56.8 MB 15.4 MB/s eta 0:0



In [3]:
# Cell 1: imports & configuration
import os, io, math, random
from pathlib import Path
import numpy as np
import cv2
from PIL import Image, ImageChops
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib

# ML
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

RANDOM_STATE = 42
DATASET_DIR = Path(r"C:\Users\mujta\OneDrive\Desktop\dataset")   # <- change if needed; expects dataset/real and dataset/fake
OUTPUT_DIR = Path("./outputs"); OUTPUT_DIR.mkdir(exist_ok=True)
IMAGE_SIZE = (224, 224)   # used for CNN embedding/transfer learning
ELA_QUALITY = 90

In [4]:
# Cell 2: feature extraction helpers
import scipy.stats as stats

def load_rgb(path, size=None):
    img = cv2.imread(str(path))
    if img is None:
        raise ValueError(f"Cannot open: {path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    if size is not None:
        img = cv2.resize(img, size, interpolation=cv2.INTER_AREA)
    return img

def compute_ela_pil(img_rgb, quality=ELA_QUALITY):
    # img_rgb: numpy RGB image uint8
    pil = Image.fromarray(img_rgb)
    buf = io.BytesIO()
    pil.save(buf, format='JPEG', quality=quality)
    buf.seek(0)
    compressed = Image.open(buf).convert("RGB")
    ela = ImageChops.difference(pil.convert("RGB"), compressed)
    # amplify to full 0-255
    extrema = ela.getextrema()
    max_diff = max([e[1] for e in extrema]) or 1
    scale = 255.0 / max_diff
    ela = ela.point(lambda i: int(min(255, i * scale)))
    ela_np = np.asarray(ela)
    return ela_np

def highpass_cv2(img_rgb):
    gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY).astype(np.float32)
    lap = cv2.Laplacian(gray, cv2.CV_32F)
    lap_abs = np.abs(lap)
    lap_norm = np.uint8(255 * lap_abs / (lap_abs.max() + 1e-9))
    kernel = np.array([[-1,-1,-1],[-1,8,-1],[-1,-1,-1]], np.float32)
    hp = cv2.filter2D(gray, -1, kernel)
    hp_abs = np.abs(hp)
    hp_norm = np.uint8(255 * hp_abs / (hp_abs.max() + 1e-9))
    return np.maximum(lap_norm, hp_norm)

def fft_features(img_rgb, top_k=20):
    gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY).astype(np.float32)
    f = np.fft.fft2(gray)
    fshift = np.fft.fftshift(f)
    magnitude = np.abs(fshift)
    log_mag = np.log1p(magnitude)
    mean = log_mag.mean()
    std = log_mag.std()
    skew = float(stats.skew(log_mag.reshape(-1)))
    kurt = float(stats.kurtosis(log_mag.reshape(-1)))
    h, w = log_mag.shape
    cy, cx = h//2, w//2
    r = min(cy, cx)
    Y, X = np.ogrid[:h, :w]
    dist = np.sqrt((Y - cy)**2 + (X - cx)**2)
    low_mask = dist <= (0.1*r)
    mid_mask = (dist > (0.1*r)) & (dist <= (0.4*r))
    high_mask = dist > (0.4*r)
    low_e = log_mag[low_mask].sum()/(low_mask.sum()+1e-9)
    mid_e = log_mag[mid_mask].sum()/(mid_mask.sum()+1e-9)
    high_e = log_mag[high_mask].sum()/(high_mask.sum()+1e-9)
    flat = log_mag.reshape(-1)
    topk = np.sort(flat)[-top_k:].sum()
    return np.array([mean, std, skew, kurt, low_e, mid_e, high_e, topk], dtype=np.float32)

# single-image feature extractor combining ELA, HP, FFT, simple stats
def extract_handcrafted_features(img_rgb, use_ela=True):
    feats = []
    if use_ela:
        try:
            ela = compute_ela_pil(img_rgb)
            ela_gray = cv2.cvtColor(ela, cv2.COLOR_RGB2GRAY) if ela.ndim==3 else ela
            feats += [float(ela_gray.mean()), float(ela_gray.std()), float(np.percentile(ela_gray,90))]
        except Exception as e:
            feats += [0.0, 0.0, 0.0]
    hp = highpass_cv2(img_rgb)
    feats += [float(hp.mean()), float(hp.std()), float(np.percentile(hp,90))]
    fftf = fft_features(img_rgb)
    feats = np.concatenate([np.array(feats, dtype=np.float32), fftf])
    return feats


In [5]:
# Cell 3: optional CNN embeddings using TensorFlow Keras (fast if you have GPU)
HAS_TF = False
try:
    import tensorflow as tf
    from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
    from tensorflow.keras.layers import GlobalAveragePooling2D
    from tensorflow.keras.models import Model
    HAS_TF = True
except Exception as e:
    print("TensorFlow not available or failed to import. CNN embeddings disabled.", e)

cnn_embedding_model = None
if HAS_TF:
    base = ResNet50(weights='imagenet', include_top=False, input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
    x = GlobalAveragePooling2D()(base.output)
    cnn_embedding_model = Model(inputs=base.input, outputs=x)
    print("Loaded ResNet50 for embeddings.")

def cnn_embed(img_rgb):
    if not HAS_TF or cnn_embedding_model is None:
        return np.zeros(2048, dtype=np.float32)
    img = cv2.resize(img_rgb, IMAGE_SIZE)
    arr = np.expand_dims(img.astype(np.float32), 0)
    arr = preprocess_input(arr)
    emb = cnn_embedding_model.predict(arr, verbose=0)
    return emb.reshape(-1)


Loaded ResNet50 for embeddings.


In [6]:
# Cell 4: build features and labels from dataset/real and dataset/fake
def build_features(dataset_dir=DATASET_DIR, use_cnn=HAS_TF, max_per_class=None, verbose=True):
    X, y, paths = [], [], []
    for label_name, label_idx in [('real',0), ('fake',1)]:
        folder = dataset_dir / label_name
        if not folder.exists():
            raise FileNotFoundError(f"Expected folder: {folder}")
        files = [p for p in folder.iterdir() if p.suffix.lower() in ('.jpg','.jpeg','.png')]
        if max_per_class:
            files = files[:max_per_class]
        if verbose:
            print(label_name, len(files), "images")
        for p in tqdm(files, desc=f"Processing {label_name}"):
            try:
                img = load_rgb(p, size=IMAGE_SIZE)  # resize to IMAGE_SIZE for speed; FFT works fine
                hf = extract_handcrafted_features(img, use_ela=True)
                if use_cnn:
                    emb = cnn_embed(img)
                    feats = np.concatenate([hf, emb])
                else:
                    feats = hf
                X.append(feats)
                y.append(label_idx)
                paths.append(str(p))
            except Exception as e:
                print("Error:", p, e)
    X = np.vstack(X).astype(np.float32)
    y = np.array(y, dtype=np.int32)
    return X, y, paths

# Run feature building
X, y, paths = build_features(DATASET_DIR, use_cnn=HAS_TF, max_per_class=None, verbose=True)
print("Feature matrix:", X.shape, "Labels:", y.shape)

real 475 images


Processing real: 100%|███████████████████████████████████████████████████████████████| 475/475 [01:12<00:00,  6.51it/s]


fake 475 images


Processing fake: 100%|███████████████████████████████████████████████████████████████| 475/475 [01:10<00:00,  6.70it/s]

Feature matrix: (950, 2062) Labels: (950,)





In [7]:
# Cell 5: train RandomForest classifier on features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)

scaler = StandardScaler()
clf = RandomForestClassifier(n_estimators=250, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)
pipe = Pipeline([('scaler', scaler), ('clf', clf)])

print("Training RandomForest...")
pipe.fit(X_train, y_train)

# Evaluate
y_pred = pipe.predict(X_test)
try:
    y_proba = pipe.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_proba)
    print("ROC AUC:", auc)
except Exception:
    y_proba = None

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['real','fake']))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Save
joblib.dump(pipe, OUTPUT_DIR/"rf_detector.joblib")
print("Saved RF model to outputs/rf_detector.joblib")

Training RandomForest...
ROC AUC: 0.41036011080332413
Accuracy: 0.4421052631578947
              precision    recall  f1-score   support

        real       0.44      0.43      0.44        95
        fake       0.44      0.45      0.45        95

    accuracy                           0.44       190
   macro avg       0.44      0.44      0.44       190
weighted avg       0.44      0.44      0.44       190

Confusion matrix:
 [[41 54]
 [52 43]]
Saved RF model to outputs/rf_detector.joblib


In [8]:
# Cell 6: Transfer learning model (MobileNetV2). Requires TensorFlow.
if HAS_TF:
    import tensorflow as tf
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
    from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
    from tensorflow.keras.models import Model
    from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

    # Create simple ImageDataGenerators using folders (will use DATASET_DIR with subfolders real/fake)
    batch_size = 16
    train_gen = ImageDataGenerator(preprocessing_function=preprocess_input,
                                   validation_split=0.2,
                                   rotation_range=5,
                                   width_shift_range=0.02,
                                   height_shift_range=0.02,
                                   brightness_range=(0.9,1.1),
                                   horizontal_flip=False)

    train_flow = train_gen.flow_from_directory(str(DATASET_DIR), target_size=IMAGE_SIZE,
                                               batch_size=batch_size, class_mode='binary', subset='training', shuffle=True, seed=RANDOM_STATE)
    val_flow = train_gen.flow_from_directory(str(DATASET_DIR), target_size=IMAGE_SIZE,
                                             batch_size=batch_size, class_mode='binary', subset='validation', shuffle=False, seed=RANDOM_STATE)

    base = MobileNetV2(weights='imagenet', include_top=False, input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
    x = GlobalAveragePooling2D()(base.output)
    x = Dropout(0.3)(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(base.input, out)

    # Freeze base first
    for layer in base.layers:
        layer.trainable = False

    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    ckpt = ModelCheckpoint(str(OUTPUT_DIR/'mobilenet_finetune.h5'), save_best_only=True, monitor='val_loss')
    es = EarlyStopping(patience=6, restore_best_weights=True)

    steps = math.ceil(train_flow.samples / batch_size)
    vsteps = math.ceil(val_flow.samples / batch_size)
    print("Training MobileNetV2 (frozen base)...")
    model.fit(train_flow, epochs=12, validation_data=val_flow, steps_per_epoch=steps, validation_steps=vsteps, callbacks=[ckpt, es])

    # Unfreeze last blocks and fine-tune
    for layer in base.layers[-40:]:
        layer.trainable = True
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    print("Fine-tuning last layers...")
    model.fit(train_flow, epochs=12, validation_data=val_flow, steps_per_epoch=steps, validation_steps=vsteps, callbacks=[ckpt, es])

    # Save final model
    model.save(str(OUTPUT_DIR/'mobilenet_final.h5'))
    print("Saved MobileNet model to outputs/mobilenet_final.h5")

else:
    print("TensorFlow not installed, skipping transfer-learning CNN.")


Found 760 images belonging to 2 classes.
Found 190 images belonging to 2 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Training MobileNetV2 (frozen base)...


  self._warn_if_super_not_called()


Epoch 1/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step - accuracy: 0.4585 - loss: 0.8521



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 387ms/step - accuracy: 0.4590 - loss: 0.8513 - val_accuracy: 0.4316 - val_loss: 0.7571
Epoch 2/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 384ms/step - accuracy: 0.5681 - loss: 0.7090 - val_accuracy: 0.5053 - val_loss: 0.7976
Epoch 3/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 395ms/step - accuracy: 0.5550 - loss: 0.7263 - val_accuracy: 0.4789 - val_loss: 0.7763
Epoch 4/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313ms/step - accuracy: 0.5760 - loss: 0.6895 



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 404ms/step - accuracy: 0.5757 - loss: 0.6897 - val_accuracy: 0.4632 - val_loss: 0.7434
Epoch 5/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 417ms/step - accuracy: 0.5591 - loss: 0.7075 - val_accuracy: 0.5211 - val_loss: 0.7516
Epoch 6/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313ms/step - accuracy: 0.5923 - loss: 0.6953 



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 404ms/step - accuracy: 0.5923 - loss: 0.6950 - val_accuracy: 0.5105 - val_loss: 0.7384
Epoch 7/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 405ms/step - accuracy: 0.5498 - loss: 0.6802 - val_accuracy: 0.5000 - val_loss: 0.8018
Epoch 8/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 405ms/step - accuracy: 0.6032 - loss: 0.6578 - val_accuracy: 0.5211 - val_loss: 0.8440
Epoch 9/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 381ms/step - accuracy: 0.5970 - loss: 0.6421 - val_accuracy: 0.4737 - val_loss: 0.7614
Epoch 10/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308ms/step - accuracy: 0.5471 - loss: 0.6811 



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 400ms/step - accuracy: 0.5479 - loss: 0.6806 - val_accuracy: 0.5211 - val_loss: 0.7345
Epoch 11/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 409ms/step - accuracy: 0.6034 - loss: 0.6549 - val_accuracy: 0.5211 - val_loss: 0.7868
Epoch 12/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 403ms/step - accuracy: 0.6424 - loss: 0.6226 - val_accuracy: 0.5211 - val_loss: 0.8054
Fine-tuning last layers...
Epoch 1/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 514ms/step - accuracy: 0.5257 - loss: 0.7788 - val_accuracy: 0.4632 - val_loss: 0.7357
Epoch 2/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 470ms/step - accuracy: 0.5660 - loss: 0.7148 - val_accuracy: 0.4684 - val_loss: 0.7422
Epoch 3/12
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 501ms/step - accuracy: 0.6059 - loss: 0.6870 - val_accuracy: 0.5105 - val_loss: 0.7409
Epoc



Saved MobileNet model to outputs/mobilenet_final.h5


In [13]:
# Cell 7: ensemble / predict helper
def predict_image_with_rf(img_path, rf_pipe=pipe):
    img = load_rgb(img_path, size=IMAGE_SIZE)
    feats = extract_handcrafted_features(img, use_ela=True)
    if HAS_TF:
        emb = cnn_embed(img)
        feats = np.concatenate([feats, emb])
    feats = feats.reshape(1, -1)
    pred = rf_pipe.predict(feats)[0]
    proba = rf_pipe.predict_proba(feats)[0,1]
    return pred, proba

# Example:
sample = Path(r"C:\Users\mujta\OneDrive\Desktop\00000888_in.jpg")
print("Sample path:", sample)
p, prob = predict_image_with_rf(sample)
print("RF prediction:", "fake" if p==1 else "real", "prob(fake)=", prob)


Sample path: C:\Users\mujta\OneDrive\Desktop\00000888_in.jpg
RF prediction: real prob(fake)= 0.476


In [14]:
# ======================================================
# Brazilian ID Fraud Detection (CPU-friendly)
# Features: ELA + FFT + High-pass + ResNet50 embeddings
# Classifier: Logistic Regression
# ======================================================

import os, io, cv2
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib

# Sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score
)

# TensorFlow / Keras for ResNet
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model

# PIL for ELA
from PIL import Image, ImageChops

# -----------------------------
# Paths
# -----------------------------
DATASET_DIR = Path(r"C:\Users\mujta\OneDrive\Desktop\dataset")   # Change if needed
real_dir = Path(DATASET_DIR) / "real"
fake_dir = Path(DATASET_DIR) / "fake"
os.makedirs("outputs", exist_ok=True)

# -----------------------------
# Feature extractors
# -----------------------------

def extract_ela_features(img_path, quality=90, size=(128,128)):
    try:
        img = Image.open(img_path).convert("RGB")
    except:
        return np.zeros(3)

    # Save to JPEG in memory
    buf = io.BytesIO()
    img.save(buf, "JPEG", quality=quality)
    buf.seek(0)
    compressed = Image.open(buf)

    ela_img = ImageChops.difference(img, compressed)
    ela_img = ela_img.resize(size)
    ela_gray = np.array(ela_img.convert("L"))

    return np.array([
        ela_gray.mean(),
        ela_gray.std(),
        np.percentile(ela_gray, 95)
    ], dtype=np.float32)

def extract_fft_features(img_path, size=(128,128)):
    try:
        img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, size)
    except:
        return np.zeros(3)

    f = np.fft.fft2(img)
    fshift = np.fft.fftshift(f)
    magnitude = 20*np.log(np.abs(fshift) + 1)

    return np.array([
        magnitude.mean(),
        magnitude.std(),
        np.percentile(magnitude, 95)
    ], dtype=np.float32)

def extract_highpass_features(img_path, size=(128,128)):
    try:
        img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, size)
    except:
        return np.zeros(3)

    lap = cv2.Laplacian(img, cv2.CV_64F)
    return np.array([
        lap.mean(),
        lap.std(),
        np.percentile(lap, 95)
    ], dtype=np.float32)

# -----------------------------
# ResNet50 feature extractor
# -----------------------------
base_model = ResNet50(weights="imagenet", include_top=False, pooling="avg")
model = Model(inputs=base_model.input, outputs=base_model.output)

def extract_resnet_embedding(img_path, target_size=(224,224)):
    try:
        img = image.load_img(img_path, target_size=target_size)
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        feat = model.predict(x, verbose=0)
        return feat.flatten()
    except:
        return np.zeros(model.output_shape[1], dtype=np.float32)

# -----------------------------
# Build dataset
# -----------------------------
X, y = [], []

def process_dir(directory, label):
    for p in tqdm(list(Path(directory).glob("*"))):
        try:
            f_ela = extract_ela_features(p)
            f_fft = extract_fft_features(p)
            f_hp  = extract_highpass_features(p)
            f_res = extract_resnet_embedding(p)
            features = np.concatenate([f_ela, f_fft, f_hp, f_res])
            X.append(features)
            y.append(label)
        except Exception as e:
            print("⚠️ Error with:", p, e)

print("Processing real images...")
process_dir(real_dir, 0)
print("Processing fake images...")
process_dir(fake_dir, 1)

X, y = np.array(X), np.array(y)
print("Final feature matrix:", X.shape)

# -----------------------------
# Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# -----------------------------
# Train classifier
# -----------------------------
clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(X_train, y_train)

# -----------------------------
# Evaluate
# -----------------------------
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]

print("\n--- Evaluation ---")
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Accuracy:", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# -----------------------------
# Save model
# -----------------------------
joblib.dump(clf, "outputs/fraud_detector.joblib")
print("\n✅ Saved model to outputs/fraud_detector.joblib")


Processing real images...


100%|████████████████████████████████████████████████████████████████████████████████| 475/475 [01:00<00:00,  7.88it/s]


Processing fake images...


100%|████████████████████████████████████████████████████████████████████████████████| 475/475 [01:13<00:00,  6.47it/s]


Final feature matrix: (950, 2057)

--- Evaluation ---
ROC AUC: 0.6072096917167339
Accuracy: 0.5508771929824562
              precision    recall  f1-score   support

           0       0.55      0.59      0.57       143
           1       0.55      0.51      0.53       142

    accuracy                           0.55       285
   macro avg       0.55      0.55      0.55       285
weighted avg       0.55      0.55      0.55       285

Confusion matrix:
[[84 59]
 [69 73]]

✅ Saved model to outputs/fraud_detector.joblib


In [20]:
# ======================================================
# Brazilian ID Fraud Detection - Full Pipeline (Fixed)
# ======================================================

import os, cv2, io
import numpy as np
from tqdm import tqdm
from pathlib import Path
from PIL import Image, ImageChops, ImageEnhance

# ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
from xgboost import XGBClassifier

# Torch for embeddings
import torch
import torchvision.models as models
import torchvision.transforms as transforms

# -----------------------------
# Config
# -----------------------------
dataset_path = Path(r"C:\Users\mujta\OneDrive\Desktop\dataset")   # inside: dataset/real , dataset/fake
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

IMG_SIZE = (128, 128)  # fixed size for feature extraction

# -----------------------------
# Image Processing Functions
# -----------------------------
def ela_features(image_path, quality=90):
    """Error Level Analysis with fixed size"""
    image = Image.open(image_path).convert("RGB").resize(IMG_SIZE)
    buffer = io.BytesIO()
    image.save(buffer, "JPEG", quality=quality)
    compressed = Image.open(buffer)
    ela_img = ImageChops.difference(image, compressed)
    extrema = ela_img.getextrema()
    max_diff = max([ex[1] for ex in extrema])
    scale = 255.0 / max_diff if max_diff != 0 else 1
    ela_img = ImageEnhance.Brightness(ela_img).enhance(scale)
    return np.array(ela_img).flatten()

def fft_features(image_path):
    """FFT magnitude spectrum stats (fixed size)"""
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None: 
        return np.zeros(5)
    img = cv2.resize(img, IMG_SIZE)
    f = np.fft.fft2(img)
    fshift = np.fft.fftshift(f)
    magnitude = 20*np.log(np.abs(fshift) + 1)
    return np.array([
        np.mean(magnitude),
        np.std(magnitude),
        np.median(magnitude),
        np.max(magnitude),
        np.min(magnitude)
    ])

def highpass_features(image_path):
    """High-pass filter features (fixed size)"""
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None: 
        return np.zeros(5)
    img = cv2.resize(img, IMG_SIZE)
    dft = cv2.dft(np.float32(img), flags=cv2.DFT_COMPLEX_OUTPUT)
    dft_shift = np.fft.fftshift(dft)
    rows, cols = img.shape
    crow, ccol = rows//2, cols//2
    mask = np.ones((rows, cols, 2), np.uint8)
    mask[crow-10:crow+10, ccol-10:ccol+10] = 0
    fshift = dft_shift * mask
    f_ishift = np.fft.ifftshift(fshift)
    img_back = cv2.idft(f_ishift)
    img_back = cv2.magnitude(img_back[:,:,0], img_back[:,:,1])
    return np.array([
        np.mean(img_back),
        np.std(img_back),
        np.median(img_back),
        np.max(img_back),
        np.min(img_back)
    ])

# -----------------------------
# ResNet Embeddings
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc = torch.nn.Identity()  # remove classifier
resnet = resnet.to(device)
resnet.eval()

transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
])

def deep_features(image_path):
    img = Image.open(image_path).convert("RGB").resize((224,224))
    tensor = transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        feat = resnet(tensor).cpu().numpy().flatten()
    return feat

# -----------------------------
# Feature Extraction
# -----------------------------
def extract_features(folder, label):
    features, labels = [], []
    files = list(Path(folder).glob("*_in.jpg"))
    if len(files) == 0:
        print(f"⚠️ No images found in {folder}")
        return np.empty((0,)), np.empty((0,))

    for f in tqdm(files, desc=f"Processing {label} images"):
        f = str(f)
        try:
            feat_ela = ela_features(f)
            feat_fft = fft_features(f)
            feat_hp  = highpass_features(f)
            feat_res = deep_features(f)
            all_feat = np.concatenate([feat_ela, feat_fft, feat_hp, feat_res])
            features.append(all_feat)
            labels.append(label)
        except Exception as e:
            print(f"⚠️ Skipped {f}: {e}")
            continue

    return np.array(features), np.array(labels)

# -----------------------------
# Load Dataset
# -----------------------------
X_real, y_real = extract_features(os.path.join(dataset_path, "real"), 0)
X_fake, y_fake = extract_features(os.path.join(dataset_path, "fake"), 1)

print("Real:", X_real.shape, "Fake:", X_fake.shape)

X = np.vstack([X_real, X_fake])
y = np.concatenate([y_real, y_fake])

print("Final feature matrix:", X.shape)

# -----------------------------
# Train/Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# -----------------------------
# Train XGBoost
# -----------------------------
clf = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method="hist",
    n_jobs=-1
)

clf.fit(X_train, y_train)

# -----------------------------
# Evaluation
# -----------------------------
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]

print("\n--- Evaluation (XGBoost) ---")
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Accuracy:", clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

# -----------------------------
# Save Model
# -----------------------------
joblib.dump(clf, os.path.join(output_dir, "xgb_detector.joblib"))
print("\n✅ Saved XGBoost model to outputs/xgb_detector.joblib")

# -----------------------------
# Predict Single Image
# -----------------------------
def predict_image(image_path, model_path="outputs/xgb_detector.joblib"):
    model = joblib.load(model_path)
    feat_ela = ela_features(image_path)
    feat_fft = fft_features(image_path)
    feat_hp  = highpass_features(image_path)
    feat_res = deep_features(image_path)
    all_feat = np.concatenate([feat_ela, feat_fft, feat_hp, feat_res]).reshape(1,-1)
    pred = model.predict(all_feat)[0]
    proba = model.predict_proba(all_feat)[0][1]
    label = "FAKE" if pred == 1 else "REAL"
    print(f"Prediction: {label} (prob={proba:.3f})")
    return label, proba

# Example usage:
# predict_image(r"C:\Users\mujta\OneDrive\Desktop\dataset\fake\0001_in.jpg")


Processing 0 images: 100%|███████████████████████████████████████████████████████████| 475/475 [00:45<00:00, 10.40it/s]
Processing 1 images: 100%|███████████████████████████████████████████████████████████| 475/475 [00:52<00:00,  9.08it/s]


Real: (475, 49674) Fake: (475, 49674)
Final feature matrix: (950, 49674)

--- Evaluation (XGBoost) ---
ROC AUC: 0.4576972323451196
Accuracy: 0.4666666666666667
              precision    recall  f1-score   support

           0       0.47      0.48      0.48       143
           1       0.46      0.45      0.46       142

    accuracy                           0.47       285
   macro avg       0.47      0.47      0.47       285
weighted avg       0.47      0.47      0.47       285

Confusion matrix:
[[69 74]
 [78 64]]

✅ Saved XGBoost model to outputs/xgb_detector.joblib


In [None]:
# ======================================================
# Hyperparameter Tuning for XGBoost
# ======================================================

from sklearn.model_selection import RandomizedSearchCV

# Define parameter search space
param_dist = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [3, 4, 5, 6, 8],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2, 0.3]
}

clf = XGBClassifier(
    random_state=42,
    tree_method="hist",
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="logloss"
)

# RandomizedSearchCV (faster than full grid)
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=10,              # fewer candidates
    scoring="roc_auc",
    cv=2,                   # fewer CV folds
    verbose=2,
    random_state=42,
    n_jobs=2                # limit parallel jobs
)

print("\n🔍 Running hyperparameter search...")
random_search.fit(X_train, y_train)

# Best model
best_clf = random_search.best_estimator_
print("\n✅ Best parameters found:", random_search.best_params_)

# Save tuned model
joblib.dump(best_clf, os.path.join(output_dir, "xgb_tuned.joblib"))
print("✅ Saved tuned model to outputs/xgb_tuned.joblib")

# -----------------------------
# Evaluate Best Model
# -----------------------------
y_pred = best_clf.predict(X_test)
y_proba = best_clf.predict_proba(X_test)[:,1]

print("\n--- Evaluation (Tuned XGBoost) ---")
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Accuracy:", best_clf.score(X_test, y_test))
print(classification_report(y_test, y_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))



🔍 Running hyperparameter search...
Fitting 2 folds for each of 10 candidates, totalling 20 fits
