<a href="https://colab.research.google.com/github/lujain618/Deepfake_Detection_With_XAI/blob/main/DatasetLoading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives
# under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import necessary libraries
import os
import cv2
import random
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# build Balanced_dataset.csv listing 890 fake videos sampled from Celeb-synthesis and 890 real videos (590 from Celeb-real and 300 from YouTube-real)

In [None]:
# Build a CSV manifest (video_path, label) with every real video and a
# fixed-size random subset of the fake videos.
dataset_dir = "/content/drive/MyDrive/DeepfakeDataset"
celeb_synthesis_dir = os.path.join(dataset_dir, "Celeb-synthesis")
celeb_real_dir = os.path.join(dataset_dir, "Celeb-real")
youtube_real_dir = os.path.join(dataset_dir, "YouTube-real")

# os.listdir returns entries in arbitrary order; sorting makes the manifest
# (and the seeded sample below) identical on every run.
real_videos = [os.path.join(celeb_real_dir, video) for video in sorted(os.listdir(celeb_real_dir))]
real_videos += [os.path.join(youtube_real_dir, video) for video in sorted(os.listdir(youtube_real_dir))]
fake_videos = [os.path.join(celeb_synthesis_dir, video) for video in sorted(os.listdir(celeb_synthesis_dir))]

# Randomly select 890 fake videos. A dedicated, seeded generator keeps the
# selection reproducible without touching the global `random` state.
num_fake_videos = 890
if len(fake_videos) < num_fake_videos:
    raise ValueError(
        f"Need {num_fake_videos} fake videos but only found {len(fake_videos)} in '{celeb_synthesis_dir}'"
    )
fake_videos = random.Random(42).sample(fake_videos, num_fake_videos)

# create labels: 0 for real, 1 for fake
real_labels = [0] * len(real_videos)
fake_labels = [1] * len(fake_videos)

# combine into a single dataset (real videos first, then fake)
video_paths = real_videos + fake_videos
labels = real_labels + fake_labels

# create a Pandas DataFrame with one row per video
df = pd.DataFrame({'video_path': video_paths, 'label': labels})

# save to CSV file next to the raw video folders
csv_filename = os.path.join(dataset_dir, 'Balanced_dataset.csv')
df.to_csv(csv_filename, index=False)
print(f"CSV file '{csv_filename}' created successfully with {len(df)} entries.")

CSV file '/content/drive/MyDrive/DeepfakeDataset/Balanced_dataset.csv' created successfully with 1780 entries.


In [None]:
# model selection: split the dataset into 70% train, 20% val and 10% test

In [None]:
# Load the manifest from the location where it was written. A bare
# "Balanced_dataset.csv" would be resolved against the current working
# directory, which is not the Drive folder on a fresh Colab runtime.
csv_path = "/content/drive/MyDrive/DeepfakeDataset/Balanced_dataset.csv"
df = pd.read_csv(csv_path)

# Split into training (70%) and temp set (30%); stratify keeps the real/fake
# ratio the same in both parts, random_state makes the split repeatable.
train_df, temp_df = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)

# Further split temp set into validation (20% of total) and test (10% of total):
# one third of the 30% temp set is 10% of the whole dataset.
val_df, test_df = train_test_split(temp_df, test_size=1/3, stratify=temp_df['label'], random_state=42)

# Print dataset sizes
print(f"Training set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")
print(f"Test set: {len(test_df)} samples")

# Save the splits alongside the manifest on Drive
train_df.to_csv("/content/drive/MyDrive/DeepfakeDataset/train.csv", index=False)
val_df.to_csv("/content/drive/MyDrive/DeepfakeDataset/val.csv", index=False)
test_df.to_csv("/content/drive/MyDrive/DeepfakeDataset/test.csv", index=False)

print("Splits saved as train.csv, val.csv, and test.csv")

Training set: 1246 samples
Validation set: 356 samples
Test set: 178 samples
Splits saved as train.csv, val.csv, and test.csv


In [None]:
# Extracting 30 frames from each video and resizing them to 112x112

In [None]:
# Load train.csv; the loop below reads its "video_path" and "label" columns.
df = pd.read_csv("/content/drive/MyDrive/DeepfakeDataset/train.csv")

# Create the root output directory for extracted frames
# (exist_ok=True makes re-running this cell harmless).
output_dir = "/content/drive/MyDrive/DeepfakeDataset/train_frames"
os.makedirs(output_dir, exist_ok=True)

def extract_frames(video_path, save_path, num_frames=30, size=(112, 112)):
    """Save the first frames of a video as resized JPEG files.

    Frames are read sequentially from the start of the video (no temporal
    sub-sampling), resized to ``size`` and written to ``save_path`` as
    ``frame_00.jpg``, ``frame_01.jpg``, ... ``save_path`` must already exist.

    Args:
        video_path: Path of the video file to read.
        save_path: Existing directory that receives the JPEG frames.
        num_frames: Maximum number of frames to extract.
        size: Target ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        The number of frames actually written. This is smaller than
        ``num_frames`` when the video is shorter, cannot be opened (0), or a
        frame could not be written to disk.
    """
    cap = cv2.VideoCapture(video_path)
    written = 0

    try:
        # An unreadable or missing video yields a capture that is not opened.
        if not cap.isOpened():
            return written

        for index in range(num_frames):
            ret, frame = cap.read()
            if not ret:
                # End of stream (or decode failure): stop early.
                break

            # Resize frame to the requested size (112x112 by default)
            frame = cv2.resize(frame, size)

            # Save frame; the file name is tied to the frame position so
            # names stay stable even if an individual write fails.
            frame_filename = os.path.join(save_path, f"frame_{index:02d}.jpg")
            if cv2.imwrite(frame_filename, frame):
                written += 1
    finally:
        # Always free the decoder, even if resize/imwrite raised.
        cap.release()

    return written

# Walk the training manifest and write each video's frames to
# <output_dir>/<label>/<video file name without extension>/
for clip_path, clip_label in zip(df["video_path"], df["label"]):
    # The label becomes a folder name, so it is converted to a string.
    clip_stem = os.path.splitext(os.path.basename(clip_path))[0]
    frames_dir = os.path.join(output_dir, str(clip_label), clip_stem)

    # makedirs creates the label folder and the video folder in one call.
    os.makedirs(frames_dir, exist_ok=True)

    # Extract and save frames for this video
    extract_frames(clip_path, frames_dir)

print("Frame extraction complete!")

Frame extraction complete!


In [None]:
# apply min-max normalization and one hot encoding

In [None]:
# Directories
# input_dir: frames laid out as <label>/<video name>/<frame file>, as walked by the loop below.
# output_dir: receives the processed frames as .npy files in the same layout.
# NOTE: this rebinds `output_dir`, which the frame-extraction cell also assigns.
input_dir = "/content/drive/MyDrive/DeepfakeDataset/train_frames"
output_dir = "/content/drive/MyDrive/DeepfakeDataset/train_processed"
os.makedirs(output_dir, exist_ok=True)

def minmax_normalize(image):
    """Return a float32 copy of ``image`` with every value divided by 255.

    For 8-bit images this maps the pixel range [0, 255] onto [0, 1].
    The input array is left untouched.
    """
    scaled = image.astype(np.float32)  # astype makes a fresh copy
    scaled /= 255.0
    return scaled

def one_hot_encode(image):
    """One-hot encode the intensity levels of a grayscale image.

    Args:
        image: NumPy array. A 2-D array is treated as a grayscale image;
            anything else (e.g. a 3-channel colour frame) is returned as is.

    Returns:
        For a 2-D input, an array of shape ``(H, W, 256)`` with a single 1.0
        per pixel at the index of its intensity level. For any other input,
        the same ``image`` object, unchanged.

    Notes:
        Integer images are used directly as level indices. Floating-point
        images are assumed to be normalized to [0, 1] (as produced by
        dividing 8-bit pixels by 255) and are mapped back to the nearest of
        the 256 levels; previously such input raised ``IndexError`` because
        floats cannot be used as array indices.
    """
    if image.ndim != 2:
        # Colour images already have a channel axis, so return as is
        return image

    if np.issubdtype(image.dtype, np.floating):
        # Undo the [0, 1] scaling; rint + clip guard against float round-off.
        levels = np.clip(np.rint(image * 255.0), 0, 255).astype(np.intp)
    else:
        levels = image

    # Row i of the identity matrix is the one-hot vector for level i.
    return np.eye(256)[levels]

# Process all frames: mirror the <label>/<video>/<frame> layout of input_dir
# under output_dir, storing each normalized frame as a .npy array.
for label in os.listdir(input_dir):  # 0 or 1
    label_folder = os.path.join(input_dir, label)
    if not os.path.isdir(label_folder):
        # Ignore stray files next to the label folders.
        continue
    processed_label_folder = os.path.join(output_dir, label)
    os.makedirs(processed_label_folder, exist_ok=True)

    for video_name in os.listdir(label_folder):
        video_folder = os.path.join(label_folder, video_name)
        if not os.path.isdir(video_folder):
            continue
        processed_video_folder = os.path.join(processed_label_folder, video_name)
        os.makedirs(processed_video_folder, exist_ok=True)

        for frame_name in os.listdir(video_folder):
            frame_path = os.path.join(video_folder, frame_name)
            frame = cv2.imread(frame_path)  # Read frame
            if frame is None:
                # cv2.imread signals an unreadable/corrupt file by returning
                # None instead of raising; skip it rather than crash mid-run.
                print(f"Skipping unreadable frame: {frame_path}")
                continue

            # Normalize and One-hot encode
            frame = minmax_normalize(frame)
            frame = one_hot_encode(frame)

            # Save processed frame as .npy. Only the file extension is
            # swapped; a plain str.replace on the full path would also
            # rewrite any ".jpg" that appears in a directory name.
            processed_frame_name = os.path.splitext(frame_name)[0] + ".npy"
            processed_frame_path = os.path.join(processed_video_folder, processed_frame_name)
            np.save(processed_frame_path, frame)

print("Processing complete! Frames are saved in 'train_processed'.")


Processing complete! Frames are saved in 'train_processed'.
