In [1]:
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# ===== CONFIGURATION =====
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.

# Directory the train/val/test split CSVs are written into at the end of this notebook.
DATASET_ROOT = "G:/Project/DeepFake_Video_Detection/Dataset"

# Root that the relative "file_path" entries of the metadata CSVs are resolved against
# (the paths seen in the metadata start with e.g. 'original/', 'Deepfakes/').
ALPHA_DATASET_ROOT = DATASET_ROOT + "/Alpha_Dataset"

# Folder holding the CSV metadata files.
CSV_FOLDER = os.path.join(ALPHA_DATASET_ROOT, "csv")

# Test and validation shares, both expressed as fractions of the FULL dataset.
TEST_SIZE = 0.15
VAL_SIZE = 0.15

# Seed shared by every random operation in this notebook (splits and previews).
RANDOM_SEED = 42

In [3]:
# ===== LOAD ALL CSVs =====
# glob returns matches in arbitrary (filesystem-dependent) order. Sorting keeps the
# row order of full_df stable, which the seeded split further down depends on for
# reproducibility across machines.
csv_files = sorted(glob.glob(os.path.join(CSV_FOLDER, "*.csv")))
if not csv_files:
    # Without this guard pd.concat([]) fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No CSV metadata files found in {CSV_FOLDER!r}")

all_dfs = []
for csv_path in csv_files:
    # Standardize the header names used by the rest of the notebook.
    df = pd.read_csv(csv_path).rename(columns={"File Path": "file_path", "Label": "label"})

    # Fail early, naming the offending file, instead of deep inside a later cell.
    missing_cols = {"file_path", "label"} - set(df.columns)
    if missing_cols:
        raise KeyError(f"{csv_path} is missing required column(s): {sorted(missing_cols)}")

    # Resolve each relative path against the dataset root.
    df["abs_path"] = [os.path.join(ALPHA_DATASET_ROOT, rel_path) for rel_path in df["file_path"]]
    all_dfs.append(df)

# Merge into a single DataFrame; ignore_index gives every row a unique 0..N-1 index.
# NOTE(review): the CSVs also carry an "Unnamed: 0" column -- presumably an index saved
# by an earlier to_csv call. It is kept as-is so the output schema does not change.
full_df = pd.concat(all_dfs, ignore_index=True)

print(f"Total videos listed in CSVs: {len(full_df)}")
print(full_df.head())

Total videos listed in CSVs: 1750
   Unnamed: 0              file_path label  \
0           6  Deepfakes/006_002.mp4  FAKE   
1          22  Deepfakes/022_489.mp4  FAKE   
2          27  Deepfakes/027_009.mp4  FAKE   
3          30  Deepfakes/030_193.mp4  FAKE   
4          32  Deepfakes/032_944.mp4  FAKE   

                                            abs_path  
0  G:/Project/DeepFake_Video_Detection/Dataset/Al...  
1  G:/Project/DeepFake_Video_Detection/Dataset/Al...  
2  G:/Project/DeepFake_Video_Detection/Dataset/Al...  
3  G:/Project/DeepFake_Video_Detection/Dataset/Al...  
4  G:/Project/DeepFake_Video_Detection/Dataset/Al...  


In [4]:
# ===== VERIFY FILES EXIST =====
# Collect every resolved path that is not present on disk. This is a report only:
# rows with missing files are NOT removed from full_df.
missing_files = []
for video_path in full_df["abs_path"]:
    if os.path.exists(video_path):
        continue
    missing_files.append(video_path)

if not missing_files:
    print("✅ All files exist.")
else:
    print(f"[WARNING] Missing {len(missing_files)} files:")
    for missing_path in missing_files:
        print(missing_path)

✅ All files exist.


In [5]:
# ===== SUMMARIZE CLASS DISTRIBUTION =====
# Number of rows per label, most frequent label first.
print("\nClass distribution:")
class_counts = full_df["label"].value_counts()
print(class_counts)


Class distribution:
label
REAL    1000
FAKE     750
Name: count, dtype: int64


In [6]:
# ===== STRATIFIED SPLIT =====
# TEST_SIZE and VAL_SIZE are fractions of the full dataset. Validate them up front:
# the rescaling below divides by (1 - TEST_SIZE) and must yield a value below 1.
if not (0 < TEST_SIZE < 1 and 0 < VAL_SIZE < 1 and TEST_SIZE + VAL_SIZE < 1):
    raise ValueError(
        f"TEST_SIZE ({TEST_SIZE}) and VAL_SIZE ({VAL_SIZE}) must be fractions in (0, 1) "
        "that sum to less than 1."
    )

# NOTE(review): rows are split independently of each other. File names such as
# 'Face2Face/595_597.mp4' and 'FaceShifter/597_595.mp4' look like several clips are
# derived from the same numbered source videos (presumably FaceForensics++-style
# naming -- TODO confirm). If so, related clips can land in different splits and leak
# information from train into val/test; consider a group-aware split keyed on source id.

# First split: train+val vs test
train_val_df, test_df = train_test_split(
    full_df,
    test_size=TEST_SIZE,
    stratify=full_df["label"],
    random_state=RANDOM_SEED
)

# Second split: train vs val. VAL_SIZE refers to the full dataset, so it is rescaled
# to a fraction of the remaining train+val portion.
val_relative_size = VAL_SIZE / (1 - TEST_SIZE)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=val_relative_size,
    stratify=train_val_df["label"],
    random_state=RANDOM_SEED
)

# Sanity checks: the three splits must partition full_df. The overlap check relies on
# full_df having a unique index, which holds when it is built with ignore_index=True.
assert len(train_df) + len(val_df) + len(test_df) == len(full_df), \
    "split sizes do not add up to the full dataset"
assert (
    train_df.index.intersection(val_df.index).empty
    and train_df.index.intersection(test_df.index).empty
    and val_df.index.intersection(test_df.index).empty
), "train/val/test splits overlap"

In [7]:
# ===== PRINT COUNTS =====
def summarize_split(name, df):
    """Print a one-line summary of a split: total row count and rows per label.

    Args:
        name: Human-readable split name used as the line prefix (e.g. "Train").
        df: DataFrame with a "label" column.

    Returns:
        None. The summary is written to stdout.
    """
    total_rows = len(df)
    # value_counts orders labels by descending frequency; to_dict yields plain ints.
    per_label = df["label"].value_counts()
    label_totals = per_label.to_dict()
    print(f"{name} set: {total_rows} samples | {label_totals}")

# Report size and label counts for each split, in train -> validation -> test order.
for split_name, split_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    summarize_split(split_name, split_df)

Train set: 1224 samples | {'REAL': 700, 'FAKE': 524}
Validation set: 263 samples | {'REAL': 150, 'FAKE': 113}
Test set: 263 samples | {'REAL': 150, 'FAKE': 113}


In [8]:
# ===== PREVIEW SAMPLES =====
# Show five seeded random rows from each split as a quick visual sanity check.
for preview_header, split_df in (
    ("\nSample Train entries:", train_df),
    ("\nSample Validation entries:", val_df),
    ("\nSample Test entries:", test_df),
):
    print(preview_header)
    print(split_df.sample(5, random_state=RANDOM_SEED))


Sample Train entries:
      Unnamed: 0              file_path label  \
1097         347       original/347.mp4  REAL   
942          192       original/192.mp4  REAL   
1216         466       original/466.mp4  REAL   
239          595  Face2Face/595_597.mp4  FAKE   
787           37       original/037.mp4  REAL   

                                               abs_path  
1097  G:/Project/DeepFake_Video_Detection/Dataset/Al...  
942   G:/Project/DeepFake_Video_Detection/Dataset/Al...  
1216  G:/Project/DeepFake_Video_Detection/Dataset/Al...  
239   G:/Project/DeepFake_Video_Detection/Dataset/Al...  
787   G:/Project/DeepFake_Video_Detection/Dataset/Al...  

Sample Validation entries:
      Unnamed: 0                file_path label  \
1730         980         original/980.mp4  REAL   
386          597  FaceShifter/597_595.mp4  FAKE   
1059         309         original/309.mp4  REAL   
831           81         original/081.mp4  REAL   
1545         795         original/795.mp4  REAL   


In [9]:
# ===== OPTIONAL: Save splits to CSV =====
# Write one CSV per split into DATASET_ROOT, omitting the DataFrame index.
for csv_name, split_df in (
    ("train_split.csv", train_df),
    ("val_split.csv", val_df),
    ("test_split.csv", test_df),
):
    split_csv_path = os.path.join(DATASET_ROOT, csv_name)
    split_df.to_csv(split_csv_path, index=False)

print("\n✅ Stage 1 completed successfully.")



✅ Stage 1 completed successfully.
