In [11]:
import cv2
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Dataset folders, relative to the working directory. Each name carries a
# trailing backslash separator exactly as the rest of the script expects.
fake_video_directory = 'Celeb-synthesis\\'
real_video_directories = [
    'Celeb-real\\',
    'YouTube-real\\',
]

# Accumulates one record per video: file path, label, and video properties.
video_list = []

def get_video_properties(video_path):
    """Read basic stream metadata from a video file with OpenCV.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        A ``(resolution, frame_count, fps)`` tuple. ``resolution`` is a
        ``"<width>x<height>"`` string, ``frame_count`` is an int, and ``fps``
        is the value reported by OpenCV. If the file cannot be opened the
        result is ``(None, None, None)``.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None, None, None
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
    finally:
        # Release on every path -- the early return above and any error raised
        # while querying properties -- so the capture handle is never leaked.
        cap.release()
    return f"{width}x{height}", frames, fps

# Pair every source directory with its class label (1 = real, 0 = fake).
# Building the pairs explicitly keeps labels aligned with their directories
# even if entries are added to or removed from real_video_directories.
labeled_directories = [(directory, 1) for directory in real_video_directories]
labeled_directories.append((fake_video_directory, 0))

# Add real and fake videos to the list with appropriate labels and properties
for directory, label in labeled_directories:
    # os.listdir() returns entries in arbitrary order; sorting fixes the row
    # order so the seeded train/validation split below is reproducible.
    for video_file in sorted(os.listdir(directory)):
        if not video_file.endswith('.mp4'):
            continue
        path = os.path.join(directory, video_file)
        # Videos that cannot be opened come back as (None, None, None) and are
        # still recorded, just without properties.
        resolution, frame_count, fps = get_video_properties(path)
        video_list.append({
            'path': path,
            'label': label,
            'resolution': resolution,
            'frame_count': frame_count,
            'fps': fps,
        })

# Create a DataFrame with one row per discovered video
df = pd.DataFrame(video_list)

# Read the list of test videos: one whitespace-separated "<label> <path>" pair
# per line, with no header row.
test_list_path = 'List_of_testing_videos.txt'
# sep=r'\s+' is the supported equivalent of the deprecated delim_whitespace=True.
test_videos = pd.read_csv(test_list_path, sep=r'\s+', names=['label', 'path'])

# Normalise the listed paths to the platform's separator style so they compare
# equal to the os.path.join()-built paths stored in df. Without this, a list
# written with a different separator matches nothing and every test video
# silently stays in the train/validation pool.
# NOTE(review): assumes the list's paths are relative to the same root as the
# dataset directories -- confirm against the list file.
test_videos['path'] = test_videos['path'].map(os.path.normpath)

# Mark the videos in the main DataFrame as test videos
df['set'] = 'train_val'
df.loc[df['path'].isin(test_videos['path']), 'set'] = 'test'

# Keep only the rows that were not reserved for the test set
train_val_df = df[df['set'] != 'test']

# Split the remaining rows 80/20 into train and validation sets, stratified on
# the label so both sets keep the same real/fake proportion. The fixed
# random_state makes the split repeatable for a given row order.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=28,
    stratify=train_val_df['label'],
)

# Both results are derived from a filtered view of df. Take explicit copies
# before writing a column so the assignments below act on independent frames
# and do not raise pandas' SettingWithCopyWarning.
train_df = train_df.copy()
val_df = val_df.copy()

# Add set labels for train and val
train_df['set'] = 'train'
val_df['set'] = 'val'

# Attach the scanned video properties to the test list, matching rows on
# 'path'. The test list's own 'label' column is the one kept: df's 'label' is
# dropped before merging so the two frames share only the join key.
# (To keep both label columns for comparison instead, merge against the full
# df and pass suffixes=('', '_from_df').)
test_videos = pd.merge(
    test_videos,
    df.drop(columns=['label']),
    how='left',
    on='path',
)

# Every row that came from the test list belongs to the test set.
test_videos = test_videos.assign(set='test')

# Stack the train, validation, and test frames into a single table; columns
# are aligned by name.
final_df = pd.concat([train_df, val_df, test_videos])

# Write the combined table first, then each split, to its own CSV file.
# The DataFrame index is not written.
csv_outputs = [
    ('full_dataset.csv', final_df),
    ('train_data.csv', train_df),
    ('val_data.csv', val_df),
    ('test_data.csv', test_videos),
]
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)


ValueError: columns overlap but no suffix specified: Index(['label'], dtype='object')