In [3]:
import os
import pandas as pd

# Define paths
dataset_path = r'C:\Users\rohit\Desktop\hate\hateful_memes\img'
labels_file = r'C:\Users\rohit\Desktop\hate\hateful_memes\labels.csv'

# Build the labels CSV only when it is missing, so an existing file is never
# overwritten by this cell.
if not os.path.exists(labels_file):
    print("Labels file not found. Creating a new labels file.")

    # List all PNG images directly inside dataset_path.
    # os.listdir returns entries in arbitrary order, so sort them: the same
    # directory contents then always yield the same filename -> label mapping.
    images = sorted(f for f in os.listdir(dataset_path) if f.endswith('.png'))

    # PLACEHOLDER labels: the two classes simply alternate by position in the
    # sorted listing (even index -> 'hateful', odd index -> 'not_hateful').
    # They carry no information about image content; replace this with the
    # real annotations before using the CSV for training or evaluation.
    labels = ['hateful' if i % 2 == 0 else 'not_hateful' for i in range(len(images))]

    # Create a two-column DataFrame: one row per image.
    data = {'filename': images, 'label': labels}
    df = pd.DataFrame(data)

    # Save to CSV without the DataFrame index column.
    df.to_csv(labels_file, index=False)
    print(f"Created labels file with {len(images)} entries.")
else:
    print("Labels file already exists.")


Labels file not found. Creating a new labels file.
Created labels file with 12140 entries.


In [4]:
import shutil
import random

# Define paths again to ensure consistency
train_path = os.path.join(dataset_path, 'train')
validation_path = os.path.join(dataset_path, 'validation')
train_hateful_path = os.path.join(train_path, 'hateful')
train_not_hateful_path = os.path.join(train_path, 'not_hateful')
validation_hateful_path = os.path.join(validation_path, 'hateful')
validation_not_hateful_path = os.path.join(validation_path, 'not_hateful')

# Split configuration, named once so the asserts and the slices below cannot
# drift apart. The seed makes the shuffle (and therefore the split) repeatable.
TRAIN_PER_CLASS = 2400
VALIDATION_PER_CLASS = 600
REQUIRED_PER_CLASS = TRAIN_PER_CLASS + VALIDATION_PER_CLASS
SPLIT_SEED = 42

# Create train and validation directories if they don't exist
os.makedirs(train_hateful_path, exist_ok=True)
os.makedirs(train_not_hateful_path, exist_ok=True)
os.makedirs(validation_hateful_path, exist_ok=True)
os.makedirs(validation_not_hateful_path, exist_ok=True)

# Read the CSV file (columns used below: 'filename' and 'label')
labels_df = pd.read_csv(labels_file)

# Separate images by class
hateful_images = labels_df[labels_df['label'] == 'hateful']['filename'].tolist()
not_hateful_images = labels_df[labels_df['label'] == 'not_hateful']['filename'].tolist()

# Check we have enough images to fill both splits for each class
assert len(hateful_images) >= REQUIRED_PER_CLASS, "Not enough hateful images"
assert len(not_hateful_images) >= REQUIRED_PER_CLASS, "Not enough not-hateful images"

# Shuffle images with a dedicated, seeded generator so that re-running the
# cell on the same CSV selects the same files, without touching the global
# `random` state used elsewhere.
split_rng = random.Random(SPLIT_SEED)
split_rng.shuffle(hateful_images)
split_rng.shuffle(not_hateful_images)

# Split images: the first TRAIN_PER_CLASS go to training, the next
# VALIDATION_PER_CLASS to validation; anything beyond that is left unused.
train_hateful_images = hateful_images[:TRAIN_PER_CLASS]
validation_hateful_images = hateful_images[TRAIN_PER_CLASS:REQUIRED_PER_CLASS]
train_not_hateful_images = not_hateful_images[:TRAIN_PER_CLASS]
validation_not_hateful_images = not_hateful_images[TRAIN_PER_CLASS:REQUIRED_PER_CLASS]

# Function to move images
def move_images(images, dest_dir):
    """Move each named file from ``dataset_path`` into ``dest_dir``.

    Args:
        images: Iterable of file names, each relative to the module-level
            ``dataset_path`` directory.
        dest_dir: Directory that receives the files; each file keeps its name.

    Prints one "Moved ..." line per file after it has been moved. Any error
    raised by ``shutil.move`` propagates and stops the loop at that file.
    """
    for name in images:
        shutil.move(
            os.path.join(dataset_path, name),
            os.path.join(dest_dir, name),
        )
        print(f"Moved {name} to {dest_dir}")

# Relocate the selected files: training split first, then validation split,
# handling the hateful class before the not-hateful class in each.
print("Moving images to train set:")
for file_names, target_dir in (
    (train_hateful_images, train_hateful_path),
    (train_not_hateful_images, train_not_hateful_path),
):
    move_images(file_names, target_dir)

print("\nMoving images to validation set:")
for file_names, target_dir in (
    (validation_hateful_images, validation_hateful_path),
    (validation_not_hateful_images, validation_not_hateful_path),
):
    move_images(file_names, target_dir)

print("\nDataset splitting and organization completed successfully.")


Moving images to train set:
Moved 30478.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 34952.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 43859.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 38907.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 78293.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 41068.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 71089.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 45708.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 06458.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 41268.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 15630.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 60785.png to C:\Users\rohit\Desktop\hate\hateful_memes\img\train\hateful
Moved 20736.png to C:\Us

In [5]:
def count_images(directory):
    """Return how many files under ``directory`` have a name ending in ".png".

    The directory tree is walked recursively with ``os.walk``, so files in
    nested sub-directories are included. The suffix match is case-sensitive.
    """
    return sum(
        1
        for _current, _subdirs, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith(".png")
    )

# Tally the PNG files in each split/class directory.
(
    train_hateful_count,
    train_not_hateful_count,
    validation_hateful_count,
    validation_not_hateful_count,
) = map(
    count_images,
    (
        train_hateful_path,
        train_not_hateful_path,
        validation_hateful_path,
        validation_not_hateful_path,
    ),
)

# Report one line per split/class combination, train set first.
for class_label, split_name, n_images in (
    ("hateful", "train", train_hateful_count),
    ("not hateful", "train", train_not_hateful_count),
    ("hateful", "validation", validation_hateful_count),
    ("not hateful", "validation", validation_not_hateful_count),
):
    print(f"Number of {class_label} images in the {split_name} set: {n_images}")


Number of hateful images in the train set: 2400
Number of not hateful images in the train set: 2400
Number of hateful images in the validation set: 600
Number of not hateful images in the validation set: 600


In [6]:
!pip install numpy pandas matplotlib opencv-python torch torchvision


