In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Extended split: one JSON record per line, containing only the added memes.
df_ex = pd.read_json("Data/Extended/ex_train.jsonl", lines=True)

# The row count of the extended file is the number of augmented samples.
print("Total number of new/augmented non-hateful memes in Extended Dataset: ", df_ex.shape[0])

Total number of new/augmented non-hateful memes in Extended Dataset:  2479


In [None]:
# Printing label distribution in the original and extended datasets
# The function prints the label counts for the training, validation, and test sets
###############################################################################
# Note that we are not using val_seen.jsonl and val_unseen.jsonl to avoid distribution shift in case of extended dataset
# so to keep the validation set consistent, we are splitting the training set into train and validation
###############################################################################
# It also shows the label distribution in the extended dataset, if applicable
# The function takes a boolean parameter `isExt` to determine if the extended dataset is used
# It also uses a random seed for reproducibility in sampling (same as used in the original train/val split)
def print_stats(isExt, RANDOM_SEED=42):
    """Print label counts for the train, validation and test splits.

    The original training file (plus the extended file when ``isExt`` is
    true) is balanced by downsampling every class to the size of the
    smallest class, then split 80/20 into train and validation sets,
    stratified on ``label``. The test set is the concatenation of
    ``test_seen.jsonl`` and ``test_unseen.jsonl``.

    Parameters
    ----------
    isExt : bool
        When true, ``Data/Extended/ex_train.jsonl`` is appended to the
        original training data before balancing, and an additional
        breakdown of the rows whose ``img`` value contains ``"ex_"`` is
        printed for the train and validation splits.
    RANDOM_SEED : int, default 42
        Seed used for the downsampling, the shuffle and the
        train/validation split.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Load the original training data and optionally append the extended rows.
    train_df = pd.read_json("Data/Original/train.jsonl", lines=True)
    if isExt:
        df_train_ex = pd.read_json("Data/Extended/ex_train.jsonl", lines=True)
        train_df = pd.concat([train_df, df_train_ex], ignore_index=True)

    def balance_dataset(df, label_col='label'):
        """Downsample every class to the minority-class size and shuffle."""
        class_counts = df[label_col].value_counts()
        minority_class = class_counts.idxmin()
        df_minority = df[df[label_col] == minority_class]

        # Keep the minority class whole, then sample every *other* class down
        # to its size. Iterating over the other classes (instead of looking up
        # idxmax) matters when counts are tied: idxmin and idxmax would then
        # name the same class, duplicating it and dropping the rest.
        parts = [df_minority]
        for cls in class_counts.index:
            if cls == minority_class:
                continue
            parts.append(
                df[df[label_col] == cls].sample(n=len(df_minority), random_state=RANDOM_SEED)
            )

        # Concatenate and shuffle
        balanced_df = pd.concat(parts).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
        return balanced_df

    train_df = balance_dataset(train_df)

    # Validation is carved out of the balanced training data (80/20, stratified).
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_SEED, stratify=train_df['label'])

    test_df_seen = pd.read_json("Data/Original/test_seen.jsonl", lines=True)
    test_df_unseen = pd.read_json("Data/Original/test_unseen.jsonl", lines=True)
    test_df = pd.concat([test_df_seen, test_df_unseen], ignore_index=True)

    if not isExt:
        print("(Original) Train/Val/Test Set Label Counts:")
    else:
        print("(Extended) Train/Val/Test Set Label Counts:")
    print("Train:")
    print(train_df.label.value_counts())
    print("Validation:")
    print(val_df.label.value_counts())
    print("Test:")
    print(test_df.label.value_counts())

    if isExt:
        # Rows from the extended file are recognised by the literal "ex_" in
        # their image path; regex=False keeps the match literal and na=False
        # keeps the mask boolean when an img value is missing.
        print("\n\nLabel distribution (original vs aug) in Extended Dataset:")
        print("Train:")
        print(train_df[train_df['img'].str.contains("ex_", regex=False, na=False)].label.value_counts())
        print("Validation:")
        print(val_df[val_df['img'].str.contains("ex_", regex=False, na=False)].label.value_counts())

# Report the original dataset first, then the extended one, with a banner between.
print_stats(isExt=False)
print(f"\n{'#' * 50}\n")
print_stats(isExt=True)

(Original) Train/Val/Test Set Label Counts:
Train:
label
1    4779
0    4779
Name: count, dtype: int64
Validation:
label
1    1195
0    1195
Name: count, dtype: int64
Test:
label
0    1760
1    1240
Name: count, dtype: int64

##################################################

(Extended) Train/Val/Test Set Label Counts:
Train:
label
0    5530
1    5530
Name: count, dtype: int64
Validation:
label
0    1383
1    1383
Name: count, dtype: int64
Test:
label
0    1760
1    1240
Name: count, dtype: int64


Label distribution (original vs aug) in Extended Dataset:
Train:
label
0    1655
Name: count, dtype: int64
Validation:
label
0    381
Name: count, dtype: int64


In [8]:
# Stats for testing dataset
# Load both test files and build the combined test set.
df_seen = pd.read_json("Data/Original/test_seen.jsonl", lines=True)
df_unseen = pd.read_json("Data/Original/test_unseen.jsonl", lines=True)
df_test = pd.concat([df_seen, df_unseen], ignore_index=True)

banner = "#" * 50

print("Test Set Label Counts:")
# Each section is a banner, a heading and the label counts for that frame.
for heading, frame in (
    ("Seen:", df_seen),
    ("Unseen:", df_unseen),
    ("Combined Test:", df_test),
):
    print(banner)
    print(heading)
    print(frame["label"].value_counts())
print(banner)
print("Total number of memes in Test Set: ", len(df_test))

Test Set Label Counts:
##################################################
Seen:
label
0    510
1    490
Name: count, dtype: int64
##################################################
Unseen:
label
0    1250
1     750
Name: count, dtype: int64
##################################################
Combined Test:
label
0    1760
1    1240
Name: count, dtype: int64
##################################################
Total number of memes in Test Set:  3000
