In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os


In [None]:
DATA_DIR = "../data"  
SPLIT_DIR = "../data/split" 

In [24]:
datasets = {
    "CoAID": os.path.join(DATA_DIR, "CoAID_Cleaned.csv"),
    "FakeNewsNet": os.path.join(DATA_DIR, "FakeNewsNet_Cleaned.csv"),
    "WELFake": os.path.join(DATA_DIR, "WELFake_Cleaned.csv")
}

## Standard Split Function

In [25]:
def standard_split(df, label_col="label", test_size=0.2, val_size=0.1, random_state=42):                                # Function to split dataset into train, validation, and test sets
    train_df, temp_df = train_test_split(df, test_size=test_size, random_state=random_state, stratify=df[label_col])    # Splitting into train and temp (val+test)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=random_state, stratify=temp_df[label_col])  # Splitting temp into val and test
    return train_df, val_df, test_df        

## Iterate Over Datasets

In [27]:
for name, path in datasets.items():                                             # Iterating over each dataset
    print(f"\nProcessing {name} dataset...")    
    df = pd.read_csv(path)              
    os.makedirs(os.path.join(SPLIT_DIR, name), exist_ok=True)                   # Creating directory for splits if not exists
    
    train_df, val_df, test_df = standard_split(df, label_col="label")           # Performing standard split
    
    train_df.to_csv(os.path.join(SPLIT_DIR, name, "train.csv"), index=False)    # Saving splits to CSV files
    val_df.to_csv(os.path.join(SPLIT_DIR, name, "val.csv"), index=False)
    test_df.to_csv(os.path.join(SPLIT_DIR, name, "test.csv"), index=False)
    
    print(f"Saved standard splits: train({len(train_df)}), val({len(val_df)}), test({len(test_df)})")

print("\nAll datasets processed and saved under 'data/split/'")


Processing CoAID dataset...
Saved standard splits: train(2463), val(308), test(308)

Processing FakeNewsNet dataset...
Saved standard splits: train(182), val(23), test(23)

Processing WELFake dataset...
Saved standard splits: train(50942), val(6368), test(6368)

All datasets processed and saved under 'data/split/'


## Advanced Open-World Setup (Split)

In [None]:
SPLIT_DIR = "../data/split"                        # Already contains standard splits
datasets = ["CoAID", "FakeNewsNet", "WELFake"]

## Advanced Open-World Split Function

In [29]:
def advanced_open_world_split(df, source_col="source", topic_col="category", unseen_ratio=0.2, random_state=42):     # Function to create open-world splits
    sources = df[source_col].unique()                                                                                # Unique sources in the dataset
    train_sources, unseen_sources = train_test_split(sources, test_size=unseen_ratio, random_state=random_state)     # Splitting sources into seen and unseen
    
    topics = df[topic_col].unique()                                                                                  # Unique topics in the dataset
    train_topics, unseen_topics = train_test_split(topics, test_size=unseen_ratio, random_state=random_state)        # Splitting topics into seen and unseen
    
    ow_train = df[df[source_col].isin(train_sources) & df[topic_col].isin(train_topics)].reset_index(drop=True)      # Training set with seen sources and topics
    ow_test = df[(df[source_col].isin(unseen_sources)) | (df[topic_col].isin(unseen_topics))].reset_index(drop=True) # Test set with unseen sources or topics
    
    return ow_train, ow_test

## Iterate Over Datasets

In [30]:
for name in datasets:                                                                               # Iterating over each dataset
    print(f"\nProcessing {name} dataset for advanced open-world setup...")
    path = os.path.join(SPLIT_DIR, name, "train.csv")                                               # use standard train as base
    df = pd.read_csv(path)                                                                          # Loading the training data
    
    source_col = "source" if "source" in df.columns else df.columns[0]                              # Determining source column
    topic_col = "category" if "category" in df.columns else df.columns[0]                           # Determining topic column
    
    ow_train, ow_test = advanced_open_world_split(df, source_col=source_col, topic_col=topic_col)   # Creating open-world splits
    
    ow_train.to_csv(os.path.join(SPLIT_DIR, name, "open_world_train.csv"), index=False)             # Saving open-world splits
    ow_test.to_csv(os.path.join(SPLIT_DIR, name, "open_world_test.csv"), index=False)               # Saving open-world splits
    
    print(f"Saved advanced open-world splits: train({len(ow_train)}), test({len(ow_test)})")        # Logging the sizes

print("\nAdvanced open-world setup complete!")


Processing CoAID dataset for advanced open-world setup...
Saved advanced open-world splits: train(1968), test(495)

Processing FakeNewsNet dataset for advanced open-world setup...
Saved advanced open-world splits: train(148), test(34)

Processing WELFake dataset for advanced open-world setup...
Saved advanced open-world splits: train(40753), test(10189)

Advanced open-world setup complete!
