In [1]:
import pandas as pd
import re
import json
from sklearn.model_selection import train_test_split
import argparse
import os


In [2]:
def clean_text(text):
    """
    Clean the input text by:
      - Converting to lowercase
      - Removing URLs, mentions, hashtags, punctuation, and extra spaces
    """
    # Lowercase the text
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions (e.g., @username)
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (you may choose to keep the word; here we remove the '#' symbol)
    text = re.sub(r'#', '', text)
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [10]:
def format_for_lm(row):
    """
    Format a row for language model fine-tuning.
    Creates a prompt-response pair suitable for instruction fine-tuning.
    Includes a 3-dimensional vector representing the three classification labels.
    """
    # Get the class label
    class_label = row['class']
    
    # Create the instruction/prompt
    question = f"{row['tweet']}"
    
    # Create a 3-dimensional vector representing the labels
    # Normalize the counts to create a probability distribution
    total_annotations = row['hate_speech'] + row['offensive_language'] + row['neither']
    if total_annotations > 0:  # Avoid division by zero
        label_vector = [
            float(row['hate_speech']) / total_annotations,
            float(row['offensive_language']) / total_annotations,
            float(row['neither']) / total_annotations
        ]
    else:
        label_vector = [0.0, 0.0, 0.0]
    
    return {
        "Question": question,
        "Label": label_vector
    }


In [8]:
def process_data(input_csv, output_dir, test_size=0.2, val_size=0.1, random_state=42):
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Read the CSV file; adjust quoting if necessary
    df = pd.read_csv(input_csv, quotechar='"')
    
    # Ensure required columns exist
    required_columns = ['tweet', 'class', 'hate_speech', 'offensive_language', 'neither']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' is missing from the dataset.")
    
    # Drop rows with missing values in required columns
    df = df.dropna(subset=required_columns)
    
    # Clean the tweet text
    df['clean_tweet'] = df['tweet'].apply(clean_text)
    print(df)
    
    # Print the class distribution
    print("Class distribution:")
    print(df['class'].value_counts())
    
    # Split the data into training+validation and test sets
    train_val_df, test_df = train_test_split(
        df, test_size=test_size, stratify=df['class'], random_state=random_state
    )
    
    # Further split training+validation into training and validation sets.
    # Calculate relative validation size from the remaining data.
    val_relative_size = val_size / (1 - test_size)
    train_df, val_df = train_test_split(
        train_val_df, test_size=val_relative_size, stratify=train_val_df['class'], random_state=random_state
    )
    
    # Format data for language model fine-tuning
    train_formatted = [format_for_lm(row) for _, row in train_df.iterrows()]
    val_formatted = [format_for_lm(row) for _, row in val_df.iterrows()]
    test_formatted = [format_for_lm(row) for _, row in test_df.iterrows()]
    
    # Save the formatted data as JSON files
    train_json_path = os.path.join(output_dir, "train.json")
    val_json_path = os.path.join(output_dir, "val.json")
    test_json_path = os.path.join(output_dir, "test.json")
    
    with open(train_json_path, 'w') as f:
        json.dump(train_formatted, f, indent=2)
    
    with open(val_json_path, 'w') as f:
        json.dump(val_formatted, f, indent=2)
    
    with open(test_json_path, 'w') as f:
        json.dump(test_formatted, f, indent=2)
    
    # Also save a single JSONL file for easy loading with datasets library
    train_jsonl_path = os.path.join(output_dir, "train.jsonl")
    with open(train_jsonl_path, 'w') as f:
        for item in train_formatted:
            f.write(json.dumps(item) + '\n')
    
    print(f"Training data saved to {train_json_path} and {train_jsonl_path} ({len(train_formatted)} samples)")
    print(f"Validation data saved to {val_json_path} ({len(val_formatted)} samples)")
    print(f"Test data saved to {test_json_path} ({len(test_formatted)} samples)")

In [11]:
input_csv="../data/labeled_data.csv"
output_dir="../data/processed_data/"
test_size=0.2
val_size=0.1
random_state=42

process_data(
    input_csv=input_csv,
    output_dir=output_dir,
    test_size=test_size,
    val_size=val_size,
    random_state=random_state,
)

       Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0               0      3            0                   0        3      2   
1               1      3            0                   3        0      1   
2               2      3            0                   3        0      1   
3               3      3            0                   2        1      1   
4               4      6            0                   6        0      1   
...           ...    ...          ...                 ...      ...    ...   
24778       25291      3            0                   2        1      1   
24779       25292      3            0                   1        2      2   
24780       25294      3            0                   3        0      1   
24781       25295      6            0                   6        0      1   
24782       25296      3            0                   0        3      2   

                                                   tweet  \
0      !!! RT @