In [6]:
import pandas as pd
import torch
import numpy as np
import os
import re
from sklearn.model_selection import train_test_split

# --- Notebook-wide settings -------------------------------------------------
DATA_DIR = ""  # Directory prefix for the input CSVs; leave empty when they sit next to the notebook
SAVE_DIR = "preprocessed_data"  # Folder that receives every file written below
RANDOM_STATE = 42  # Seed passed to train_test_split
VAL_SIZE = 0.1  # Fraction of rows held out as the validation split

# Create the output folder up front; exist_ok keeps a re-run of this cell harmless.
os.makedirs(SAVE_DIR, exist_ok=True)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)

Using device: cuda


In [7]:
# Main train/test tables, read from DATA_DIR.
print("Loading main datasets...")
train, test = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

# Individual-annotation tables (toxicity first, then identity), same directory.
print("Loading annotation data...")
toxicity_ann, identity_ann = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in (
        "toxicity_individual_annotations.csv",
        "identity_individual_annotations.csv",
    )
)

Loading main datasets...
Loading annotation data...


In [8]:
import pandas as pd

def fast_mode_aggregation(df, group_col):
    """Collapse ``df`` to one row per ``group_col`` value using the per-group mode.

    Every column other than ``group_col`` is reduced, group by group, to the
    first entry of ``Series.mode()``. When a group has no mode (``mode()``
    drops missing values, so this happens when every value in the group is
    missing) the group's first value is used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with several rows per ``group_col`` value.
    group_col : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_col`` value, in order of first
        appearance in ``df``, with ``group_col`` as the first column followed
        by the remaining columns in their original order.

    Notes
    -----
    Prints one ``Processing column: <name>`` line per aggregated column.
    """

    def _mode_or_first(values):
        # Compute the mode once per group; the fallback covers groups whose
        # mode() result is empty.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else values.iloc[0]

    # Index the result by the distinct keys (order of first appearance) so the
    # per-column aggregates, which come back indexed by key, align on assignment.
    result_df = pd.DataFrame({group_col: df[group_col].unique()}).set_index(group_col)

    # The grouping is the same for every column, so build it once.
    grouped = df.groupby(group_col)

    for col in df.columns:
        if col == group_col:
            continue
        print(f"Processing column: {col}")
        result_df[col] = grouped[col].agg(_mode_or_first)

    return result_df.reset_index()

In [9]:
# Load the individual annotation files from DATA_DIR.
toxicity_ann = pd.read_csv(os.path.join(DATA_DIR, "toxicity_individual_annotations.csv"))
identity_ann = pd.read_csv(os.path.join(DATA_DIR, "identity_individual_annotations.csv"))

# Reduce each annotation table to one row per comment id (per-column mode).
print("Aggregating toxicity annotations by mode...")
toxicity_agg = fast_mode_aggregation(toxicity_ann, "id")
print("Aggregating identity annotations by mode...")
identity_agg = fast_mode_aggregation(identity_ann, "id")

# Re-read train.csv here because `train` is rebound to the merged frame below;
# starting from the raw file keeps this cell safe to re-run.
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

# Left-merge so every training row is kept even when it has no annotations.
# NOTE(review): both aggregated frames carry a `worker` column, so the second
# merge will suffix the clash (`worker_x` / `worker_y`); any annotation column
# whose name already exists in train.csv is suffixed the same way. Confirm
# whether `worker` should be dropped before aggregating.
train = train.merge(toxicity_agg, on="id", how="left")
train = train.merge(identity_agg, on="id", how="left")

# Save the merged result under SAVE_DIR and report the path actually written.
merged_path = os.path.join(SAVE_DIR, "train_with_annotations.csv")
train.to_csv(merged_path, index=False)
print(f"Merged and saved to '{merged_path}'")

Aggregating toxicity annotations by mode...
Processing column: worker
Processing column: toxic
Processing column: severe_toxic
Processing column: identity_attack
Processing column: insult
Processing column: obscene
Processing column: sexual_explicit
Processing column: threat
Aggregating identity annotations by mode...
Processing column: worker
Processing column: disability
Processing column: gender
Processing column: race_or_ethnicity
Processing column: religion
Processing column: sexual_orientation
Merged and saved to 'train_annotations.csv'


In [10]:
# Threshold `target` at 0.5 to get a binary label that is used only to
# stratify the split; the `target` column itself is left unchanged.
stratify_labels = (train["target"] >= 0.5).astype(int)

# `train` at this point is whatever the previous cell left bound to that name.
train_split, val_split = train_test_split(
    train,
    test_size=VAL_SIZE,
    stratify=stratify_labels,
    random_state=RANDOM_STATE,
)

# Save splits
train_split.to_csv(os.path.join(SAVE_DIR, "train_clean.csv"), index=False)
val_split.to_csv(os.path.join(SAVE_DIR, "val_clean.csv"), index=False)

# Copy the provided test.csv (no labels) next to the splits; read it from
# DATA_DIR like the other inputs.
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
test.to_csv(os.path.join(SAVE_DIR, "test_clean.csv"), index=False)

print("✅ Saved: train_clean.csv, val_clean.csv, test_clean.csv")

✅ Saved: train_clean.csv, val_clean.csv, test_clean.csv


In [14]:
from google.colab import files

# Download the training split written earlier; build the path from SAVE_DIR
# so it follows the configured output folder.
files.download(os.path.join(SAVE_DIR, 'train_clean.csv'))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:
# Download the validation split; the path is built from SAVE_DIR so it follows
# the configured output folder.
files.download(os.path.join(SAVE_DIR, 'val_clean.csv'))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>