In [1]:
import os, json, re, math, joblib, numpy as np, pandas as pd
from datetime import datetime
from collections import Counter
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from scipy.sparse import hstack, csr_matrix

In [2]:
# Global configuration: fixed seed, run timestamp, and project-relative folders.
# The notebook is expected to run from a sub-folder of the project root.
RANDOM_STATE = 42
TS = datetime.now().strftime("%Y%m%d_%H%M%S")

PROJECT_DIR = Path.cwd().parent
DATA_DIR, ARTIFACTS_DIR = PROJECT_DIR / "data", PROJECT_DIR / "artifacts"

# Create the artifacts folder up front so every later save can assume it exists.
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

print("Artifacts:", ARTIFACTS_DIR, "| Timestamp:", TS)

Artifacts: C:\Data Science\Cyberbullying detection\cyberbullying-detection-psychology\artifacts | Timestamp: 20250905_214142


In [3]:
# Load the pre-cleaned cyberbullying tweets from DATA_DIR and preview the first 10 rows.
cyberbullying_df = pd.read_parquet(DATA_DIR / "cleaned_cyberbullying.parquet")
cyberbullying_df.head(10)

Unnamed: 0,tweet_text,cyberbullying_type,is_bullying,word_len,only_mentions,tweet_text_clean,lemmatized_text
0,"In other words #katandandre, your food was cra...",not_cyberbullying,0,9,False,in other words katandandre your food was crapi...,word katandandre food crapilicious mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,0,14,False,why is aussietv so white mkr theblock imaceleb...,aussietv white mkr theblock imacelebrityau tod...
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,0,9,False,user a classy whore or more red velvet cupcakes,user classy whore red velvet cupcake
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,0,18,False,user meh p thanks for the heads up but not too...,user meh p thanks head not concern another ang...
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,0,18,False,user this is an isis account pretending to be ...,user isi account pretend kurdish account like ...
5,"@Raja5aab @Quickieleaks Yes, the test of god i...",not_cyberbullying,0,23,False,user user yes the test of god is that good or ...,user user yes test god good bad indifferent we...
6,Itu sekolah ya bukan tempat bully! Ga jauh kay...,not_cyberbullying,0,10,False,itu sekolah ya bukan tempat bully ga jauh kaya...,itu sekolah ya bukan tempat bully ga jauh kaya...
7,Karma. I hope it bites Kat on the butt. She is...,not_cyberbullying,0,14,False,karma i hope it bites kat on the butt she is j...,karma hope bite kat butt nasty mkr
8,@stockputout everything but mostly my priest,not_cyberbullying,0,6,False,user everything but mostly my priest,user everything mostly priest
9,Rebecca Black Drops Out of School Due to Bully...,not_cyberbullying,0,9,False,rebecca black drops out of school due to bullying,rebecca black drops school due bullying


In [4]:
# Class balance: counts per multiclass label, then counts of the binary is_bullying flag.
print("Cyberbullying — class counts:")
print(cyberbullying_df["cyberbullying_type"].value_counts())
print("\nCyberbullying — is_bullying (0=not, 1=bullying):")
# dropna=False so missing labels, if any, show up as their own row
print(cyberbullying_df["is_bullying"].value_counts(dropna=False))

Cyberbullying — class counts:
cyberbullying_type
religion               7998
age                    7992
gender                 7960
ethnicity              7960
not_cyberbullying      7927
other_cyberbullying    7753
Name: count, dtype: int64

Cyberbullying — is_bullying (0=not, 1=bullying):
is_bullying
1    39663
0     7927
Name: count, dtype: int64


In [5]:
# Load the pre-cleaned Jigsaw (Wikipedia) comments from DATA_DIR and preview the first 10 rows.
toxic_df = pd.read_parquet(DATA_DIR / "cleaned_jigsaw.parquet")
toxic_df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,label_count,any_toxic,clean_text,simple_tokens
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0,0,explanation why the edits made under my userna...,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...,d'aww matches background colour seemingly stuc...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it...",hey man really not trying edit war guy constan...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0,0,""" more i can't make any real suggestions on im...",can't make real suggestions improvement wonder...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...",sir hero chance remember page that's
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,0,0,""" congratulations from me as well, use the too...",congratulations well use tools well talk
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,4,1,cocksucker before you piss around on my work,cocksucker piss around work
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,0,0,your vandalism to the matt shirvington article...,vandalism matt shirvington article reverted pl...
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,0,0,sorry if the word 'nonsense' was offensive to ...,sorry word 'nonsense' offensive anyway not int...
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,0,0,alignment on this subject and which are contra...,alignment subject contrary dulithgow


In [6]:
# Distribution of the number of labels per comment, ordered by label count.
print("\nToxic — labels per comment (label_count):")
print(toxic_df["label_count"].value_counts().sort_index())

# Counts of the any_toxic flag; dropna=False so missing values, if any, show up as their own row.
print("\nToxic — any_toxic (0=clean, 1=toxic-any):")
print(toxic_df["any_toxic"].value_counts(dropna=False))


Toxic — labels per comment (label_count):
label_count
0    143346
1      6360
2      3480
3      4209
4      1760
5       385
6        31
Name: count, dtype: int64

Toxic — any_toxic (0=clean, 1=toxic-any):
any_toxic
0    143346
1     16225
Name: count, dtype: int64


#### Make train / dev / test splits (70/10/20)

I created stable 70/10/20 splits for both datasets.
For the cyberbullying data, I stratified by cyberbullying_type so train/dev/test keep the same class proportions.
For the Wikipedia (Jigsaw) data, I stratified by a coarse label_count bin (0, 1, ≥2) so the balance of clean vs. labeled comments is consistent across splits.
I saved the row indices for each split to JSON to make the workflow reproducible and to prevent data leakage—train is for learning, dev for tuning, and test stays untouched for final evaluation.
I set a fixed random seed so the splits are stable across runs.

In [7]:
# 70/10/20 split of the tweets, stratified on the multiclass label 'cyberbullying_type'.
# The split is done on positional row indices so it can be stored as plain JSON.
cb_labels  = cyberbullying_df["cyberbullying_type"]
cb_all_idx = np.arange(len(cyberbullying_df))

# Stage 1: hold out 20% as test; the remaining 80% is train+dev.
cb_trdev_idx, cb_test_idx = train_test_split(
    cb_all_idx, test_size=0.20, stratify=cb_labels, random_state=RANDOM_STATE
)

# Stage 2: carve dev out of train+dev. 12.5% of 80% equals 10% of the full data.
cb_trdev_labels = cb_labels.iloc[cb_trdev_idx]
cb_train_idx, cb_dev_idx = train_test_split(
    cb_trdev_idx, test_size=0.125, stratify=cb_trdev_labels, random_state=RANDOM_STATE
)

# Persist the indices as plain lists (numpy arrays are not JSON-serialisable).
cb_splits = {
    split_name: split_idx.tolist()
    for split_name, split_idx in (
        ("train", cb_train_idx),
        ("dev",   cb_dev_idx),
        ("test",  cb_test_idx),
    )
}
with open(ARTIFACTS_DIR / "cb_splits.json", "w") as fh:
    json.dump(cb_splits, fh, indent=2)
print("Saved cyberbullying splits to artifacts/cb_splits.json")

Saved cyberbullying splits to artifacts/cb_splits.json


In [8]:
# Sanity check: split sizes and per-split class proportions.
cb_split_idx = {"train": cb_train_idx, "dev": cb_dev_idx, "test": cb_test_idx}

print("\n[Cyberbullying] sizes:",
      *(part
        for split, rows in cb_split_idx.items()
        for part in (f"{split} =", len(rows))))

print("[Cyberbullying] label proportions by split (should look similar):")
for split, rows in cb_split_idx.items():
    shares = cyberbullying_df["cyberbullying_type"].iloc[rows].value_counts(normalize=True)
    print(f"  {split} →", shares.round(3).to_dict())



[Cyberbullying] sizes: train = 33313 dev = 4759 test = 9518
[Cyberbullying] label proportions by split (should look similar):
  train → {'religion': 0.168, 'age': 0.168, 'gender': 0.167, 'ethnicity': 0.167, 'not_cyberbullying': 0.167, 'other_cyberbullying': 0.163}
  dev → {'religion': 0.168, 'age': 0.168, 'ethnicity': 0.167, 'gender': 0.167, 'not_cyberbullying': 0.167, 'other_cyberbullying': 0.163}
  test → {'religion': 0.168, 'age': 0.168, 'ethnicity': 0.167, 'gender': 0.167, 'not_cyberbullying': 0.167, 'other_cyberbullying': 0.163}


In [9]:
# 70/10/20 split of the Jigsaw comments, stratified on a coarse label_count bin.
# Clipping to [0, 2] keeps 0 and 1 as they are and merges 2..6 into a single "2+" bin.
tx_bins    = toxic_df["label_count"].clip(0, 2)
tx_all_idx = np.arange(len(toxic_df))

# Stage 1: hold out 20% as test; the remaining 80% is train+dev.
tx_trdev_idx, tx_test_idx = train_test_split(
    tx_all_idx, test_size=0.20, stratify=tx_bins, random_state=RANDOM_STATE
)

# Stage 2: carve dev out of train+dev. 12.5% of 80% equals 10% of the full data.
tx_trdev_bins = tx_bins.iloc[tx_trdev_idx]
tx_train_idx, tx_dev_idx = train_test_split(
    tx_trdev_idx, test_size=0.125, stratify=tx_trdev_bins, random_state=RANDOM_STATE
)

# Persist the indices as plain lists (numpy arrays are not JSON-serialisable).
tx_splits = {
    split_name: split_idx.tolist()
    for split_name, split_idx in (
        ("train", tx_train_idx),
        ("dev",   tx_dev_idx),
        ("test",  tx_test_idx),
    )
}
with open(ARTIFACTS_DIR / "tx_splits.json", "w") as fh:
    json.dump(tx_splits, fh, indent=2)
print("Saved toxic splits to artifacts/tx_splits.json")

Saved toxic splits to artifacts/tx_splits.json


In [10]:
# Sanity check: split sizes and per-split proportions of the {0, 1, 2+} label_count bins.
tx_split_idx = {"train": tx_train_idx, "dev": tx_dev_idx, "test": tx_test_idx}

print("\n[Toxic] sizes:",
      *(part
        for split, rows in tx_split_idx.items()
        for part in (f"{split} =", len(rows))))

print("[Toxic] label_count bins {0,1,2+} by split (should look similar):")
for split, rows in tx_split_idx.items():
    binned = toxic_df["label_count"].iloc[rows].clip(0, 2)
    shares = binned.value_counts(normalize=True).sort_index()
    # Printed as a plain dict: {0: 0.xxx, 1: 0.xxx, 2: 0.xxx}
    print(f"  {split} →", {int(bin_id): round(share, 3) for bin_id, share in shares.items()})


[Toxic] sizes: train = 111699 dev = 15957 test = 31915
[Toxic] label_count bins {0,1,2+} by split (should look similar):
  train → {0: 0.898, 1: 0.04, 2: 0.062}
  dev → {0: 0.898, 1: 0.04, 2: 0.062}
  test → {0: 0.898, 1: 0.04, 2: 0.062}


#### Word TF–IDF

I use TF–IDF because it gives me a solid, interpretable baseline that connects directly to my research questions. I need a feature space that (a) works the same way on both datasets, (b) highlights informative language (insults, slurs, threats, requests), and (c) lets me see what the model is using. TF–IDF does that: term frequency captures how much a word or phrase appears in a comment, and inverse document frequency down-weights routine words. That means coordination terms on Wikipedia (“article”, “page”, “thanks”) get naturally de-emphasized, while hostile or targeted language pops out.

I loaded the same train/dev/test splits I saved earlier. I then took the text I want to model: lemmatized_text for the cyberbullying tweets and simple_tokens for the Jigsaw comments. I turned those texts into numbers with a word TF–IDF vectorizer. The vectorizer counts how often each word (and each two-word phrase) appears in a document, then down-weights words that are very common across many documents. That gives me features that highlight informative words and short phrases.

I made one vectorizer per dataset (Twitter and Wikipedia), because the vocabularies differ. For each dataset, I fit the vectorizer on the training split only (to avoid peeking at dev/test). I used unigrams and bigrams (ngram_range=(1,2)), ignored ultra-rare terms (min_df=3), and capped the vocabulary size (max_features=50_000) so the matrices don’t blow up. After fitting on train, I transformed the dev and test splits with the same fitted vectorizer, so all three splits live in the exact same feature space.

I saved both fitted vectorizers to the artifacts/ folder so later notebooks can reload them and get identical features. Finally, I printed the matrix shapes to sanity-check things: the number of rows matches the number of examples in each split, and the number of columns equals the learned vocabulary size for that dataset.

In [11]:
# Reload the persisted split indices so this section uses the saved splits.
with open(ARTIFACTS_DIR / "cb_splits.json") as fh:
    cb_splits = json.load(fh)
with open(ARTIFACTS_DIR / "tx_splits.json") as fh:
    tx_splits = json.load(fh)

SPLIT_NAMES = ("train", "dev", "test")

# Slice each dataframe into train/dev/test by positional index.
cb_train, cb_dev, cb_test = (cyberbullying_df.iloc[cb_splits[k]] for k in SPLIT_NAMES)
tx_train, tx_dev, tx_test = (toxic_df.iloc[tx_splits[k]] for k in SPLIT_NAMES)

# Text columns fed to the word vectorizers.
cb_train_text, cb_dev_text, cb_test_text = (
    part["lemmatized_text"] for part in (cb_train, cb_dev, cb_test)
)
tx_train_text, tx_dev_text, tx_test_text = (
    part["simple_tokens"] for part in (tx_train, tx_dev, tx_test)
)

# One vectorizer per corpus, both with the same settings.
WORD_TFIDF_KWARGS = dict(ngram_range=(1, 2), min_df=3, max_features=50_000)
cb_word_vec = TfidfVectorizer(**WORD_TFIDF_KWARGS)
tx_word_vec = TfidfVectorizer(**WORD_TFIDF_KWARGS)

# Fit on TRAIN only, so the vocabulary and IDF weights never see dev/test.
Xw_cb_train = cb_word_vec.fit_transform(cb_train_text)
Xw_tx_train = tx_word_vec.fit_transform(tx_train_text)

# Transform dev and test with the fitted vectorizers.
Xw_cb_dev, Xw_cb_test = (cb_word_vec.transform(txt) for txt in (cb_dev_text, cb_test_text))
Xw_tx_dev, Xw_tx_test = (tx_word_vec.transform(txt) for txt in (tx_dev_text, tx_test_text))

# Persist the fitted vectorizers so later notebooks can load them.
for vec, fname in (
    (cb_word_vec, "tfidf_word_cyberbullying.joblib"),
    (tx_word_vec, "tfidf_word_toxic.joblib"),
):
    joblib.dump(vec, ARTIFACTS_DIR / fname)
print("Saved vectorizers: tfidf_word_cyberbullying.joblib, tfidf_word_toxic.joblib")

# Shape check: rows = examples in the split, cols = learned vocabulary size.
print("[Cyberbullying] word TF–IDF shapes:")
print("  train:", Xw_cb_train.shape, "dev:", Xw_cb_dev.shape, "test:", Xw_cb_test.shape)

print("[Toxic] word TF–IDF shapes:")
print("  train:", Xw_tx_train.shape, "dev:", Xw_tx_dev.shape, "test:", Xw_tx_test.shape)


Saved vectorizers: tfidf_word_cyberbullying.joblib, tfidf_word_toxic.joblib
[Cyberbullying] word TF–IDF shapes:
  train: (33313, 27410) dev: (4759, 27410) test: (9518, 27410)
[Toxic] word TF–IDF shapes:
  train: (111699, 50000) dev: (15957, 50000) test: (31915, 50000)


#### Why word + char TF–IDF

I build word TF–IDF to capture clear semantics (single words and short phrases) and character TF–IDF to catch obfuscation and style (misspellings, elongations, punctuation runs). Together, they cover both what is said and how it is written. This matters for my hypotheses: H1 (style differences), H2 (domain norms), and H4 (severity × target). For example, word TF–IDF surfaces slurs and violence verbs (severity), while char TF–IDF still fires when users mask them (“f*ck”, “idi0t”).

TF–IDF also supports fair, reproducible evaluation. I fit the vectorizers on train only, so I don’t leak information into dev/test. The resulting sparse matrices let me train simple linear models and inspect coefficients to check for identity-term bias (H3): I can see if the model leans too hard on a group word and then fix that in features or thresholds. Finally, TF–IDF gives me a shared backbone for both tasks—multiclass Twitter and multilabel Wikipedia—so I can compare precision/recall behavior (H1), examine cross-domain drift (H2), and run clean-vs-toxic analyses.

#### Character TF–IDF
I added character n-gram TF–IDF because people often hide or vary abusive words with spelling tricks and emphasis. Examples: f*ck, fuuuck, idi0t, bi—tch, or long runs of !!!. Word features miss these if the exact token is new, but character 3–5-grams still fire on the shared subpieces (fuc, uck, idi, i0t). Char n-grams also catch style signals that live in the stream itself (elongations, punctuation, casing) rather than in dictionary words. This makes the representation more robust and complements word TF–IDF (which gives clean semantics).

For preprocessing I chose slightly different inputs:

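- Cyberbullying: tweet_text_clean, which removes junk that would dominate the signal (URLs) and normalizes mentions to a single `user` token. Note that, judging by the preview of this column in the data-loading cell above, the text is also lowercased and stripped of punctuation. It therefore preserves the part of the character stream made of letters (repeats, elongations, misspellings) but not punctuation runs or casing; those are only available from the raw tweet_text.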

- Jigsaw: raw comment_text. On Wikipedia I didn’t lemmatize; the raw comments already contain the punctuation and elongations that char n-grams need, and URLs are less dominant there.

Settings:

- ngram_range=(3,5) to balance detail and noise. Unigrams/bigrams at the character level are too generic (they light up everywhere: th, he, in). Very long n-grams explode the feature space and act like brittle exact matches.

- min_df=5 to ignore character n-grams that appear in fewer than five training documents; that removes one-off typos and reduces overfitting.

- max_features=30_000 to keep memory and training time reasonable; word TF–IDF already carries the heavy semantic load, so the char block only needs to cover the most common subword patterns.

I fit the char vectorizers on the train split only to avoid leaking information from dev/test into the vocabulary, then I transformed dev and test with the same fitted vectorizer so all splits live in the exact same feature space.

In [None]:
# Columns fed to the character-level vectorizers (one per corpus).
CB_CHAR_COL = "tweet_text_clean"
TX_CHAR_COL = "comment_text"

cb_train_char = cb_train[CB_CHAR_COL]
cb_dev_char   = cb_dev[CB_CHAR_COL]
cb_test_char  = cb_test[CB_CHAR_COL]

tx_train_char = tx_train[TX_CHAR_COL]
tx_dev_char   = tx_dev[TX_CHAR_COL]
tx_test_char  = tx_test[TX_CHAR_COL]


# Build the character TF–IDF vectorizers (3- to 5-character n-grams)
cb_char_vec = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=5, max_features=30_000)
tx_char_vec = TfidfVectorizer(analyzer="char", ngram_range=(3,5), min_df=5, max_features=30_000)

# Fit on train only, so the vocabulary and IDF weights never see dev/test
Xc_cb_train = cb_char_vec.fit_transform(cb_train_char)
Xc_tx_train = tx_char_vec.fit_transform(tx_train_char)

# Transform dev and test with the same fitted vectorizers
Xc_cb_dev  = cb_char_vec.transform(cb_dev_char)
Xc_cb_test = cb_char_vec.transform(cb_test_char)

Xc_tx_dev  = tx_char_vec.transform(tx_dev_char)
Xc_tx_test = tx_char_vec.transform(tx_test_char)

# Save the fitted vectorizers
joblib.dump(cb_char_vec, os.path.join(ARTIFACTS_DIR, "tfidf_char_cyberbullying.joblib"))
joblib.dump(tx_char_vec, os.path.join(ARTIFACTS_DIR, "tfidf_char_toxic.joblib"))
print("Saved vectorizers: tfidf_char_cyberbullying.joblib, tfidf_char_toxic.joblib")

# Sanity-check shapes for BOTH corpora (rows = examples, cols = char n-gram vocab size)
print("[Cyberbullying] char TF–IDF shapes:")
print("  train:", Xc_cb_train.shape, "dev:", Xc_cb_dev.shape, "test:", Xc_cb_test.shape)

print("[Toxic] char TF–IDF shapes:")
print("  train:", Xc_tx_train.shape, "dev:", Xc_tx_dev.shape, "test:", Xc_tx_test.shape)

In [None]:
# Hand-picked lexicons for the cue features. Each set is matched against
# whitespace-split tokens by exact membership (see _rate_per_k / _has_any),
# so entries must be in the same case/form as the token column.
PROFANITY  = {"fuck","fucking","shit","bitch","cunt","ass","dick","suck","sucks","moron","stupid"}
VIOLENCE   = {"kill","die","murder","hurt","destroy","stab","shoot"}
PRON_2ND   = {"you","your","u","ur","you're","youre"}
INTENS     = {"very","really","so","extremely","totally","absolutely","super"}
POLITE     = {"please","thank","thanks","appreciate"}
# Negations, listed both with and without apostrophes.
NEGATE     = {
    "not","no","never",
    "don't","dont","can't","cant","won't","wont","isn't","isnt","didn't","didnt",
    "doesn't","doesnt","shouldn't","shouldnt","couldn't","couldnt","wouldn't","wouldnt",
    "ain't","aint","n't"
}
IDENTITY   = {"jew","jewish","muslim","christian","black","white","gay","lesbian","mexican","mexicans"}
GENERALIZ  = {"these","those","all","every","always","everyone","nobody"}
HEDGES_UNI = {"maybe","perhaps","seems","apparently","probably","possibly","likely"}
# Multi-word hedges; checked against the space-joined token string in _has_any_bigram.
HEDGE_PHRASES = {"i think","i guess","sort of","kind of"}

# Patterns applied to the RAW text in _raw_counts.
# URL: "http://", "https://" or "www." followed by non-whitespace (case-insensitive).
URL_RE     = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
# Mention: "@" followed by one or more word characters.
MENTION_RE = re.compile(r"@\w+")
# Hashtag: "#" followed by one or more word characters (the tag text is captured).
HASHTAG_RE = re.compile(r"#(\w+)")

def _tokens(series: pd.Series):
    return series.fillna("").astype(str).str.split()

def _rate_per_k(tokens, lexicon, per=1000, min_tokens=20):
    """
    Per-1k rate with length smoothing: treat very short messages as if they had
    at least `min_tokens` tokens to avoid huge spikes like 333 when n=3.
    """
    n = max(len(tokens), min_tokens)
    return 0.0 if n == 0 else per * sum(1 for t in tokens if t in lexicon) / n

def _has_any(tokens, lexicon):
    return int(any(t in lexicon for t in tokens))

def _has_any_bigram(tokens, phrases):
    s = " ".join(tokens)
    return int(any(p in s for p in phrases))

def _raw_counts(text):
    """
    Surface signals read from the raw (uncleaned) text.

    Returns a 5-tuple: (had_url, had_mention, hashtag_count, bangs, qmarks).
    The first two are 0/1 flags and the rest are counts. Any non-string input
    (e.g. None or NaN) is treated as an empty string.
    """
    s = text if isinstance(text, str) else ""
    return (
        int(URL_RE.search(s) is not None),
        int(MENTION_RE.search(s) is not None),
        len(HASHTAG_RE.findall(s)),
        s.count("!"),
        s.count("?"),
    )

def build_psych_features_smooth(
    df: pd.DataFrame, tokens_col: str, raw_col: str, per: int = 1000, min_tokens: int = 20
) -> pd.DataFrame:
    """
    Build the hand-crafted cue features for every row of `df`.

    Parameters
    ----------
    df : DataFrame holding the documents.
    tokens_col : column with space-separated tokens; split via `_tokens`.
    raw_col : column with the raw text used for URL / mention / hashtag /
        "!" / "?" signals. If the column is absent, raw text is treated as "".
    per : rates are expressed per `per` tokens.
    min_tokens : smoothing floor for the rate denominator; documents shorter
        than this are treated as if they had `min_tokens` tokens.

    Returns
    -------
    DataFrame indexed like `df` with 19 columns: nine lexicon rates, three
    raw-signal rates, three 0/1 flags, and the controls `n_tokens`,
    `avg_tok_len`, `had_url`, `had_mention`. An empty `df` yields an empty
    frame that still has all 19 columns.
    """
    cols = [
        "rate_profane","rate_violence","rate_2nd","rate_intens","rate_negate","rate_polite",
        "rate_identity","rate_general","rate_hedge",
        "rate_bangs","rate_qmarks","rate_hashtags",
        "has_profane","has_identity","has_hedge_bigram",
        "n_tokens","avg_tok_len","had_url","had_mention"
    ]
    # (feature name, lexicon) pairs for the smoothed lexicon rates.
    rate_lexicons = (
        ("rate_profane",  PROFANITY),
        ("rate_violence", VIOLENCE),
        ("rate_2nd",      PRON_2ND),
        ("rate_intens",   INTENS),
        ("rate_negate",   NEGATE),
        ("rate_polite",   POLITE),
        ("rate_identity", IDENTITY),
        ("rate_general",  GENERALIZ),
        ("rate_hedge",    HEDGES_UNI),
    )

    toks_series = _tokens(df[tokens_col])

    # Pull the raw-text column out once. Indexing df.iloc[i][raw_col] inside the
    # loop would build a full row Series per document, which is very slow on
    # corpora with 100k+ rows.
    if raw_col in df.columns:
        raw_values = df[raw_col].tolist()
    else:
        raw_values = [""] * len(df)

    rows = []
    for ts, raw in zip(toks_series, raw_values):
        # per-`per` (smoothed) lexicon rates
        row = {
            name: _rate_per_k(ts, lexicon, per=per, min_tokens=min_tokens)
            for name, lexicon in rate_lexicons
        }

        n_tokens = len(ts)
        n_smooth = max(n_tokens, min_tokens)

        had_url, had_mention, hashtag_count, bangs, qmarks = _raw_counts(raw)

        # Raw-signal rates share the smoothed denominator. When it is 0 (only
        # possible with min_tokens=0 and no tokens) report 0.0, as _rate_per_k does.
        row["rate_bangs"]    = per * bangs / n_smooth if n_smooth else 0.0
        row["rate_qmarks"]   = per * qmarks / n_smooth if n_smooth else 0.0
        row["rate_hashtags"] = per * hashtag_count / n_smooth if n_smooth else 0.0

        # flags + controls
        row["has_profane"]      = _has_any(ts, PROFANITY)
        row["has_identity"]     = _has_any(ts, IDENTITY)
        row["has_hedge_bigram"] = _has_any_bigram(ts, HEDGE_PHRASES)
        row["n_tokens"]         = n_tokens
        row["avg_tok_len"]      = float(np.mean([len(t) for t in ts])) if n_tokens > 0 else 0.0
        row["had_url"]          = had_url
        row["had_mention"]      = had_mention

        rows.append(row)

    # Passing `columns=` fixes the column order and keeps the schema for an empty input.
    return pd.DataFrame(rows, index=df.index, columns=cols)

In [None]:
# Cue features per split. Tokens come from the cleaned token column and
# raw signals from the original text column of each corpus.
CB_CUE_COLS = dict(tokens_col="lemmatized_text", raw_col="tweet_text")
TX_CUE_COLS = dict(tokens_col="simple_tokens",   raw_col="comment_text")

cb_psy_tr, cb_psy_dv, cb_psy_te = (
    build_psych_features_smooth(part, **CB_CUE_COLS) for part in (cb_train, cb_dev, cb_test)
)

tx_psy_tr, tx_psy_dv, tx_psy_te = (
    build_psych_features_smooth(part, **TX_CUE_COLS) for part in (tx_train, tx_dev, tx_test)
)

In [None]:
# Mean of every "rate_*" cue column on the toxic train split, rounded to 2 decimals.
print(tx_psy_tr.filter(like="rate_").mean().round(2))  

In [None]:
def scale_cues_train_dev_test(tr_df, dv_df, te_df, name_prefix):
    """
    Scale the cue features with a MaxAbsScaler fitted on the train split.

    The fitted scaler is applied to dev and test, each result is converted to
    CSR, and both the scaler (`<name_prefix>_scaler.joblib`) and the three
    matrices (`<name_prefix>_scaled.joblib`, keyed train/dev/test) are written
    to ARTIFACTS_DIR. Returns the (train, dev, test) CSR matrices.
    """
    scaler = MaxAbsScaler()
    scaled_train = scaler.fit_transform(tr_df.values)

    sparse_splits = {
        "train": csr_matrix(scaled_train),
        "dev":   csr_matrix(scaler.transform(dv_df.values)),
        "test":  csr_matrix(scaler.transform(te_df.values)),
    }

    # Persist the scaler together with the split-aligned matrices.
    joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, f"{name_prefix}_scaler.joblib"))
    joblib.dump(sparse_splits, os.path.join(ARTIFACTS_DIR, f"{name_prefix}_scaled.joblib"))

    # After MaxAbs scaling every train column should have max |value| <= 1
    # (small tolerance for float error). Dev/test are not checked here.
    train_within_unit = np.all(np.abs(scaled_train).max(axis=0) <= 1.000001)
    print(f"[{name_prefix}] cols: {tr_df.shape[1]}, max abs per-col (train) ≤ 1: {train_within_unit}")
    return sparse_splits["train"], sparse_splits["dev"], sparse_splits["test"]

# Run for both corpora. Each call fits its own scaler on that corpus's train cues
# and writes "<prefix>_scaler.joblib" and "<prefix>_scaled.joblib" to ARTIFACTS_DIR.
Xp_cb_tr, Xp_cb_dv, Xp_cb_te = scale_cues_train_dev_test(cb_psy_tr, cb_psy_dv, cb_psy_te, name_prefix="cb_cues")
Xp_tx_tr, Xp_tx_dv, Xp_tx_te = scale_cues_train_dev_test(tx_psy_tr, tx_psy_dv, tx_psy_te, name_prefix="tx_cues")

In [None]:
# Final cyberbullying design matrices: [word TF–IDF | char TF–IDF | scaled cues], per split.
cb_design = {
    "train": hstack([Xw_cb_train, Xc_cb_train, Xp_cb_tr], format="csr"),
    "dev":   hstack([Xw_cb_dev,   Xc_cb_dev,   Xp_cb_dv], format="csr"),
    "test":  hstack([Xw_cb_test,  Xc_cb_test,  Xp_cb_te], format="csr"),
}
X_cb_train, X_cb_dev, X_cb_test = (cb_design[k] for k in ("train", "dev", "test"))

cb_design_path = os.path.join(ARTIFACTS_DIR, "cb_word_char_cues.joblib")
joblib.dump(cb_design, cb_design_path, compress=3)
print("[SAVED] artifacts/cb_word_char_cues.joblib",
      "→", X_cb_train.shape, X_cb_dev.shape, X_cb_test.shape)

In [None]:
# Final toxic design matrices: [word TF–IDF | char TF–IDF | scaled cues], per split.
tx_design = {
    "train": hstack([Xw_tx_train, Xc_tx_train, Xp_tx_tr], format="csr"),
    "dev":   hstack([Xw_tx_dev,   Xc_tx_dev,   Xp_tx_dv], format="csr"),
    "test":  hstack([Xw_tx_test,  Xc_tx_test,  Xp_tx_te], format="csr"),
}
X_tx_train, X_tx_dev, X_tx_test = (tx_design[k] for k in ("train", "dev", "test"))

tx_design_path = os.path.join(ARTIFACTS_DIR, "tx_word_char_cues.joblib")
joblib.dump(tx_design, tx_design_path, compress=3)
print("[SAVED] artifacts/tx_word_char_cues.joblib",
      "→", X_tx_train.shape, X_tx_dev.shape, X_tx_test.shape)

In [None]:
# Binary targets (is_bullying as int) for each cyberbullying split.
y_cb_bin = {
    split_name: frame["is_bullying"].astype(int).values
    for split_name, frame in (("train", cb_train), ("dev", cb_dev), ("test", cb_test))
}

In [None]:
# Multiclass codes (0..K-1) + mapping for reproducibility.
# The mapping is built from the FULL dataset so it is identical for every split
# and cannot miss a class that happens to be absent from train. Categories are
# the sorted unique labels, so with all classes present in train (guaranteed by
# the stratified split) the codes are the same as a train-only mapping.
all_cats = cyberbullying_df["cyberbullying_type"].astype("category").cat.categories
cats = all_cats  # alias kept so any code that still refers to `cats` keeps working
code_map = {cat: i for i, cat in enumerate(all_cats)}


def _codes(s):
    """Map an iterable of class names to an integer array using `code_map`."""
    return np.array([code_map[v] for v in s])


y_cb_mc = {
    "train": _codes(cb_train["cyberbullying_type"]),
    "dev":   _codes(cb_dev["cyberbullying_type"]),
    "test":  _codes(cb_test["cyberbullying_type"]),
}

# Binary targets, multiclass targets and the name->code mapping are saved together.
joblib.dump(
    {"binary": y_cb_bin, "multiclass": y_cb_mc, "mapping": code_map},
    os.path.join(ARTIFACTS_DIR, "cb_labels.joblib"),
)
print("[SAVED] artifacts/cb_labels.joblib",
      f"→ binary sizes {[len(y) for y in y_cb_bin.values()]} ; multiclass sizes {[len(y) for y in y_cb_mc.values()]}")

In [None]:
# Multi-label targets: one 0/1 column per Jigsaw label, in TX_LABELS order.
TX_LABELS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

Y_tx_train, Y_tx_dev, Y_tx_test = (
    frame[TX_LABELS].astype(int).to_numpy() for frame in (tx_train, tx_dev, tx_test)
)

In [None]:
# Persist the multi-label target matrices, keyed by split name.
Y_tx = dict(train=Y_tx_train, dev=Y_tx_dev, test=Y_tx_test)
tx_multilabel_path = os.path.join(ARTIFACTS_DIR, "tx_multilabel.joblib")
joblib.dump(Y_tx, tx_multilabel_path)
print("[SAVED] artifacts/tx_multilabel.joblib",
      f"→ shapes {[Y_tx[k].shape for k in ['train','dev','test']]}")

In [None]:
# Record the cue column order of each corpus so the cue block of the stacked
# matrices can be mapped back to feature names later.
CUE_NAMES_CB = cb_psy_tr.columns.tolist()
CUE_NAMES_TX = tx_psy_tr.columns.tolist()

joblib.dump(CUE_NAMES_CB, os.path.join(ARTIFACTS_DIR, "cb_cue_columns.joblib"))
joblib.dump(CUE_NAMES_TX, os.path.join(ARTIFACTS_DIR, "tx_cue_columns.joblib"))

In [None]:
# Persist the label order so the columns of the multi-label matrices can be named later.
TX_LABELS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

tx_label_order_path = ARTIFACTS_DIR / "tx_label_order.joblib"
joblib.dump(TX_LABELS, tx_label_order_path)
print("Saved →", tx_label_order_path)