In [None]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

from catboost import CatBoostClassifier

In [3]:
seed = 42

root_path = "/home/stefan/ioai-prep/kits/roai-2025/toxic"

In [4]:
nltk.download("stopwords", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("wordnet", quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Data preparation

In [None]:
def clean_text(text: str):
    text = text.lower()

    # remove short forms
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"im", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    text = re.sub(r"http\S+|www\S+|https\S+", ' WEB ', text) # URLs
    text = re.sub(r"@\w+|#\w+", ' USER ', text) # mentions and hashtags
    text = re.sub(r"<.*?>", "", text)  # HTML tags
    text = text.translate(str.maketrans("", "", string.punctuation))  # punctuation
    text = re.sub(r"\d+", "", text)  # numbers
    text = re.sub(r"\s+", " ", text).strip()  # extra whitespace

    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

    return ' '.join(tokens)

def prep_df(df: pd.DataFrame):
    df = df.drop(["id"], errors='ignore')

    X = df["comment_text"].apply(clean_text)

    if "toxic" in df.columns:
        y = df[['toxic', 'severe_toxic', 'obscene', 'insult']]
        return X, y
    return X

In [6]:
df = pd.read_csv(f"{root_path}/train_data.csv")
X, y = prep_df(df)

In [7]:
X.head()

0    christmas family come together celebrate jesus...
1    afraid wrong never admin wikipedia unfortuantl...
2    orange infobox hieroglyph picture informaton p...
3    completed item done item take serious look fl ...
4    contested deletion article speedy deleted rece...
Name: comment_text, dtype: object

# Model selection

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y["severe_toxic"], random_state=seed
)

In [None]:
def find_best_threshold(model, label):
    # 1. get probabilities
    y_proba_test = model.predict_proba(X_test)[:, 1]

    # use the precision-recall curve to compute the optimal threshold
    precision, recall, thresh = precision_recall_curve(y_test[label], y_proba_test)
    f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
    best_idx = np.argmax(f1_scores)
    best_thresh = thresh[best_idx]

    return best_thresh

In [None]:
vectorizer = TfidfVectorizer(max_features=200000, analyzer='word', ngram_range=(1, 2), sublinear_tf=True)

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
catboost_params = {
    "toxic": {
        "iterations": 600,
        "learning_rate": 2e-1,
        "depth": 6,
    },
    "severe_toxic": {
        "iterations": 200,
        "learning_rate": 2e-1,
    },
    "obscene": {
        "iterations": 300,
        "learning_rate": 5e-1,
    },
    "insult": {
        "iterations": 1200,
        "learning_rate": 2e-1,
    },
}

In [None]:
# train one model per label
models, scores, thresholds = {}, {}, {}

for label in catboost_params.keys():
    print(f"\nTraining model for: {label}")

    # 1. train the model
    model = CatBoostClassifier(
        **catboost_params[label], 
        eval_metric='F1', 
        # used to avoid overfitting
        early_stopping_rounds=50,
        random_seed=seed
    )

    model.fit(X_train, y_train[label], eval_set=(X_test, y_test[label]))

    # 2. tune it - find optimal threshold for maximum F1 score
    best_thresh = find_best_threshold(model, label)

    y_proba_test = model.predict_proba(X_test)[:, 1]
    y_pred_test = (y_proba_test >= best_thresh).astype(int)
    f1_test = f1_score(y_test[label], y_pred_test, average="binary")

    print(f"F1 scores for {label}: f1_test={f1_test:.4f}; best_thresh={best_thresh:.3f}")

    # 3. save the model for the label
    models[label] = model
    scores[label] = f1_test
    thresholds[label] = best_thresh


Training model for: toxic
0:	learn: 0.4986545	test: 0.4839623	best: 0.4839623 (0)	total: 551ms	remaining: 5m 30s
1:	learn: 0.4987718	test: 0.4846770	best: 0.4846770 (1)	total: 970ms	remaining: 4m 49s
2:	learn: 0.5303238	test: 0.5169569	best: 0.5169569 (2)	total: 1.37s	remaining: 4m 33s
3:	learn: 0.5107023	test: 0.4972119	best: 0.5169569 (2)	total: 1.78s	remaining: 4m 26s
4:	learn: 0.5603650	test: 0.5429338	best: 0.5429338 (4)	total: 2.21s	remaining: 4m 22s
5:	learn: 0.5528765	test: 0.5425101	best: 0.5429338 (4)	total: 2.6s	remaining: 4m 17s
6:	learn: 0.5775236	test: 0.5606732	best: 0.5606732 (6)	total: 3.01s	remaining: 4m 15s
7:	learn: 0.5927695	test: 0.5746824	best: 0.5746824 (7)	total: 3.43s	remaining: 4m 13s
8:	learn: 0.5938789	test: 0.5750547	best: 0.5750547 (8)	total: 3.84s	remaining: 4m 12s
9:	learn: 0.5938315	test: 0.5797733	best: 0.5797733 (9)	total: 4.24s	remaining: 4m 10s
10:	learn: 0.5990474	test: 0.5844551	best: 0.5844551 (10)	total: 4.63s	remaining: 4m 8s
11:	learn: 0.603

In [None]:
# F1 scores needed for max score: 0.9 | 0.35 | 0.75 | 0.71

print(scores)
sum(scores.values())/4

{'toxic': 0.8021836865767502, 'severe_toxic': 0.48157248157248156, 'obscene': 0.813503043718871, 'insult': 0.7324613555291319}


0.7074301418493087

# Submission

In [25]:
df_test = pd.read_csv(f"{root_path}/test_data.csv")

features = prep_df(df_test)
features = vectorizer.transform(features)

In [None]:
subtask1_preds = np.zeros((features.shape[0], len(models)), dtype=int)

for i, label in enumerate(["toxic", "severe_toxic", "obscene", "insult"]):
    proba = models[label].predict_proba(features)[:, 1]
    subtask1_preds[:, i] = (proba >= thresholds[label]).astype(int)

subtask1 = subtask1_preds

In [27]:
submission = pd.DataFrame({
    "datapointID": df_test["id"],
    "answer": subtask1.tolist(),
    "subtaskID": 1
})

submission.head()

Unnamed: 0,datapointID,answer,subtaskID
0,00091c35fa9d0465,"[1, 0, 0, 0]",1
1,0071940212267fea,"[1, 0, 1, 0]",1
2,0072b9c3697ab8cc,"[1, 0, 0, 1]",1
3,0081b14d79f54b31,"[1, 0, 1, 1]",1
4,00950f0fae33869f,"[1, 0, 1, 1]",1


In [28]:
submission.to_csv("submission.csv", index=False)