In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [3]:
# Label columns predicted by the model; the same list names the submission columns later on.
TARGET = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Labelled training data, read straight from the zipped CSV.
df = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
)

In [4]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# `pd` must be pandas everywhere in this notebook; nothing else may be bound to that name.
import pandas as pd
import numpy as np
# NOTE(review): star import hides where names come from; presumably this is what
# provides `metrics_model` used in the metrics cell — confirm and import it explicitly.
from utils import *
from matplotlib import pyplot as plt
from src.find_optimal_threshold import find_optimal_threshold


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch



# Pretrained DistilBERT tokenizer and encoder; the encoder is only used for
# inference (see the eval() call below), never trained in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Use the GPU when one is visible, otherwise fall back to CPU so that moving the
# model does not fail on sessions started without an accelerator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
distilbert_model = distilbert_model.to(device)
# Inference mode for the module (e.g. disables dropout).
distilbert_model.eval()

def bert_embed(texts, batch_size=32):
    """Embed texts with DistilBERT, returning the first-token ([CLS]) vector of each.

    Uses the module-level ``tokenizer``, ``distilbert_model`` and ``device``.
    Texts are tokenized in batches (padded to the longest item of the batch and
    truncated to 512 tokens) and run through the model without gradients.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as one text rather than being sliced into pieces.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A 2-D numpy array with one row per input text, in input order. For empty
        input an array of shape ``(0, hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A lone string would otherwise be cut into `batch_size`-character chunks.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize so generators / pandas Series work and the tokenizer gets a list.
    texts = list(texts)
    total = len(texts)

    if total == 0:
        # np.vstack([]) raises; return an empty matrix of the right width instead.
        # NOTE(review): float32 assumes the model runs in its default precision.
        return np.empty((0, distilbert_model.config.hidden_size), dtype=np.float32)

    embeddings = []

    for start in range(0, total, batch_size):
        batch_texts = texts[start:start + batch_size]

        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )

        # Tokenizer output must live on the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = distilbert_model(**inputs)

        # CLS token: hidden state at sequence position 0 of every item in the batch.
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)

        # Progress: report once every 50 batches.
        if (start // batch_size) % 50 == 0:
            print(f"Processed {min(start + batch_size, total)}/{total} texts")

    return np.vstack(embeddings)

# BERT embeddings feed a one-vs-rest logistic regression, one binary head per TARGET column.
print("BERT + Logistic Regression модель")

# Embed the training comments.
print("\nПолучение BERT эмбеддингов для обучающего набора...")
train_comments = train_df['comment_text'].tolist()
X = bert_embed(train_comments, batch_size=32)

# Fit the classifier on the training embeddings.
print("\nОбучение классификатора...")
base_estimator = LogisticRegression(solver='liblinear', max_iter=1000)
clf = OneVsRestClassifier(base_estimator)
clf.fit(X, train_df[TARGET])

# Embed the held-out comments and get per-label probabilities for them.
print("\nПолучение BERT эмбеддингов для валидационного набора...")
val_comments = val_df['comment_text'].tolist()
X_val = bert_embed(val_comments, batch_size=32)

val_probs = clf.predict_proba(X_val)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2026-01-18 23:42:24.866757: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768779745.052145      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768779745.109924      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768779745.569433      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768779745.569478      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768779745.569481      55 computation_placer.cc:177] computation placer alr

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Using device: cuda
BERT + Logistic Regression модель

Получение BERT эмбеддингов для обучающего набора...
Processed 32/127656 texts
Processed 1632/127656 texts
Processed 3232/127656 texts
Processed 4832/127656 texts
Processed 6432/127656 texts
Processed 8032/127656 texts
Processed 9632/127656 texts
Processed 11232/127656 texts
Processed 12832/127656 texts
Processed 14432/127656 texts
Processed 16032/127656 texts
Processed 17632/127656 texts
Processed 19232/127656 texts
Processed 20832/127656 texts
Processed 22432/127656 texts
Processed 24032/127656 texts
Processed 25632/127656 texts
Processed 27232/127656 texts
Processed 28832/127656 texts
Processed 30432/127656 texts
Processed 32032/127656 texts
Processed 33632/127656 texts
Processed 35232/127656 texts
Processed 36832/127656 texts
Processed 38432/127656 texts
Processed 40032/127656 texts
Processed 41632/127656 texts
Processed 43232/127656 texts
Processed 44832/127656 texts
Processed 46432/127656 texts
Processed 48032/127656 texts
Proc

In [33]:
# Banner for the metrics section.
separator = "=" * 80
print("\n" + separator)
print("Метрики BERT модели:")
print(separator)

# NOTE(review): thresholds are chosen from the validation labels/probabilities and then
# reported on that same split, so the thresholded metrics are presumably optimistic.
val_labels = val_df[TARGET]
best_thresholds, best_scores = find_optimal_threshold(val_labels.values, val_probs)

print("\nМетрики с оптимальными порогами:")
metrics_model(val_labels, val_probs, thresholds=best_thresholds)

# ROC-AUC is computed from the raw probabilities, not from the thresholded predictions.
val_auc = roc_auc_score(val_labels, val_probs)
print(f'\nValidation ROC-AUC Score (BERT): {val_auc:.4f}')


Метрики BERT модели:

Метрики с оптимальными порогами:
--------------------------------
F1 Score (macro): 0.6042
F1 Score (micro): 0.7132
Accuracy: 0.9082
Precision: 0.3975
Recall: 0.6273
--------------------------------
              precision    recall  f1-score   support

           0       0.77      0.74      0.76      3056
           1       0.42      0.66      0.51       321
           2       0.80      0.72      0.76      1715
           3       0.46      0.39      0.42        74
           4       0.66      0.73      0.70      1614
           5       0.45      0.51      0.48       294

   micro avg       0.71      0.72      0.71      7074
   macro avg       0.59      0.63      0.60      7074
weighted avg       0.72      0.72      0.72      7074
 samples avg       0.06      0.07      0.06      7074


Validation ROC-AUC Score (BERT): 0.9777


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Competition test set, read straight from the zipped CSV.
test_df = pd.read_csv(
    '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
)

In [10]:
# Embed the test comments (default batch size), then score them with the fitted classifier.
print("Получение BERT эмбеддингов для test набора...")
test_comments = test_df["comment_text"].tolist()
X_test = bert_embed(test_comments)

print("Предсказание вероятностей...")
test_probs = clf.predict_proba(X_test)


Получение BERT эмбеддингов для test набора...
Processed 32/153164 texts
Processed 1632/153164 texts
Processed 3232/153164 texts
Processed 4832/153164 texts
Processed 6432/153164 texts
Processed 8032/153164 texts
Processed 9632/153164 texts
Processed 11232/153164 texts
Processed 12832/153164 texts
Processed 14432/153164 texts
Processed 16032/153164 texts
Processed 17632/153164 texts
Processed 19232/153164 texts
Processed 20832/153164 texts
Processed 22432/153164 texts
Processed 24032/153164 texts
Processed 25632/153164 texts
Processed 27232/153164 texts
Processed 28832/153164 texts
Processed 30432/153164 texts
Processed 32032/153164 texts
Processed 33632/153164 texts
Processed 35232/153164 texts
Processed 36832/153164 texts
Processed 38432/153164 texts
Processed 40032/153164 texts
Processed 41632/153164 texts
Processed 43232/153164 texts
Processed 44832/153164 texts
Processed 46432/153164 texts
Processed 48032/153164 texts
Processed 49632/153164 texts
Processed 51232/153164 texts
Proces

In [11]:

# One probability column per label, in TARGET order, with the comment id as the first column.
submission = pd.DataFrame(test_probs, columns=TARGET)
submission.insert(loc=0, column="id", value=test_df["id"].values)

# Write the submission file without the index, probabilities fixed to six decimals.
submission.to_csv("submission.csv", index=False, float_format="%.6f")

# Preview the first rows as the cell output.
submission.head()


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.99437,0.272208,0.94644,0.088073,0.93929,0.539226
1,0000247867823ef7,0.017814,0.000922,0.017004,0.000234,0.005657,0.000184
2,00013b17ad220c46,0.015578,0.000518,0.009791,8e-06,0.004724,0.002022
3,00017563c3f7919a,9.1e-05,4e-06,8e-06,0.000172,3e-05,2e-06
4,00017695ad8997eb,0.004044,0.000157,0.002619,3.7e-05,0.000519,0.000104


In [12]:
import os
import joblib
import json


# Output locations: the tokenizer and encoder share one directory, and the
# classifier head is stored next to it as a joblib file.
EXPORT_DIR = "/kaggle/working/model_export"
BERT_DIR = os.path.join(EXPORT_DIR, "distilbert_embedder")
HEAD_PATH = os.path.join(EXPORT_DIR, "logreg_head.joblib")

os.makedirs(EXPORT_DIR, exist_ok=True)

# Tokenizer first, then the encoder weights, both into BERT_DIR.
for component in (tokenizer, distilbert_model):
    component.save_pretrained(BERT_DIR)

# Left as the final expression so the cell shows the written path list.
joblib.dump(clf, HEAD_PATH)


['/kaggle/working/model_export/logreg_head.joblib']