In [21]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [22]:
TARGET = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

In [23]:
train_df, val_df  = train_test_split(df, test_size=0.2, random_state=42)

In [30]:
from turtle import pd
import pandas as pd
from sklearn.metrics import average_precision_score, f1_score, accuracy_score, precision_score, recall_score, classification_report
import numpy as np
from matplotlib import pyplot as plt


def metrics_model(y_true, y_prob, thresholds=0.5):
    y_pred = applay_thresholds(y_prob, thresholds)

    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_micro = f1_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    precision = average_precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')

    print("--------------------------------")
    print(f"F1 Score (macro): {f1_macro:.4f}")
    print(f"F1 Score (micro): {f1_micro:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("--------------------------------")
    print(classification_report(y_true, y_pred))
    
    # return f1_macro, accuracy, precision, recall


def applay_thresholds(y_prob, thresholds):
    if not isinstance(thresholds, list):
        y_pred = y_prob > thresholds
        return y_pred.astype(int)

    y_pred = np.zeros(y_prob.shape)
    for i, threshold in enumerate(thresholds):
        y_pred[:, i] = y_prob[:, i] > threshold
    return y_pred.astype(int)


def get_top_tox(vectorizer, model, top_k=20):
    feature_names = np.array(vectorizer.get_feature_names_out())
    coefs = model.estimators_[0].coef_.flatten()

    top_toxic = feature_names[np.argsort(coefs)][-top_k:][::-1]
    top_non_toxic = feature_names[np.argsort(coefs)[:top_k]]

    print("TOXIC WORDS:")
    print(top_toxic)

    print("\nNON-TOXIC WORDS:")
    print(top_non_toxic)

    plt.figure(figsize=(8, 4))
    plt.barh(top_toxic, coefs[np.argsort(coefs)][-top_k:][::-1])
    plt.xlabel("Coefficient Toxicity Value")
    plt.title("Top Toxic Words")
    plt.show()


    # return top_toxic, top_non_toxic, coefs[np.argsort(coefs)][-top_k:][::-1], coefs[np.argsort(coefs)[:top_k]]



def explain_text(text, vectorizer, model, top_k=5):
    vec = vectorizer.transform([text])
    feature_names = np.array(vectorizer.get_feature_names_out())
    coefs = model.estimators_[0].coef_.flatten()

    contributions = vec.toarray()[0] * coefs
    idx = np.argsort(contributions)[-top_k:]

    print( pd.DataFrame({
        "word": feature_names[idx],
        "contribution": contributions[idx]
    }).sort_values("contribution", ascending=False)
    )

        

def word_contributions(text, vectorizer, model):
    clf = model.estimators_[0]

    words = text.lower().split()
    w = clf.coef_[0]

    contributions = {}

    for word in words:
        if word in vectorizer.model.wv:
            vec = vectorizer.model.wv[word]
            contributions[word] = float(np.dot(w, vec))

    print (dict(
        sorted(contributions.items(), key=lambda x: x[1], reverse=True)
    ))


In [25]:
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve
import numpy as np

def find_optimal_threshold(y_true, y_proba):
    """
    Найти оптимальный threshold для каждого класса
    
    metric: 'f1', 'balanced'
    """

    best_thresholds = np.zeros(y_proba.shape[1])
    best_scores = np.zeros(y_proba.shape[1])

    for i in range(y_proba.shape[1]):
        precision, recall, thresholds = precision_recall_curve(y_true[:, i], y_proba[:, i])
    
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
        idx = np.argmax(f1_scores)
        best_threshold = thresholds[idx] if idx < len(thresholds) else 0.5
        best_score = f1_scores[idx]
        
        best_thresholds[i] = best_threshold
        best_scores[i] = best_score

    return best_thresholds.tolist(), best_scores.tolist()



In [26]:
from transformers import AutoTokenizer, AutoModel
import torch



tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Использовать CPU (GPU может быть несовместима)
# Если хотите использовать GPU: device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda")
print(f"Using device: {device}")
distilbert_model = distilbert_model.to(device)
distilbert_model.eval()

def bert_embed(texts, batch_size=32):
    embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = distilbert_model(**inputs)
        
        # CLS токен 
        batch_embeddings = (outputs.last_hidden_state[:, 0, :].cpu().numpy())

        embeddings.append(batch_embeddings)
        
        # Прогресс
        if (i // batch_size) % 50 == 0:
            print(f"Processed {min(i+batch_size, len(texts))}/{len(texts)} texts")
    
    return np.vstack(embeddings)

print("BERT + Logistic Regression модель")

print("\nПолучение BERT эмбеддингов для обучающего набора...")
X = bert_embed(train_df['comment_text'].tolist(), batch_size=32)

print("\nОбучение классификатора...")
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='liblinear'))
clf.fit(X, train_df[TARGET])

print("\nПолучение BERT эмбеддингов для валидационного набора...")
X_val = bert_embed(val_df['comment_text'].tolist(), batch_size=32)

val_probs = clf.predict_proba(X_val)


Using device: cuda

BERT + Logistic Regression модель

Получение BERT эмбеддингов для обучающего набора...
(Это займет некоторое время, так как используется CPU)
Processed 32/127656 texts
Processed 1632/127656 texts
Processed 3232/127656 texts
Processed 4832/127656 texts
Processed 6432/127656 texts
Processed 8032/127656 texts
Processed 9632/127656 texts
Processed 11232/127656 texts
Processed 12832/127656 texts
Processed 14432/127656 texts
Processed 16032/127656 texts
Processed 17632/127656 texts
Processed 19232/127656 texts
Processed 20832/127656 texts
Processed 22432/127656 texts
Processed 24032/127656 texts
Processed 25632/127656 texts
Processed 27232/127656 texts
Processed 28832/127656 texts
Processed 30432/127656 texts
Processed 32032/127656 texts
Processed 33632/127656 texts
Processed 35232/127656 texts
Processed 36832/127656 texts
Processed 38432/127656 texts
Processed 40032/127656 texts
Processed 41632/127656 texts
Processed 43232/127656 texts
Processed 44832/127656 texts
Proces

In [33]:
print("\n" + "="*80)
print("Метрики BERT модели:")
print("="*80)
best_thresholds, best_scores = find_optimal_threshold(val_df[TARGET].values, val_probs)
print("\nМетрики с оптимальными порогами:")
metrics_model(val_df[TARGET], val_probs, thresholds=best_thresholds)
print(f'\nValidation ROC-AUC Score (BERT): {roc_auc_score(val_df[TARGET], val_probs):.4f}')


Метрики BERT модели:

Метрики с оптимальными порогами:
--------------------------------
F1 Score (macro): 0.6039
F1 Score (micro): 0.7130
Accuracy: 0.9082
Precision: 0.3973
Recall: 0.6274
--------------------------------
              precision    recall  f1-score   support

           0       0.77      0.74      0.75      3056
           1       0.42      0.66      0.51       321
           2       0.80      0.72      0.76      1715
           3       0.46      0.39      0.42        74
           4       0.66      0.73      0.70      1614
           5       0.45      0.51      0.48       294

   micro avg       0.71      0.72      0.71      7074
   macro avg       0.59      0.63      0.60      7074
weighted avg       0.72      0.72      0.72      7074
 samples avg       0.06      0.07      0.06      7074


Validation ROC-AUC Score (BERT): 0.9777


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
