In [9]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve
import matplotlib.pyplot as plt

In [17]:
try:
    df = pd.read_csv('./data/labeled.csv')
except FileNotFoundError:
    print("Файл 'labeled.csv' не найден. Убедитесь, что он находится в той же папке.")
    exit()

nltk.download('stopwords')
stop_words = set(stopwords.words('russian'))
snowball = SnowballStemmer(language='russian')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^а-яa-z]', ' ', text)
    text = text.split()
    text = [snowball.stem(word) for word in text if word not in stop_words and len(word) > 1]
    return ' '.join(text)

df['comment'] = df['comment'].apply(clean_text)
X_train, X_test, y_train, y_test = train_test_split(
    df['comment'],
    df['toxic'],
    test_size=0.2,
    random_state=1337,
    stratify=df['toxic']
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kerio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
pipeline = Pipeline([
    ('features', TfidfVectorizer()),
    ('classifier', LGBMClassifier(random_state=1337, class_weight='balanced'))
])

param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.05, 0.1, 0.2],
    'classifier__num_leaves': [20, 31, 40]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

print("Начинаю поиск по сетке...")
grid_search.fit(X_train, y_train)
print("Поиск завершен.")

print("Лучшие параметры:", grid_search.best_params_)
print("Лучший F1-Score на кросс-валидации:", grid_search.best_score_)

best_model = grid_search.best_estimator_

Начинаю поиск по сетке...
[LightGBM] [Info] Number of positive: 3861, number of negative: 7668
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35290
[LightGBM] [Info] Number of data points in the train set: 11529, number of used features: 1601
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Поиск завершен.
Лучшие параметры: {'classifier__learning_rate': 0.05, 'classifier__n_estimators': 200, 'classifier__num_leaves': 31}
Лучший F1-Score на кросс-валидации: 0.7162865575068126


In [19]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
best_f1_idx = np.nanargmax(f1_scores)
best_threshold = thresholds[best_f1_idx]

print(f"\nЛучший F1-Score на тестовом наборе: {f1_scores[best_f1_idx]:.4f}")
print(f"Оптимальный порог: {best_threshold:.4f}")

y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

final_precision = precision_score(y_test, y_pred_optimal)
final_recall = recall_score(y_test, y_pred_optimal)
final_f1 = f1_score(y_test, y_pred_optimal)

print(f"\nФинальная Precision: {final_precision:.4f}")
print(f"Финальный Recall: {final_recall:.4f}")
print(f"Финальный F1-Score: {final_f1:.4f}")


Лучший F1-Score на тестовом наборе: 0.7586
Оптимальный порог: 0.5522

Финальная Precision: 0.7272
Финальный Recall: 0.7927
Финальный F1-Score: 0.7586


