In [28]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make sure the output directory for saved artifacts exists before anything
# tries to write into it; exist_ok keeps re-running this cell harmless.
MODELS_DIR = "models"
os.makedirs(MODELS_DIR, exist_ok=True)

print("Библиотеки загружены.")


Библиотеки загружены.


In [None]:
# Load the news dataset; the path is relative to the notebook's working directory.
dataset_path = "data/news_dataset.csv"
df = pd.read_csv(dataset_path)

# Quick sanity check: overall shape and the distinct values of the target column.
print("Размер датасета:", df.shape)
unique_categories = df["category"].unique()
print("\nУникальные категории:")
print(unique_categories)

# Bare last expression so the notebook renders the first rows as a table.
df.head()

In [None]:
# Bar chart of the number of rows per category, to eyeball class balance.
category_counts = df["category"].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
category_counts.plot(kind="bar", ax=ax)
ax.set_title("Распределение категорий")
ax.set_ylabel("Количество")
plt.show()

In [None]:
# Features are the raw text column; targets are the category labels.
X, y = df["text"], df["category"]

# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and seeded so that it is the same on every run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition.
for split_label, split_part in (("Train:", X_train), ("Test:", X_test)):
    print(split_label, split_part.shape)

In [None]:
# TF-IDF settings: unigrams and bigrams, terms must occur in at least 3
# documents, and the vocabulary is capped at 10000 features.
tfidf_options = dict(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=3,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training texts only; the test texts are
# merely projected onto it, so nothing from the test split leaks into fitting.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

train_matrix_shape = X_train_vec.shape
print("Размер обучающей матрицы:", train_matrix_shape)


In [None]:
# Candidate classifiers, all trained on the same TF-IDF features.
#
# LinearSVC's solver uses a random number generator, so without a fixed seed
# the fitted model (and therefore its accuracy and the "best model" choice
# made later) can differ slightly between runs. Pinning random_state keeps
# Restart & Run All reproducible.
SEED = 42

models = {
    "LinearSVC": LinearSVC(random_state=SEED),
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "MultinomialNB": MultinomialNB()
}

# Maps model name -> accuracy on the held-out test split.
results = {}

# NOTE(review): the accuracies below are measured on the test split, and the
# same numbers are later used to pick the best model, so the winner's reported
# test score is optimistically biased. Consider selecting on a validation
# split or with cross-validation on the training data.
for name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

# Bare last expression so the notebook displays the accuracy dict.
results

In [None]:
# Side-by-side test accuracy of every trained model.
model_names = list(results.keys())
model_scores = list(results.values())

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(model_names, model_scores)
ax.set_title("Сравнение моделей")
ax.set_ylabel("Accuracy")
plt.show()

In [None]:
# Take the model with the highest recorded accuracy. On a tie, max() keeps
# the first maximal entry in the dict's iteration order.
best_model_name = max(results.items(), key=lambda pair: pair[1])[0]
print("Лучшая модель:", best_model_name)

# Reuse the estimator that was already fitted in the training loop.
best_model = models[best_model_name]


In [None]:
# Recompute predictions with the selected model: y_pred left over from the
# training loop belongs to whichever model was fitted last, not the best one.
y_pred = best_model.predict(X_test_vec)

# Per-class metrics for the selected model on the test split.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n")
print(report_text)


In [None]:
# Confusion matrix of the selected model on the test split.
#
# The label order is made explicit so the heatmap axes can show category
# names instead of bare positional indices. Sorting the union of the true and
# predicted labels is the same ordering confusion_matrix uses by default, so
# the matrix values themselves are unchanged.
class_labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(8,6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,  # columns = predicted class
    yticklabels=class_labels,  # rows = actual class
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:
# Persist the selected classifier together with the fitted vectorizer; both
# are needed to score new text later. The model is written first, then the
# vectorizer.
artifacts = {
    "models/model2.pkl": best_model,
    "models/vectorizer2.pkl": vectorizer,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Модель сохранена.")


In [None]:
# One row per model: its name and its accuracy on the test split.
metric_rows = list(results.items())
metrics_df = pd.DataFrame(metric_rows, columns=["model", "accuracy"])

# NOTE(review): this overwrites the CSV on every run; if the file is meant to
# accumulate a history across runs (as its name suggests), it would need to
# append instead. Confirm intent.
metrics_df.to_csv("models/metrics_history.csv", index=False)

# Bare last expression so the notebook renders the table.
metrics_df
