# Лабораторная работа №3.1: Классификация текстов

## Загрузка

In [3]:
import numpy as np
import time
from datasets import load_dataset
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


In [None]:
def load_embeddings(path):
    doc_ids = []
    embeddings = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            doc_ids.append(parts[0])
            embeddings.append(list(map(float, parts[1:])))

    return doc_ids, np.array(embeddings)


In [None]:
def load_document_vectors(path):
    embeddings = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            vector = list(map(float, parts[1:]))
            embeddings.append(vector)

    return np.array(embeddings)


In [None]:
embeddings = load_document_vectors("document_vectors.tsv")

print(embeddings.shape)


(100, 100)


In [None]:
dataset = load_dataset("wangrongsheng/ag_news")
labels = np.array(dataset["train"]["label"])


README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]



data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [13]:
labels = labels[:embeddings.shape[0]]
print("Embeddings:", embeddings.shape[0])
print("Labels:", labels.shape[0])

Embeddings: 100
Labels: 100


## Метрики

In [None]:
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def precision_recall_f1(y_true, y_pred, num_classes):
    precisions = []
    recalls = []
    f1s = []

    for cls in range(num_classes):
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))

        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        f1 = 2 * precision * recall / (precision + recall + 1e-9)

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return (
        np.mean(precisions),
        np.mean(recalls),
        np.mean(f1s)
    )


## Эксперименты с SVM

In [None]:
def run_experiment(X, y, kernel, max_iter):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = SVC(
        kernel=kernel,
        max_iter=max_iter
    )

    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    y_pred = model.predict(X_test)

    acc = accuracy(y_test, y_pred)
    p, r, f1 = precision_recall_f1(y_test, y_pred, num_classes=4)

    return {
        "kernel": kernel,
        "max_iter": max_iter,
        "accuracy": acc,
        "precision": p,
        "recall": r,
        "f1": f1,
        "training_time": training_time
    }


In [14]:
kernels = ["linear", "rbf"]
iterations = [200, 500, 1000]

results = []

for kernel in kernels:
    for it in iterations:
        res = run_experiment(embeddings, labels, kernel, it)
        results.append(res)
        print(res)


{'kernel': 'linear', 'max_iter': 200, 'accuracy': 0.85, 'precision': 0.3472222221473765, 'recall': 0.31862745093877354, 'f1': 0.3285714282734694, 'training_time': 0.0036818981170654297}
{'kernel': 'linear', 'max_iter': 500, 'accuracy': 0.85, 'precision': 0.3472222221473765, 'recall': 0.31862745093877354, 'f1': 0.3285714282734694, 'training_time': 0.0009968280792236328}
{'kernel': 'linear', 'max_iter': 1000, 'accuracy': 0.85, 'precision': 0.3472222221473765, 'recall': 0.31862745093877354, 'f1': 0.3285714282734694, 'training_time': 0.0009975433349609375}
{'kernel': 'rbf', 'max_iter': 200, 'accuracy': 0.85, 'precision': 0.212499999989375, 'recall': 0.24999999998529412, 'f1': 0.22972972959313365, 'training_time': 0.0005142688751220703}
{'kernel': 'rbf', 'max_iter': 500, 'accuracy': 0.85, 'precision': 0.212499999989375, 'recall': 0.24999999998529412, 'f1': 0.22972972959313365, 'training_time': 0.0009963512420654297}
{'kernel': 'rbf', 'max_iter': 1000, 'accuracy': 0.85, 'precision': 0.212499

## Отбрасывание размерностей эмбеддингов

In [None]:
def drop_dimensions(X, drop_ratio):
    X_new = X.copy()
    dim = X.shape[1]
    drop_count = int(dim * drop_ratio)

    indices = np.random.choice(dim, drop_count, replace=False)
    X_new[:, indices] = 0.0

    return X_new


In [16]:
drop_ratios = [0.1, 0.3, 0.5, 0.7]

degradation_results = []

for ratio in drop_ratios:
    X_dropped = drop_dimensions(embeddings, ratio)
    res = run_experiment(
        X_dropped,
        labels,
        kernel="linear",
        max_iter=500
    )
    res["drop_ratio"] = ratio
    degradation_results.append(res)
    print(res)


{'kernel': 'linear', 'max_iter': 500, 'accuracy': 0.85, 'precision': 0.3472222221473765, 'recall': 0.31862745093877354, 'f1': 0.3285714282734694, 'training_time': 0.0009930133819580078, 'drop_ratio': 0.1}
{'kernel': 'linear', 'max_iter': 500, 'accuracy': 0.85, 'precision': 0.3472222221473765, 'recall': 0.31862745093877354, 'f1': 0.3285714282734694, 'training_time': 0.0014445781707763672, 'drop_ratio': 0.3}
{'kernel': 'linear', 'max_iter': 500, 'accuracy': 0.9, 'precision': 0.4736842102645429, 'recall': 0.3333333332908497, 'f1': 0.3611111108171296, 'training_time': 0.0015511512756347656, 'drop_ratio': 0.5}
{'kernel': 'linear', 'max_iter': 500, 'accuracy': 0.85, 'precision': 0.212499999989375, 'recall': 0.24999999998529412, 'f1': 0.22972972959313365, 'training_time': 0.0004963874816894531, 'drop_ratio': 0.7}
