In [173]:
%pip install -q numpy scikit-learn pandas gensim

Note: you may need to restart the kernel to use updated packages.


In [174]:
import numpy as np
import os
import time
import json
import warnings
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from gensim.models import Word2Vec
import pandas as pd

In [175]:
lab1_path = "C:/Users/Paul/Projects/nlp-25/projects/pn-pren/lab1"
lab2_path = "C:/Users/Paul/Projects/nlp-25/projects/pn-pren/lab2"

embeddings_w2v = []
labels = []
doc_names = []

with open(os.path.join(lab2_path, "test_embeddings.tsv"), 'r') as f:
    for line in f:
        parts = line.strip().split('\t')
        doc_id = parts[0]
        vec = np.array([float(x) for x in parts[1:]])
        embeddings_w2v.append(vec)
        doc_names.append(doc_id)

embeddings_w2v = np.array(embeddings_w2v)
print(f"векторов w2v: {embeddings_w2v.shape}")

векторов w2v: (7600, 100)


In [176]:
# собираем метки для тестовой выборки
test_path = os.path.join(lab1_path, 'test')
class_names = ['Business', 'Sci_Tech', 'Sports', 'World']
class_to_id = {name: i for i, name in enumerate(class_names)}

test_labels = []
for class_name in class_names:
    class_path = os.path.join(test_path, class_name)
    files = sorted([f for f in os.listdir(class_path) if f.endswith('.tsv')])
    test_labels.extend([class_to_id[class_name]] * len(files))

test_labels = np.array(test_labels)
print(f"меток: {len(test_labels)}")

меток: 7600


In [177]:
train_path = os.path.join(lab1_path, 'train')
train_embeddings_file = os.path.join(lab2_path, "train_embeddings.npy")
train_labels_file = os.path.join(lab2_path, "train_labels.npy")

if os.path.exists(train_embeddings_file) and os.path.exists(train_labels_file):
    train_vecs_w2v = np.load(train_embeddings_file)
    train_labels = np.load(train_labels_file)
else:
    from gensim.models import Word2Vec
    
    w2v_model = Word2Vec.load(os.path.join(lab2_path, "word2vec.model"))
    
    def load_lemmas_from_tsv(file_path):
        lemmas = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) >= 3:
                    lemmas.append(parts[2])
        return lemmas
    
    def vectorize_w2v(lemmas, model):
        vectors = [model.wv[l] for l in lemmas if l in model.wv]
        if not vectors:
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)
    
    train_vecs_w2v = []
    train_labels = []
    
    docs_per_class = 200
    np.random.seed(42)
    
    for class_name in class_names:
        class_path = os.path.join(train_path, class_name)
        all_files = [f for f in os.listdir(class_path) if f.endswith('.tsv')]
        selected_files = np.random.choice(all_files, size=docs_per_class, replace=False)
        
        for fname in selected_files:
            lemmas = load_lemmas_from_tsv(os.path.join(class_path, fname))
            train_vecs_w2v.append(vectorize_w2v(lemmas, w2v_model))
            train_labels.append(class_to_id[class_name])
    
    train_vecs_w2v = np.array(train_vecs_w2v)
    train_labels = np.array(train_labels)
    
    np.save(train_embeddings_file, train_vecs_w2v)
    np.save(train_labels_file, train_labels)

print(f"тест: {embeddings_w2v.shape}")
print(f"трейн: {train_vecs_w2v.shape}")

тест: (7600, 100)
трейн: (800, 100)


In [178]:
def calculate_metrics(y_true, y_pred, num_classes=4):
    precision_per_class = []
    recall_per_class = []
    f1_per_class = []
    
    for cls in range(num_classes):
        tp = ((y_true == cls) & (y_pred == cls)).sum()
        fp = ((y_true != cls) & (y_pred == cls)).sum()
        fn = ((y_true == cls) & (y_pred != cls)).sum()
        
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
        
        precision_per_class.append(prec)
        recall_per_class.append(rec)
        f1_per_class.append(f1)
    
    precision = np.mean(precision_per_class)
    recall = np.mean(recall_per_class)
    f1 = np.mean(f1_per_class)
    accuracy = (y_true == y_pred).sum() / len(y_true)
    
    return precision, recall, f1, accuracy

In [179]:
results = []
max_iters = [100, 500, 1000]

print("SVM Linear:")
for max_iter in max_iters:
    start = time.time()
    clf = LinearSVC(max_iter=max_iter, random_state=42)
    clf.fit(train_vecs_w2v, train_labels)
    train_time = time.time() - start
    
    y_pred = clf.predict(embeddings_w2v)
    prec, rec, f1, acc = calculate_metrics(test_labels, y_pred)
    
    results.append({
        'model': 'SVM-linear',
        'vectors': 'w2v',
        'max_iter': max_iter,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'accuracy': acc,
        'train_time': train_time
    })
    
    print(f"iter={max_iter:4d} | prec={prec:.4f} | rec={rec:.4f} | f1={f1:.4f} | acc={acc:.4f} | time={train_time:.2f}s")

SVM Linear:
iter= 100 | prec=0.8420 | rec=0.8426 | f1=0.8421 | acc=0.8426 | time=0.01s
iter= 500 | prec=0.8420 | rec=0.8426 | f1=0.8421 | acc=0.8426 | time=0.01s
iter=1000 | prec=0.8420 | rec=0.8426 | f1=0.8421 | acc=0.8426 | time=0.01s


In [180]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [181]:
print("SVM RBF:")
for max_iter in [100, 500, 1000]:
    start = time.time()
    clf = SVC(kernel='rbf', max_iter=max_iter, random_state=42)
    clf.fit(train_vecs_w2v, train_labels)
    train_time = time.time() - start
    
    y_pred = clf.predict(embeddings_w2v)
    prec, rec, f1, acc = calculate_metrics(test_labels, y_pred)
    
    results.append({
        'model': 'SVM-rbf',
        'vectors': 'w2v',
        'max_iter': max_iter,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'accuracy': acc,
        'train_time': train_time
    })
    
    print(f"iter={max_iter:4d} | prec={prec:.4f} | rec={rec:.4f} | f1={f1:.4f} | acc={acc:.4f} | time={train_time:.2f}s")

SVM RBF:
iter= 100 | prec=0.8583 | rec=0.8586 | f1=0.8581 | acc=0.8586 | time=0.01s
iter= 500 | prec=0.8584 | rec=0.8588 | f1=0.8584 | acc=0.8588 | time=0.01s
iter=1000 | prec=0.8584 | rec=0.8588 | f1=0.8584 | acc=0.8588 | time=0.01s


In [182]:
print("\nSVM Poly:")
for max_iter in [100, 500, 1000]:
    start = time.time()
    clf = SVC(kernel='poly', degree=3, max_iter=max_iter, random_state=42)
    clf.fit(train_vecs_w2v, train_labels)
    train_time = time.time() - start
    
    y_pred = clf.predict(embeddings_w2v)
    prec, rec, f1, acc = calculate_metrics(test_labels, y_pred)
    
    results.append({
        'model': 'SVM-poly',
        'vectors': 'w2v',
        'max_iter': max_iter,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'accuracy': acc,
        'train_time': train_time
    })
    
    print(f"iter={max_iter:4d} | prec={prec:.4f} | rec={rec:.4f} | f1={f1:.4f} | acc={acc:.4f} | time={train_time:.2f}s")


SVM Poly:
iter= 100 | prec=0.8594 | rec=0.8599 | f1=0.8595 | acc=0.8599 | time=0.01s
iter= 500 | prec=0.8613 | rec=0.8614 | f1=0.8611 | acc=0.8614 | time=0.01s
iter=1000 | prec=0.8613 | rec=0.8614 | f1=0.8611 | acc=0.8614 | time=0.01s


In [183]:
df_results = pd.DataFrame(results)
print(df_results.to_string(index=False))

     model vectors  max_iter  precision   recall       f1  accuracy  train_time
SVM-linear     w2v       100   0.841968 0.842632 0.842135  0.842632    0.009505
SVM-linear     w2v       500   0.841968 0.842632 0.842135  0.842632    0.009510
SVM-linear     w2v      1000   0.841968 0.842632 0.842135  0.842632    0.008011
   SVM-rbf     w2v       100   0.858342 0.858553 0.858119  0.858553    0.007022
   SVM-rbf     w2v       500   0.858435 0.858816 0.858386  0.858816    0.007708
   SVM-rbf     w2v      1000   0.858435 0.858816 0.858386  0.858816    0.008003
  SVM-poly     w2v       100   0.859385 0.859868 0.859495  0.859868    0.006510
  SVM-poly     w2v       500   0.861274 0.861447 0.861116  0.861447    0.007153
  SVM-poly     w2v      1000   0.861274 0.861447 0.861116  0.861447    0.006510


In [184]:
best_result = df_results.loc[df_results['f1'].idxmax()]
print(f"\nЛучший результат по F1:")
print(f"Модель: {best_result['model']}")
print(f"Векторы: {best_result['vectors']}")
print(f"Итерации: {best_result['max_iter']}")
print(f"F1: {best_result['f1']:.4f}")
print(f"Accuracy: {best_result['accuracy']:.4f}")


Лучший результат по F1:
Модель: SVM-poly
Векторы: w2v
Итерации: 500
F1: 0.8611
Accuracy: 0.8614


In [187]:
best_result = df_results.loc[df_results['accuracy'].idxmax()]
print(f"\nЛучший результат по accuracy:")
print(f"Модель: {best_result['model']}")
print(f"Векторы: {best_result['vectors']}")
print(f"Итерации: {best_result['max_iter']}")
print(f"F1: {best_result['f1']:.4f}")
print(f"Accuracy: {best_result['accuracy']:.4f}")


Лучший результат по accuracy:
Модель: SVM-poly
Векторы: w2v
Итерации: 500
F1: 0.8611
Accuracy: 0.8614


In [185]:
print("Отбрасывание случайных размерностей:")

best_iter = int(best_result['max_iter'])
drop_dims = [10, 20, 30, 40, 50]

np.random.seed(42)

for n_drop in drop_dims:
    keep_indices = np.random.choice(train_vecs_w2v.shape[1], 
                                   train_vecs_w2v.shape[1] - n_drop, 
                                   replace=False)
    
    X_train_mod = train_vecs_w2v[:, keep_indices]
    X_test_mod = embeddings_w2v[:, keep_indices]
    
    start = time.time()
    clf = LinearSVC(max_iter=best_iter, random_state=42)
    clf.fit(X_train_mod, train_labels)
    train_time = time.time() - start
    
    y_pred = clf.predict(X_test_mod)
    prec, rec, f1, acc = calculate_metrics(test_labels, y_pred)
    
    print(f"отброшено={n_drop:2d} | осталось={X_train_mod.shape[1]:3d} | f1={f1:.4f} | acc={acc:.4f}")

Отбрасывание случайных размерностей:
отброшено=10 | осталось= 90 | f1=0.8426 | acc=0.8432
отброшено=20 | осталось= 80 | f1=0.8342 | acc=0.8349
отброшено=30 | осталось= 70 | f1=0.8279 | acc=0.8288
отброшено=40 | осталось= 60 | f1=0.8197 | acc=0.8208
отброшено=50 | осталось= 50 | f1=0.8219 | acc=0.8225
