In [1]:
from pathlib import Path

from svm_utils import (
    load_embeddings_tsv,
    build_dataset,
    run_linear_svm,
    run_rbf_svm,
    drop_dimensions,
)


In [2]:
# All locations are resolved relative to a base two directory levels above
# the notebook's working directory.
BASE_DIR = Path("../..")

# Annotated corpus produced in lab1, one directory per split.
TRAIN_DIR = BASE_DIR.joinpath("lab1", "assets", "annotated-corpus", "train")
TEST_DIR = BASE_DIR.joinpath("lab1", "assets", "annotated-corpus", "test")

# Embedding tables (TSV) written to the lab2 output directory, one per split.
EMB_TRAIN_PATH = BASE_DIR.joinpath("lab2", "output", "train_embeddings.tsv")
EMB_TEST_PATH = BASE_DIR.joinpath("lab2", "output", "test_embeddings.tsv")


In [3]:
# Load the precomputed embedding tables for both splits.
embeddings_train = load_embeddings_tsv(EMB_TRAIN_PATH)
embeddings_test = load_embeddings_tsv(EMB_TEST_PATH)

# Combine the embeddings with the annotated corpus directories to obtain the
# feature matrix, the target vector and the list of class labels per split.
X_train, y_train, labels = build_dataset(embeddings_train, TRAIN_DIR)
# Keep the test split's label list rather than discarding it into `_`
# (`_` is also IPython's "last output" variable) so it can be validated below.
X_test, y_test, labels_test = build_dataset(embeddings_test, TEST_DIR)

# Sanity checks: both splits must describe the same classes in the same order
# and share the feature dimensionality, otherwise test metrics are meaningless.
# NOTE(review): presumably the label order defines how y is encoded —
# confirm in svm_utils.build_dataset.
assert list(labels_test) == list(labels), (
    f"train/test label mismatch: {labels} vs {labels_test}"
)
assert X_train.shape[1] == X_test.shape[1], (
    f"train/test feature width mismatch: {X_train.shape} vs {X_test.shape}"
)

print("Train:", X_train.shape)
print("Test:", X_test.shape)
print("Labels:", labels)


Train: (5000, 100)
Test: (1250, 100)
Labels: ['neg', 'pos']


In [4]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local helper modules are picked up
# without restarting the kernel.
# NOTE(review): this kind of setup conventionally lives in the first/config
# cell, ahead of the imports it is meant to track — consider moving it up.
%load_ext autoreload
%autoreload 2

In [5]:
# Train the linear SVM once per iteration cap and collect the result dicts
# in sweep order.
results_linear = [
    run_linear_svm(X_train, y_train, X_test, y_test, max_iter=iters)
    for iters in (50, 100, 500)
]

# Emit one result dict per line.
print(*results_linear, sep="\n")


{'precision': 0.8328690237194258, 'recall': 0.8327999999986675, 'f1': 0.8327913314013393, 'accuracy': 0.8328, 'model': 'LinearSVM', 'kernel': 'linear', 'max_iter': 50, 'n_iter': 5, 'training_time': 0.09451889991760254}
{'precision': 0.8328690237194258, 'recall': 0.8327999999986675, 'f1': 0.8327913314013393, 'accuracy': 0.8328, 'model': 'LinearSVM', 'kernel': 'linear', 'max_iter': 100, 'n_iter': 5, 'training_time': 0.05890369415283203}
{'precision': 0.8328690237194258, 'recall': 0.8327999999986675, 'f1': 0.8327913314013393, 'accuracy': 0.8328, 'model': 'LinearSVM', 'kernel': 'linear', 'max_iter': 500, 'n_iter': 5, 'training_time': 0.04053139686584473}


In [6]:
# Iteration cap handed to the RBF-kernel SVM run.
RBF_MAX_ITER = 1000

# Train and evaluate the RBF-kernel SVM on the full-dimensional embeddings.
rbf_result = run_rbf_svm(
    X_train, y_train,
    X_test, y_test,
    max_iter=RBF_MAX_ITER,
)

print(rbf_result)

# The result dict reports how many iterations were actually performed. When
# that number reaches the cap, training was cut off rather than finished, so
# make it visible instead of silently reporting the metrics.
# NOTE(review): assumes 'n_iter' is a scalar iteration count, as in the
# recorded output — confirm in svm_utils.run_rbf_svm.
if rbf_result["n_iter"] >= RBF_MAX_ITER:
    print(
        f"WARNING: RBF SVM reached the iteration cap (max_iter={RBF_MAX_ITER}); "
        "the model may not have converged - consider raising max_iter "
        "or scaling the features."
    )




{'precision': 0.834300440869931, 'recall': 0.8167999999986931, 'f1': 0.814370600737226, 'accuracy': 0.8168, 'model': 'SVM', 'kernel': 'rbf', 'max_iter': 1000, 'n_iter': 1000, 'training_time': 0.5767524242401123}


In [7]:
# 100 iterations is sufficient: the model converges before reaching this cap.
best_iter = 100

# Repeat the linear SVM experiment on reduced-dimensionality embeddings.
# NOTE(review): judging by the recorded scores, `dim` looks like the number of
# dimensions retained by drop_dimensions — confirm in svm_utils.
for dim in (20, 50, 80):
    # Apply the same reduction to the train split first, then the test split.
    Xtr, Xte = (drop_dimensions(split, dim) for split in (X_train, X_test))

    res = run_linear_svm(Xtr, y_train, Xte, y_test, max_iter=best_iter)

    print(f"dim={dim}", res)


dim=20 {'precision': 0.7371211189028831, 'recall': 0.7367999999988212, 'f1': 0.7367108603280328, 'accuracy': 0.7368, 'model': 'LinearSVM', 'kernel': 'linear', 'max_iter': 100, 'n_iter': 5, 'training_time': 0.010999441146850586}
dim=50 {'precision': 0.8051124351322868, 'recall': 0.8047999999987123, 'f1': 0.8047500155029373, 'accuracy': 0.8048, 'model': 'LinearSVM', 'kernel': 'linear', 'max_iter': 100, 'n_iter': 5, 'training_time': 0.017006874084472656}
dim=80 {'precision': 0.8208821458280184, 'recall': 0.8207999999986868, 'f1': 0.8207885299646684, 'accuracy': 0.8208, 'model': 'LinearSVM', 'kernel': 'linear', 'max_iter': 100, 'n_iter': 5, 'training_time': 0.02500128746032715}
