In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from pathlib import Path

from text_vectorizer import TextVectorizer

In [3]:
BASE_DIR = Path("../..")  # подстрой при необходимости
TRAIN_DIR = BASE_DIR/  "lab1" / "assets" / "annotated-corpus" / "train"
TEST_DIR = BASE_DIR / "lab1" / "assets" / "annotated-corpus" / "test"
OUTPUT_DIR = BASE_DIR / "lab2" / "output"

In [4]:
TRAIN_DIR

WindowsPath('../../lab1/assets/annotated-corpus/train')

In [5]:
vectorizer = TextVectorizer()

sentences, doc_ids = vectorizer.load_corpus(TRAIN_DIR)

print(f"Документов: {len(doc_ids)}")
print(f"Предложений: {len(sentences)}")
print("Пример предложения:", sentences[0][:15])

INFO - Загрузка из neg...
INFO - Загрузка из pos...
INFO - Загружено предложений: 58376


Документов: 5000
Предложений: 58376
Пример предложения: ['rented', 'curious', 'yellow', 'video', 'store', 'controversy', 'surrounded', 'first', 'released']


In [6]:
vectorizer.train_word2vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    epochs=30
)

print("Размер словаря:", len(vectorizer.vocab))
print("Размерность вектора:", vectorizer.model.wv.vector_size)

INFO - Word2Vec: size=100, window=5, min_count=2, epochs=30
INFO - collecting all words and their counts
INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - PROGRESS: at sentence #10000, processed 97030 words, keeping 13394 word types
INFO - PROGRESS: at sentence #20000, processed 194465 words, keeping 18753 word types
INFO - PROGRESS: at sentence #30000, processed 289567 words, keeping 22545 word types
INFO - PROGRESS: at sentence #40000, processed 393325 words, keeping 25905 word types
INFO - PROGRESS: at sentence #50000, processed 500175 words, keeping 29265 word types
INFO - collected 31324 word types from a corpus of 587368 raw words and 58376 sentences
INFO - Creating a fresh vocabulary
INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=2 retains 19267 unique words (61.51% of original 31324, drops 12057)', 'datetime': '2026-01-11T12:37:19.266912', 'gensim': '4.4.0', 'python': '3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 6

Размер словаря: 19267
Размерность вектора: 100


In [7]:
pairs = [
    ("company", "firm"),
    ("company", "football"),
    ("president", "leader"),
    ("president", "computer"),
    ("game", "match"),
    ("game", "economy")
]

for w1, w2 in pairs:
    if vectorizer.word_exists(w1) and vectorizer.word_exists(w2):
        d = vectorizer.cosine_distance(w1, w2)
        print(f"{w1:12s} - {w2:12s} : {d:.4f}")


company      - firm         : 0.6640
company      - football     : 0.8624
president    - leader       : 0.6358
president    - computer     : 0.7861
game         - match        : 0.8129
game         - economy      : 0.7474


In [8]:
vectorizer.find_similar("president", topn=5)

[('declaring', 0.5939064025878906),
 ('truman', 0.5874928832054138),
 ('clinton', 0.5788647532463074),
 ('chemist', 0.5717450976371765),
 ('chavez', 0.568702757358551)]

In [9]:
example_doc = sentences[:10]  # первые несколько предложений

doc_vector = vectorizer.vectorize_document(example_doc)


print("Размерность вектора документа:", doc_vector.shape)
print("Первые 10 компонент:", doc_vector[:10])


Размерность вектора документа: (100,)
Первые 10 компонент: [-0.02026899 -0.06403017 -0.06056681  0.04577107  0.16468099 -0.31064433
 -0.0513248   0.14034262 -0.03002297 -0.12095964]


In [10]:
embeddings = vectorizer.vectorize_corpus(TEST_DIR)

print("Документов в test:", len(embeddings))

list(embeddings.items())[:1]


INFO - Векторизация neg...
neg: 100%|██████████| 625/625 [00:00<00:00, 2957.81it/s]
INFO - Векторизация pos...
pos: 100%|██████████| 625/625 [00:00<00:00, 3467.32it/s]

Документов в test: 1250





[('0',
  array([-0.10098692, -0.00471969, -0.10134963, -0.10463025,  0.10807797,
         -0.32576427, -0.00843867,  0.14228499, -0.0978757 , -0.26883346,
          0.04876848, -0.27228478, -0.08419684,  0.27713534,  0.15411104,
         -0.2536778 ,  0.00530113, -0.3045705 ,  0.04346852, -0.34369445,
          0.26753357,  0.02436877,  0.0727244 , -0.27379137, -0.06408948,
         -0.0278195 , -0.27042884,  0.01560871, -0.18718022, -0.17074893,
          0.30652958, -0.22429262,  0.07791907, -0.16026552, -0.1257699 ,
          0.25747067,  0.10882579, -0.10298365, -0.02632362, -0.27136743,
          0.01714168, -0.13812323, -0.12488966, -0.00538344,  0.26926103,
         -0.22147258, -0.02635036, -0.12747245,  0.03783369,  0.29377532,
          0.05218723, -0.21119767, -0.16407911, -0.04741263,  0.05374223,
         -0.04123229,  0.0659887 , -0.02810857, -0.3111162 ,  0.05300539,
          0.17389663,  0.0853562 ,  0.13796118,  0.20026925, -0.19769827,
          0.18487176,  0.151321

In [11]:
output_path = OUTPUT_DIR / "test_embeddings.tsv"

vectorizer.save_embeddings_tsv(embeddings, output_path)

output_path


INFO - Сохранено: ..\..\lab2\output\test_embeddings.tsv (документов: 1250)


WindowsPath('../../lab2/output/test_embeddings.tsv')

In [12]:
embeddings_train = vectorizer.vectorize_corpus(TRAIN_DIR)
output_path_train = OUTPUT_DIR / "train_embeddings.tsv"

vectorizer.save_embeddings_tsv(embeddings_train, output_path_train)

output_path_train

INFO - Векторизация neg...
neg: 100%|██████████| 2500/2500 [00:00<00:00, 2557.82it/s]
INFO - Векторизация pos...
pos: 100%|██████████| 2500/2500 [00:00<00:00, 2582.72it/s]
INFO - Сохранено: ..\..\lab2\output\train_embeddings.tsv (документов: 5000)


WindowsPath('../../lab2/output/train_embeddings.tsv')

In [13]:
with open(output_path, 'r', encoding='utf-8') as f:
    line = f.readline().strip().split('\t')

print("Doc ID:", line[0])
print("Vector length:", len(line) - 1)


Doc ID: 0
Vector length: 100


In [14]:
train_doc_ids, train_vocab, train_tdm, vect = vectorizer.build_term_document_matrix(
    TRAIN_DIR,
    max_features=3000
)

test_doc_ids, _, test_tdm, _ = vectorizer.build_term_document_matrix(
    TEST_DIR,
    max_features=3000,
    vectorizer=vect
)

print("Train TDM shape:", train_tdm.shape)
print("Test TDM shape:", test_tdm.shape)


INFO - Построение term-document matrix...
INFO - Документов: 5000
INFO - TDM built: (5000, 3000)
INFO - Построение term-document matrix...
INFO - Документов: 1250
INFO - TDM built: (1250, 3000)


Train TDM shape: (5000, 3000)
Test TDM shape: (1250, 3000)


In [15]:
vectorizer.save_tdm_tsv(
    train_doc_ids,
    train_tdm,
    OUTPUT_DIR / "train_tdm.tsv"
)

vectorizer.save_tdm_tsv(
    test_doc_ids,
    test_tdm,
    OUTPUT_DIR / "test_tdm.tsv"
)

vectorizer.save_vocab(
    train_vocab,
    OUTPUT_DIR / "vocab.txt"
)


INFO - TDM сохранена: ..\..\lab2\output\train_tdm.tsv
INFO - TDM сохранена: ..\..\lab2\output\test_tdm.tsv
INFO - Словарь сохранён: ..\..\lab2\output\vocab.txt
