# Lab 2 - Text Vectorization (Векторизация текста)

This notebook demonstrates the complete text vectorization pipeline:
1. Task 1 (Optional): Build token dictionary and term-document matrix
2. Task 2 (Optional): Basic vectorization methods
3. Task 3: Neural network vectorization (Word2Vec)
4. Task 4: Cosine similarity demonstrations
5. Task 5 (Optional): Dimensionality reduction
6. Task 6 (Optional): Compare methods
7. Task 7: Document vectorization pipeline
8. Task 8: Vectorize test set and save in TSV format



In [1]:
# Import necessary libraries
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add source directory to path
sys.path.append('source')

from source.data_loader import load_corpus, get_all_tokens, get_sentences_as_tokens
from source.token_dictionary import TokenDictionary
from source.basic_vectorization import BasicVectorizer
from source.neural_vectorization import NeuralVectorizer
from source.cosine_similarity import cosine_distance, demonstrate_semantic_similarity, find_most_similar_words
from source.document_vectorizer import DocumentVectorizer
from source.dimensionality_reduction import DimensionalityReducer

# Set up plotting style.
# Try the versioned seaborn style sheet first and fall back to the legacy
# name; if neither is available, keep matplotlib's default style.
for style_name in ('seaborn-v0_8', 'seaborn'):
    try:
        plt.style.use(style_name)
    except OSError:
        # matplotlib raises OSError for a style name it cannot resolve.
        # Catching only that keeps the best-effort fallback without hiding
        # unrelated failures (or KeyboardInterrupt) the way a bare except did.
        continue
    break
sns.set_palette("husl")

print("Libraries imported successfully!")



Libraries imported successfully!


## Task 1 (Optional): Build Token Dictionary and Term-Document Matrix

Load the training data, then build a token dictionary with frequencies and a term-document matrix.



In [2]:
# Read the 'train' split of the annotated corpus and request lemmas
# (use_lemma=True) for every document.
lab1_corpus_dir = "../lab1/assets/annotated_corpus"
print("Loading training data...")
train_docs, train_labels, train_ids = load_corpus(lab1_corpus_dir, split='train')
train_tokens = get_all_tokens(train_docs, use_lemma=True)

print(f"Loaded {len(train_docs)} training documents")
total_token_count = sum(map(len, train_tokens))
print(f"Total tokens: {total_token_count}")

# Token dictionary: tokens seen fewer than 2 times are dropped, and stopword
# and punctuation removal are both switched on.
print("\nBuilding token dictionary...")
token_dict = TokenDictionary(
    min_frequency=2,
    remove_stopwords=True,
    remove_punctuation=True,
)
token_dict.build_dictionary(train_tokens)

# Term-document matrix over the same tokenized documents.
print("\nBuilding term-document matrix...")
token_dict.build_term_document_matrix(train_tokens)

# Persist both artifacts, creating the output directories on first run.
for output_dir in ('assets/dictionaries', 'assets/matrices'):
    os.makedirs(output_dir, exist_ok=True)
token_dict.save_dictionary('assets/dictionaries/token_dictionary.json')
token_dict.save_term_document_matrix('assets/matrices/term_document_matrix.pkl')

vocab_size = token_dict.get_vocab_size()
print(f"\nVocabulary size: {vocab_size}")
print("Dictionary and matrix saved!")



Loading training data...


Loading train data: 100%|██████████| 121884/121884 [00:35<00:00, 3472.75files/s]


Loaded 121884 training documents
Total tokens: 3964419

Building token dictionary...
Built dictionary with 35311 tokens
Total tokens processed: 2466038
Tokens filtered out (frequency < 2): 19589

Building term-document matrix...
Built term-document matrix: (35311, 121884)
Matrix density: 0.0005
Saved dictionary to assets/dictionaries/token_dictionary.json
Saved term-document matrix to assets/matrices/term_document_matrix.pkl

Vocabulary size: 35311
Dictionary and matrix saved!


## Task 3: Neural Network Vectorization (Word2Vec)

Train a Word2Vec model on the training data.



In [3]:
# Word2Vec is trained on one flat list of items, so concatenate the
# per-document lists returned by get_sentences_as_tokens.
train_sentences = get_sentences_as_tokens(train_docs, use_lemma=True)
train_sentences_flat = [
    sentence
    for doc_sentences in train_sentences
    for sentence in doc_sentences
]

print(f"Total sentences for training: {len(train_sentences_flat)}")

# Hyperparameters for the model, gathered in one place.
word2vec_params = dict(
    model_type='word2vec',
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=0,  # CBOW
)
neural_model = NeuralVectorizer(**word2vec_params)

neural_model.train(train_sentences_flat, epochs=10)

# Store the trained model next to the other lab artifacts.
os.makedirs('assets/models', exist_ok=True)
neural_model.save('assets/models/word2vec_model.model')

trained_vocab_size = len(neural_model.wv)
print(f"\nModel trained! Vocabulary size: {trained_vocab_size}")



Total sentences for training: 121884
Training word2vec model...
  Vector size: 100
  Window: 5
  Min count: 2
  Training algorithm: CBOW
  Number of documents: 121884
Training completed! Vocabulary size: 43609
Saved model to assets/models/word2vec_model.model

Model trained! Vocabulary size: 43609


## Task 4: Demonstrate Cosine Similarity

Show that semantically close words have a smaller cosine distance than unrelated words.



In [4]:
# Probe words for the demonstration. Each base word maps to three groups of
# comparison words: 'similar', 'related' and 'distant'.
test_words = dict(
    sport=dict(
        similar=['game', 'athlete', 'player', 'match'],
        related=['team', 'competition', 'football', 'basketball'],
        distant=['computer', 'philosophy', 'mathematics', 'sentence'],
    ),
    technology=dict(
        similar=['computer', 'software', 'digital', 'electronic'],
        related=['internet', 'device', 'system', 'network'],
        distant=['sport', 'animal', 'cooking', 'music'],
    ),
    business=dict(
        similar=['company', 'market', 'trade', 'commerce'],
        related=['economy', 'finance', 'investment', 'profit'],
        distant=['nature', 'art', 'science', 'philosophy'],
    ),
)

# Run the comparison against the trained model; the helper prints the
# distances for each group and its return value is kept in `results`.
results = demonstrate_semantic_similarity(neural_model, test_words)




Base word: 'sport'

Similar words:
  athlete              - distance: 0.3720
  player               - distance: 0.6063
  game                 - distance: 0.8501
  match                - distance: 0.9130

Related words:
  football             - distance: 0.4490
  competition          - distance: 0.5827
  basketball           - distance: 0.5923
  team                 - distance: 0.6785

Distant words:
  computer             - distance: 0.7200
  philosophy           - distance: 0.7905
  mathematics          - distance: 0.8618
  sentence             - distance: 1.0975

Base word: 'technology'

Similar words:
  software             - distance: 0.3881
  digital              - distance: 0.4694
  computer             - distance: 0.5214
  electronic           - distance: 0.5690

Related words:
  device               - distance: 0.4066
  network              - distance: 0.4311
  system               - distance: 0.4526
  internet             - distance: 0.6902

Distant words:
  sport            

In [5]:
# Ask the find_most_similar_words helper for the 10 nearest neighbours of a
# single probe word and list them with their similarity scores.
test_word = 'sport'
similar_words = find_most_similar_words(neural_model, test_word, top_n=10)

print(f"\nMost similar words to '{test_word}':")
for neighbour, score in similar_words:
    print(f"  {neighbour:20s} - similarity: {score:.4f}")




Most similar words to 'sport':
  art                  - similarity: 0.6502
  athlete              - similarity: 0.6280
  talent               - similarity: 0.6080
  wine                 - similarity: 0.5780
  baseball             - similarity: 0.5728
  spirit               - similarity: 0.5691
  elite                - similarity: 0.5526
  arena                - similarity: 0.5513
  football             - similarity: 0.5510
  science              - similarity: 0.5482


## Task 7: Document Vectorization Pipeline

Implement complete document vectorization: sentences → tokens → token vectors → sentence vectors → document vector.



In [6]:
# Wrap the trained word-embedding model in the document-level vectorizer.
doc_vectorizer = DocumentVectorizer(neural_model)

# Two-sentence sample used to walk through the pipeline.
sample_text = "The technology company announced new software products. The digital market is growing rapidly."

# Step 1: show how the text is segmented.
sentences = doc_vectorizer.segment_text(sample_text)
print("Segmented sentences:")
for sentence_number, sentence_tokens in enumerate(sentences, start=1):
    print(f"  Sentence {sentence_number}: {sentence_tokens}")

# Step 2: collapse the whole text into one vector (TF-IDF weighting off).
doc_vector = doc_vectorizer.vectorize_document(sample_text, use_tfidf_weights=False)
leading_components = doc_vector[:10]
print(f"\nDocument vector shape: {doc_vector.shape}")
print(f"Document vector (first 10 components): {leading_components}")



Segmented sentences:
  Sentence 1: ['the', 'technology', 'company', 'announced', 'new', 'software', 'products']
  Sentence 2: ['the', 'digital', 'market', 'is', 'growing', 'rapidly']

Document vector shape: (100,)
Document vector (first 10 components): [-0.19103011 -0.69757634 -0.18636397 -0.3347658  -0.3205855  -0.53389674
  1.3607087  -1.139126    0.7254089  -0.41075262]


## Task 8: Vectorize Test Set and Save in TSV Format

Vectorize all test documents and save embeddings in TSV format.



In [7]:
# Progress-bar dependency, imported before any work starts so a missing
# package fails the cell immediately instead of after the corpus is loaded.
from tqdm import tqdm

# Load the 'test' split from the same corpus directory used for training.
print("Loading test data...")
test_docs, test_labels, test_ids = load_corpus(lab1_corpus_dir, split='test')
test_sentences = get_sentences_as_tokens(test_docs, use_lemma=True)

print(f"Loaded {len(test_docs)} test documents")

# Vectorize every test document from its pre-tokenized sentences, keeping the
# document IDs in the same order as the vectors.
print("\nVectorizing test documents...")
embeddings = []
valid_doc_ids = []

for doc_sentences, doc_id in tqdm(zip(test_sentences, test_ids), total=len(test_docs)):
    doc_vector = doc_vectorizer.vectorize_document_from_tokens(
        doc_sentences,
        use_tfidf_weights=False
    )
    embeddings.append(doc_vector)
    valid_doc_ids.append(doc_id)

embeddings = np.array(embeddings)

# zip() stops at the shorter input without raising, so confirm that every
# loaded document produced exactly one (ID, vector) pair.
assert embeddings.shape[0] == len(test_docs), (
    f"vectorized {embeddings.shape[0]} documents but loaded {len(test_docs)}"
)
# The export step reads embeddings.shape[1]; fail here, close to the cause,
# if the vectors did not stack into a 2-D array.
assert embeddings.ndim == 2, (
    f"expected a 2-D embedding matrix, got shape {embeddings.shape}"
)

print(f"\nGenerated embeddings shape: {embeddings.shape}")



Loading test data...


Loading test data: 100%|██████████| 7600/7600 [00:02<00:00, 3157.70files/s]


Loaded 7600 test documents

Vectorizing test documents...


100%|██████████| 7600/7600 [00:03<00:00, 2489.64it/s]


Generated embeddings shape: (7600, 100)





In [8]:
# Save embeddings in TSV format: one document per line, the document ID in
# the first column followed by the vector components with 6 decimal places.
os.makedirs('assets/embeddings', exist_ok=True)
output_file = 'assets/embeddings/test_embeddings.tsv'

with open(output_file, 'w', encoding='utf-8') as f:
    for doc_id, embedding in zip(valid_doc_ids, embeddings):
        embedding_str = '\t'.join([f"{val:.6f}" for val in embedding])
        f.write(f"{doc_id}\t{embedding_str}\n")

print(f"Saved embeddings to {output_file}")
print(f"Total documents: {len(valid_doc_ids)}")
print(f"Embedding dimension: {embeddings.shape[1]}")

# Show first few lines as a sanity check of the written file.
print("\nFirst 3 lines of output file:")
with open(output_file, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 3:
            # Only a preview is needed; stop instead of scanning the rest
            # of the file.
            break
        parts = line.strip().split('\t')
        print(f"  Doc ID: {parts[0]}, First 5 components: {parts[1:6]}")



Saved embeddings to assets/embeddings/test_embeddings.tsv
Total documents: 7600
Embedding dimension: 100

First 3 lines of output file:
  Doc ID: 1003, First 5 components: ['-0.775902', '-0.199960', '0.406633', '-0.609266', '-0.205569']
  Doc ID: 1017, First 5 components: ['1.457490', '-4.045797', '1.432831', '-2.652643', '0.766079']
  Doc ID: 1021, First 5 components: ['-0.218004', '-0.991015', '-0.162545', '-0.767218', '-0.116455']
