In [1]:
!pip install gensim tqdm


You should consider upgrading via the '/Users/maksim/Документы/jupiter/venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import re
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from tqdm import tqdm




In [3]:
from pathlib import Path

BASE_CORPUS_DIR = Path("projects/ms-pynko/assets/annotated-corpus")

def tokens_from_tsv(tsv_path: Path, col: int = 1, drop_punct: bool = False):
    """
    Read one annotated TSV file and return the tokens stored in column ``col``.

    Args:
        tsv_path: path to the .tsv file (a ``Path`` or a plain string).
        col: zero-based index of the column that is used as the token.
        drop_punct: when True, skip tokens that contain no letter or digit.

    Returns:
        List of non-empty token strings, in file order. Blank lines and rows
        that have no column ``col`` are skipped.
    """
    tokens = []
    # errors="ignore" is a deliberate best-effort choice: undecodable bytes are dropped.
    with Path(tsv_path).open("r", encoding="utf-8", errors="ignore") as f:
        for raw_line in f:
            # Remove only the line terminator. A full strip() would also eat
            # leading/trailing tabs, so a row with an empty first column would
            # have all of its columns shifted by one.
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue  # blank lines do occur
            if "\t" in line:
                parts = line.split("\t")
            else:
                # safety net for rows that are separated by spaces instead of tabs
                parts = line.split()
            if len(parts) <= col:
                continue
            tok = parts[col].strip()
            if not tok:
                continue
            # keep only tokens that contain at least one letter or digit
            if drop_punct and not any(ch.isalnum() for ch in tok):
                continue
            tokens.append(tok)
    return tokens


In [5]:
# Sanity check on a single document: confirm the file is reachable and
# look at the first tokens taken from column 1.
sample_file = BASE_CORPUS_DIR.joinpath("train", "1", "000492.tsv")
print(sample_file.exists(), sample_file)
sample_tokens = tokens_from_tsv(sample_file, col=1)
print(sample_tokens[:20])


True projects/ms-pynko/assets/annotated-corpus/train/1/000492.tsv
['venezuelan', 'vote', 'earli', 'in', 'referendum', 'on', 'chavez', 'rule', '(', 'reuter', ')', '.', 'reuter', '-', 'venezuelan', 'turn', 'out', 'earli', '\\', 'and']


In [6]:
def load_split_docs(split: str, col: int = 1, drop_punct: bool = False):
    """
    Load every document of one corpus split.

    Args:
        split: sub-directory of BASE_CORPUS_DIR to read, 'train' or 'test'.
        col: token column, forwarded to tokens_from_tsv.
        drop_punct: forwarded to tokens_from_tsv.

    Returns:
        (doc_ids, docs, labels) - three parallel lists:
          doc_ids: document identifiers (file names without .tsv)
          docs:    one list of tokens per document
          labels:  name of the directory the file was found in, as a string
                   (e.g. '1')

    Files that yield no tokens are left out of all three lists.
    """
    records = []
    label_dirs = (p for p in sorted((BASE_CORPUS_DIR / split).iterdir()) if p.is_dir())
    for label_dir in label_dirs:
        for tsv_path in sorted(label_dir.glob("*.tsv")):
            doc_tokens = tokens_from_tsv(tsv_path, col=col, drop_punct=drop_punct)
            if doc_tokens:
                records.append((tsv_path.stem, doc_tokens, label_dir.name))

    doc_ids = [rec[0] for rec in records]
    docs = [rec[1] for rec in records]
    labels = [rec[2] for rec in records]
    return doc_ids, docs, labels

# Read both splits using column 1 as the token and keeping punctuation tokens.
train_ids, train_docs, train_labels = load_split_docs(split="train", col=1, drop_punct=False)
test_ids, test_docs, test_labels = load_split_docs(split="test", col=1, drop_punct=False)

# Quick look: corpus sizes plus the first few ids / labels of the train split.
(len(train_docs), len(test_docs), train_ids[:3], train_labels[:3])


(120000, 7600, ['000492', '000493', '000494'], ['1', '1', '1'])

In [7]:
from gensim.models import Word2Vec

EMBEDDING_DIM = 100

# Training settings collected in one place; sg=1 selects the skip-gram architecture.
w2v_kwargs = dict(
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=3,
    workers=4,
    sg=1,
    epochs=10,
)

# Each training "sentence" is a whole pre-tokenized document from the annotated
# corpus (instead of regex-split sentences).
w2v_model = Word2Vec(sentences=train_docs, **w2v_kwargs)


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

In [8]:
import numpy as np

def doc_to_vector(tokens, model: Word2Vec, dim: int):
    """
    Turn a tokenized document into one vector by averaging its word vectors.

    Tokens that are not in ``model.wv`` are skipped. If no token is known,
    a float32 zero vector of length ``dim`` is returned.
    """
    known = []
    for token in tokens:
        if token in model.wv:
            known.append(model.wv[token])
    if len(known) == 0:
        return np.zeros(dim, dtype="float32")
    return np.mean(known, axis=0)


In [9]:
import pandas as pd
from tqdm import tqdm

# Vectorize every test document and pair each vector with its document id.
id_doc_pairs = tqdm(zip(test_ids, test_docs), total=len(test_docs), desc="Vectorize test")
id_vec_rows = [
    (doc_id, doc_to_vector(toks, w2v_model, EMBEDDING_DIM))
    for doc_id, toks in id_doc_pairs
]

doc_ids = [doc_id for doc_id, _ in id_vec_rows]
embeddings = np.vstack([vec for _, vec in id_vec_rows])

# One row per document: doc_id first, then the embedding components.
result_df = pd.DataFrame(embeddings)
result_df.insert(0, "doc_id", doc_ids)

# Written as headerless, index-free TSV.
OUTPUT_PATH = "test_embeddings_w2v.tsv"
result_df.to_csv(OUTPUT_PATH, sep="\t", index=False, header=False)

OUTPUT_PATH


Vectorize test: 100%|████████████████| 7600/7600 [00:00<00:00, 19088.91it/s]


'test_embeddings_w2v.tsv'

In [14]:
import pandas as pd

TRAIN_PATH = "projects/ms-pynko/assets/dataset/news_train.csv"
TEST_PATH  = "projects/ms-pynko/assets/dataset/news_test.csv"

# header=None: the first row is read as data, so the column names are set by hand.
train_df = pd.read_csv(TRAIN_PATH, sep=",", header=None, names=["label", "title", "text"])

# NOTE(review): the test frame gets two column names while train gets three -
# confirm that the test CSV really has no label column.
test_df = pd.read_csv(TEST_PATH, sep=",", header=None, names=["title", "text"])

(train_df.head(), train_df.columns)


(   label                                              title  \
 0      3  Wall St. Bears Claw Back Into the Black (Reuters)   
 1      3  Carlyle Looks Toward Commercial Aerospace (Reu...   
 2      3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
 3      3  Iraq Halts Oil Exports from Main Southern Pipe...   
 4      3  Oil prices soar to all-time record, posing new...   
 
                                                 text  
 0  Reuters - Short-sellers, Wall Street's dwindli...  
 1  Reuters - Private investment firm Carlyle Grou...  
 2  Reuters - Soaring crude prices plus worries\ab...  
 3  Reuters - Authorities have halted oil export\f...  
 4  AFP - Tearaway world oil prices, toppling reco...  ,
 Index(['label', 'title', 'text'], dtype='object'))

In [15]:
import re
import numpy as np

TEXT_COL = "text"   # name of the column that holds the text

# A token is a run of Latin or Cyrillic letters; everything else
# (digits, punctuation, apostrophes, whitespace) acts as a separator.
TOKEN_RE = re.compile(r"[a-zA-Zа-яА-ЯёЁ]+", re.UNICODE)

def text_to_tokens(text: str):
    """
    Lower-case ``text`` and return the list of letter-only tokens found in it.

    Non-string input is converted with ``str()``. Missing values (``None`` or a
    float NaN, which is what pandas yields for an empty cell) give an empty
    list instead of the spurious tokens "none" / "nan".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return TOKEN_RE.findall(str(text).lower())

# Smoke test: tokenize the text of the first training row and show the first tokens.
example_text = train_df[TEXT_COL].iloc[0]
example_tokens = text_to_tokens(example_text)
example_tokens[:20]


['reuters',
 'short',
 'sellers',
 'wall',
 'street',
 's',
 'dwindling',
 'band',
 'of',
 'ultra',
 'cynics',
 'are',
 'seeing',
 'green',
 'again']

In [16]:
def split_to_sentences(text: str):
    """
    Roughly split ``text`` into sentences on runs of ``.``, ``!``, ``?`` and ``…``.

    Returns the non-empty pieces with surrounding whitespace removed. The split
    is purely character based, so abbreviations and decimal numbers are cut too.
    Missing values (``None`` or a float NaN from pandas) give an empty list
    instead of the bogus sentence "None" / "nan".
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    pieces = (piece.strip() for piece in re.split(r"[.!?…]+", str(text)))
    return [piece for piece in pieces if piece]

from tqdm import tqdm

# Training corpus for Word2Vec: one token list per sentence of every training
# text; sentences that produce no tokens are dropped.
sentences_tokens = []
for text in tqdm(train_df[TEXT_COL], desc="Подготовка предложений"):
    tokenized = map(text_to_tokens, split_to_sentences(text))
    sentences_tokens.extend(toks for toks in tokenized if toks)

len(sentences_tokens), sentences_tokens[0][:10]


Подготовка предложений: 100%|███████| 120000/120000 [00:00<00:00, 158673.90it/s]


(214647,
 ['reuters',
  'short',
  'sellers',
  'wall',
  'street',
  's',
  'dwindling',
  'band',
  'of',
  'ultra'])

In [17]:
from gensim.models import Word2Vec

EMBEDDING_DIM = 100

# Training settings collected in one place; sg=1 selects the skip-gram architecture.
w2v_kwargs = dict(
    vector_size=EMBEDDING_DIM,
    window=5,
    min_count=3,
    workers=4,
    sg=1,
    epochs=10,
)

# This rebinds w2v_model: from here on it is the model trained on the
# regex-tokenized sentences of the CSV texts.
w2v_model = Word2Vec(sentences=sentences_tokens, **w2v_kwargs)

# Peek at the first ten vocabulary entries.
list(w2v_model.wv.key_to_index)[:10]


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


['the', 'a', 'to', 'of', 'in', 'and', 's', 'on', 'for', 'that']

In [18]:
def get_word_vector(word: str, model: Word2Vec, dim: int):
    """
    Look up the embedding of ``word`` in ``model.wv``.

    Words that are not in the vocabulary are mapped to a float32 zero vector
    of length ``dim``.
    """
    if word not in model.wv:
        return np.zeros(dim, dtype="float32")
    return model.wv[word]

def cosine_distance(vec1: np.ndarray, vec2: np.ndarray):
    """
    Cosine distance ``1 - cos(vec1, vec2)``.

    Returns a value in [0, 2]: 0 for vectors pointing the same way, 1 for
    orthogonal vectors, 2 for opposite ones. If either vector has zero norm
    the distance is defined as 1.0.
    """
    den = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if den == 0:
        return 1.0
    cos = np.dot(vec1, vec2) / den
    # Rounding can push the ratio slightly outside [-1, 1], which would give a
    # small negative distance for (near-)parallel vectors; clip it back.
    return 1.0 - np.clip(cos, -1.0, 1.0)


In [19]:
def demo_word_groups(anchor, similar_words, same_domain_words, different_words, model, dim):
    """
    Print the cosine distance from ``anchor`` to every word of three groups.

    The groups are printed in a fixed order: similar in meaning, same subject
    area, distant in meaning. Vectors come from get_word_vector, so a word
    missing from the vocabulary is represented by a zero vector, for which
    cosine_distance returns 1.0.
    """
    def anchor_distance(word):
        return cosine_distance(
            get_word_vector(anchor, model, dim),
            get_word_vector(word, model, dim),
        )

    # Insertion order of the dict is the print order.
    sections = {
        "Похожие по смыслу": similar_words,
        "Из той же предметной области": same_domain_words,
        "Далёкие по смыслу": different_words,
    }

    print(f"Якорное слово: {anchor}\n")
    for title, words in sections.items():
        print(title + ":")
        for word in words:
            print(f"  {word:<20} dist = {anchor_distance(word):.4f}")
        print()

# Example run; the anchor and the word lists are only an illustration,
# change them as needed.
demo_word_groups(
    model=w2v_model,
    dim=EMBEDDING_DIM,
    anchor="oil",
    similar_words=["crude", "price"],
    same_domain_words=["stock", "market"],
    different_words=["cat", "phone"],
)


Якорное слово: oil

Похожие по смыслу:
  crude                dist = 0.1553
  price                dist = 0.5437

Из той же предметной области:
  stock                dist = 0.5449
  market               dist = 0.6982

Далёкие по смыслу:
  cat                  dist = 0.8120
  phone                dist = 0.7263



In [20]:
def sentence_to_vector(sentence: str, model: Word2Vec, dim: int):
    """
    Embed one sentence as the mean of the vectors of its known tokens.

    The sentence is tokenized with text_to_tokens. Tokens that are not in
    ``model.wv`` are skipped rather than averaged in as zero vectors, so
    unknown words no longer shrink the sentence vector (the same rule
    doc_to_vector uses).

    Returns:
        An array of length ``dim``; a float32 zero vector when the sentence
        has no token known to the model.
    """
    known_vecs = [model.wv[tok] for tok in text_to_tokens(sentence) if tok in model.wv]
    if not known_vecs:
        return np.zeros(dim, dtype="float32")
    return np.mean(known_vecs, axis=0)

def text_to_document_vector(text: str, model: Word2Vec, dim: int):
    """
    Embed a whole text as the mean of its sentence vectors.

    The text is cut with split_to_sentences and each piece is embedded with
    sentence_to_vector. Sentence vectors of zero norm are left out of the
    average; if none remain, a float32 zero vector of length ``dim`` is returned.
    """
    candidate_vecs = (sentence_to_vector(s, model, dim) for s in split_to_sentences(text))
    kept = [v for v in candidate_vecs if np.linalg.norm(v) > 0]
    if len(kept) == 0:
        return np.zeros(dim, dtype="float32")
    return np.mean(kept, axis=0)

# Smoke test: embed the first training text and inspect the shape and leading values.
first_text = train_df[TEXT_COL].iloc[0]
doc_vec = text_to_document_vector(first_text, w2v_model, EMBEDDING_DIM)
(doc_vec.shape, doc_vec[:10])


((100,),
 array([ 0.02629399,  0.00785776, -0.06658947,  0.183035  , -0.24346587,
        -0.27846712,  0.06121453,  0.25208753, -0.17313185, -0.20860425],
       dtype=float32))

In [21]:
# Embed every test text; the row position (as a string) serves as the document id
# (use f"{idx:03d}" instead if leading zeros are needed).
indexed_texts = tqdm(enumerate(test_df["text"]), total=len(test_df), desc="Векторизация теста")
id_vec_rows = [
    (str(idx), text_to_document_vector(text, w2v_model, EMBEDDING_DIM))
    for idx, text in indexed_texts
]

doc_ids = [doc_id for doc_id, _ in id_vec_rows]
embeddings = np.vstack([vec for _, vec in id_vec_rows])

# One row per document: doc_id first, then the embedding components.
result_df = pd.DataFrame(embeddings)
result_df.insert(0, "doc_id", doc_ids)

# NOTE(review): this is the same file name the earlier corpus-based export cell
# writes to, so that file is overwritten here - confirm this is intended.
OUTPUT_PATH = "test_embeddings_w2v.tsv"
result_df.to_csv(OUTPUT_PATH, sep="\t", index=False, header=False)

OUTPUT_PATH


Векторизация теста: 100%|████████████████| 7600/7600 [00:00<00:00, 19472.90it/s]


'test_embeddings_w2v.tsv'