In [12]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec

# Load the training data; header=None means columns are addressed by position.
df = pd.read_csv('train.csv', header=None)

# Column 2 is used as the document text throughout this notebook.
# Missing values are replaced with '' rather than dropped: dropping rows would
# shift every later position in `texts`, and that position is what gets written
# out as the document id further down. An empty string is harmless here because
# empty token lists are filtered out before training and an empty document is
# vectorised as the all-zero fallback vector.
texts = df.iloc[:, 2].fillna('').astype(str).tolist()

def preprocess(text):
    """Turn raw text into a list of lower-case tokens.

    Every character that is neither a word character nor whitespace is
    replaced by a space, runs of whitespace are collapsed, and the result is
    split on whitespace.

    Args:
        text: The value to tokenise.

    Returns:
        A list of token strings; an empty list when ``text`` is not a ``str``
        or contains no word characters.
    """
    if not isinstance(text, str):
        return []

    without_punctuation = re.sub(r'[^\w\s]', ' ', text.lower())
    single_spaced = re.sub(r'\s+', ' ', without_punctuation).strip()
    return single_spaced.split()

# Tokenise every document and keep only the non-empty token lists.
corpus = list(filter(None, map(preprocess, texts)))

# Word2Vec hyper-parameters, gathered in one place so they are easy to tune.
word2vec_params = {
    'vector_size': 100,
    'window': 5,
    'min_count': 1,
    'workers': 4,
    'epochs': 10,
}

# Train the embedding model on the tokenised corpus.
model = Word2Vec(sentences=corpus, **word2vec_params)

In [21]:
# Probe sets for inspecting the embeddings: each entry names a base token and
# three groups of candidate tokens to compare it against.
token_sets = [
    dict(
        base_token='football',
        similar=['soccer', 'baseball', 'hockey'],
        same_domain=['player', 'team', 'game'],
        different=['painting', 'chemistry', 'universe'],
    ),
    dict(
        base_token='airplane',
        similar=['aircraft', 'jet', 'plane'],
        same_domain=['airport', 'flight', 'pilot'],
        different=['mathematics', 'cat', 'fish'],
    ),
    dict(
        base_token='physics',
        similar=['science', 'chemistry', 'biology'],
        same_domain=['experiment', 'research', 'theory'],
        different=['music', 'dance', 'song'],
    ),
]

In [22]:
from scipy.spatial.distance import cosine

def get_distance(word1, word2):
    """Cosine distance between the embeddings of two words.

    Args:
        word1: First word to look up in ``model.wv``.
        word2: Second word to look up in ``model.wv``.

    Returns:
        The value of ``cosine`` applied to the two word vectors, or ``None``
        when either word is not present in ``model.wv``.
    """
    if word1 not in model.wv or word2 not in model.wv:
        return None
    first_vector = model.wv[word1]
    second_vector = model.wv[word2]
    return cosine(first_vector, second_vector)

# (field in a token set, label printed next to its tokens), in processing order.
category_labels = (
    ('similar', 'похожий'),
    ('same_domain', 'та же область'),
    ('different', 'разные'),
)

for token_set in token_sets:
    base = token_set['base_token']

    # Nothing to rank when the base token itself is absent from the model.
    if base not in model.wv:
        continue

    # Collect (token, label, distance) for every candidate that has a vector.
    ranked = []
    for field, label in category_labels:
        for candidate in token_set[field]:
            candidate_distance = get_distance(base, candidate)
            if candidate_distance is not None:
                ranked.append((candidate, label, candidate_distance))

    # Closest first; the sort is stable, so ties keep their collection order.
    ranked.sort(key=lambda entry: entry[2])

    print(f"Базовый токен: '{base}'")
    print("Ранжированный список по косинусному расстоянию:")
    for i, (token, category, dist) in enumerate(ranked):
        print(f"  {i+1}. '{token}' [{category}] -> {dist:.4f}")
    print()

Базовый токен: 'football'
Ранжированный список по косинусному расстоянию:
  1. 'hockey' [похожий] -> 0.3408
  2. 'soccer' [похожий] -> 0.3504
  3. 'baseball' [похожий] -> 0.4138
  4. 'team' [та же область] -> 0.5268
  5. 'player' [та же область] -> 0.5351
  6. 'game' [та же область] -> 0.5628
  7. 'universe' [разные] -> 0.8251
  8. 'chemistry' [разные] -> 0.8639
  9. 'painting' [разные] -> 0.8842

Базовый токен: 'airplane'
Ранжированный список по косинусному расстоянию:
  1. 'aircraft' [похожий] -> 0.3949
  2. 'jet' [похожий] -> 0.4619
  3. 'plane' [похожий] -> 0.4785
  4. 'flight' [та же область] -> 0.6951
  5. 'pilot' [та же область] -> 0.7306
  6. 'airport' [та же область] -> 0.7383
  7. 'fish' [разные] -> 0.7554
  8. 'cat' [разные] -> 0.8957
  9. 'mathematics' [разные] -> 0.9603

Базовый токен: 'physics'
Ранжированный список по косинусному расстоянию:
  1. 'science' [похожий] -> 0.3290
  2. 'chemistry' [похожий] -> 0.4235
  3. 'biology' [похожий] -> 0.4628
  4. 'research' [та же об

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the 'punkt_tab' NLTK resource before sent_tokenize is used below
# (presumably the data sent_tokenize needs — confirm for the installed NLTK version).
nltk.download('punkt_tab')

def vectorize_document(text, vector_size=100):
    """Embed a document as the mean of its sentence vectors.

    The text is split into sentences with ``sent_tokenize``. Each sentence is
    tokenised with ``preprocess`` and represented by the element-wise mean of
    the vectors of those tokens that are present in ``model.wv``. Sentences
    with no such tokens contribute nothing.

    Args:
        text: Document text to embed.
        vector_size: Length of the all-zero vector returned as a fallback.

    Returns:
        The element-wise mean of the sentence vectors, or
        ``np.zeros(vector_size)`` when no sentence produced a vector.
    """
    sentence_means = []

    for sentence in sent_tokenize(text):
        known_vectors = [
            model.wv[word]
            for word in preprocess(sentence)
            if word in model.wv
        ]
        if known_vectors:
            sentence_means.append(np.mean(known_vectors, axis=0))

    if not sentence_means:
        return np.zeros(vector_size)
    return np.mean(sentence_means, axis=0)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lev\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
# Load the test data; header=None means columns are addressed by position.
df_test = pd.read_csv('test.csv', header=None)

# Column 2 is used as the document text. Missing values are replaced with ''
# rather than dropped: dropping rows would shift every later position in
# `test_texts`, and that position is what gets written out as the document id
# when the test vectors are saved. An empty document is vectorised as the
# all-zero fallback vector.
test_texts = df_test.iloc[:, 2].fillna('').astype(str).tolist()

In [17]:
# One line per training document: 1-based id, then the vector components
# formatted to six decimals, all tab-separated.
filename = 'train_data.tsv'
with open(filename, mode='w', encoding='utf-8') as f:
    for doc_id, text in enumerate(texts):
        vector = vectorize_document(text)
        vector_str = "\t".join(f"{val:.6f}" for val in vector)
        f.write(f"{doc_id+1}\t{vector_str}\n")

In [18]:
# One line per test document: 1-based id, then the vector components
# formatted to six decimals, all tab-separated.
filename = 'test_data.tsv'
with open(filename, mode='w', encoding='utf-8') as f:
    for doc_id, text in enumerate(test_texts):
        vector = vectorize_document(text)
        vector_str = "\t".join(f"{val:.6f}" for val in vector)
        f.write(f"{doc_id+1}\t{vector_str}\n")