In [8]:
from typing import List, Tuple
import spacy
from scipy.spatial.distance import cosine

In [9]:
class ContextEmbedding:

    def __init__(self, window_size: int, vocab_size: int):
        
        self.window_size = window_size
        self.vocab_size = vocab_size
        self.nlp = spacy.load("en_core_web_md")
        self.vocab = []
        self.dictionary = {}
        self.embeddings = {}
        self.similarities = {}

Это функия для предобработки текста: здесь токенизация, лемматизация, удаление знаков препинания и прочего мусора

In [10]:
def preprocess_text(self, text: str) -> List[str]:
        
        doc = self.nlp(text)
        words = []

        for token in doc:
            if token.is_stop or token.is_punct or not token.is_alpha:
                continue

            if token.pos_ not in ['NOUN', 'VERB', 'ADJ', 'ADV']:
                continue
            
            words.append(token.lemma_)

        return words

Здесь мы подготавливаем корпус (строим словарь и матрицы для эмбеддингов)

In [11]:
def preprocess_corpus(self, corpus: List[str]) -> None:
        
        vocab_counts = {}
        for text in corpus:
            words = self.preprocess_text(text)

            for word in words:
                if word in vocab_counts:
                    vocab_counts[word] += 1
                else:
                    vocab_counts[word] = 1

        self.vocab = sorted(vocab_counts.items(), key=lambda x: x[1], reverse=True)[:self.vocab_size]
        self.dictionary = {word[0]: idx for idx, word in enumerate(self.vocab)}
        embeddings = []

        for _ in range(len(self.vocab)):
            embeddings.append([0] * len(self.vocab))

        for text in corpus:
            words = self.preprocess_text(text)

            for idx, word in enumerate(words):
                if word not in self.dictionary:
                    continue

                for j in range(max(0, idx - self.window_size), idx):
                    if j >= len(words) or words[j] not in self.dictionary:
                        continue
                    embeddings[self.dictionary[word]][self.dictionary[words[j]]] += 1

                for j in range(idx + 1, min(len(words), idx + self.window_size + 1)):
                    if words[j] not in self.dictionary:
                        continue
                    embeddings[self.dictionary[word]][self.dictionary[words[j]]] += 1

        self.embeddings = {self.vocab[idx][0]: embedding for idx, embedding in enumerate(embeddings)}
        
        for word1 in self.embeddings:
            for word2 in self.embeddings:
                if word1 not in self.similarities:
                    self.similarities[word1] = []
                dist = cosine(self.embeddings[word1], self.embeddings[word2])
                self.similarities[word1].append((word2, dist))
            self.similarities[word1] = sorted(self.similarities[word1], key=lambda x: x[1])

Загружаем и обрабатываем данные из файлов корпуса

In [12]:
def train(self, corpus_files: List[str]) -> None:

        corpus = []
        for file_path in corpus_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
                corpus.append(text)
                
        self.preprocess_corpus(corpus)

Ищем наиболее похожие слова по контексту слова

In [13]:
def get_similarity(self, word: str, top_n: int) -> List[Tuple[str, float]]:

        if word not in self.similarities:
            return []
        return self.similarities[word][:top_n]

In [14]:
context_embedding = ContextEmbedding(window_size=5, vocab_size=1000)
context_embedding.train()
similar_words = context_embedding.get_similarity('apple', top_n=5)
print(similar_words)

AttributeError: 'ContextEmbedding' object has no attribute 'train'