# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the Run icon in the gutter.
To debug a cell, press Alt+Shift+Enter, or click the Debug icon in the gutter.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/jupyter-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [5]:
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet2022 as wn22
import nltk

[nltk_data] Downloading package wordnet to /Users/bogdan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bogdan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet2022 to
[nltk_data]     /Users/bogdan/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!


True

In [7]:
# print(len([1 for _ in wn22.all_synsets()]))

### Prepare a partial segment of the WordNet2022 dataset

In [8]:
# Maximum number of synsets collected per part-of-speech tag by get_synset_data.
WORDS_PER_POS = 300
# Length of every embedding vector; passed to WordNetEmbeddings as embedding_dim.
EMBEDDING_DIM = 35

In [9]:
# Accumulator for the sampled synset records; get_synset_data appends one dict per synset.
synset_dataset_partial = []


def get_synset_data(pos):
    """Collect up to ``WORDS_PER_POS`` synsets of one part of speech.

    Each synset yielded by ``wn22.all_synsets(pos=pos)`` is flattened into a
    plain dict (name, lemmas, definition, pos, examples) and appended to the
    module-level ``synset_dataset_partial`` list. Nothing is returned.

    Args:
        pos: Part-of-speech tag forwarded to ``all_synsets``.
    """
    taken = 0
    for synset in wn22.all_synsets(pos=pos):
        # Stop once the per-POS quota has been filled.
        if taken == WORDS_PER_POS:
            break
        synset_dataset_partial.append(
            dict(
                name=synset.name(),
                lemmas=synset.lemma_names(),
                definition=synset.definition(),
                pos=synset.pos(),
                examples=synset.examples(),
            )
        )
        taken += 1


# WordNet part-of-speech tags to sample from; get_synset_data is called once per tag.
# NOTE(review): NLTK's all_synsets(pos="a") may also yield satellite adjectives ("s"),
# so the "a" and "s" passes could collect the same synsets twice — confirm against
# the installed NLTK version.
poss = ["a", "n", "v", "r", "s"]
for p in poss:
    get_synset_data(p)

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [11]:
# Shared lemmatizer instance used by process_text.
lema = WordNetLemmatizer()
# English stop words held in a set; process_text tests each token for membership.
stop_words = set(sw.words('english'))

In [12]:
def process_text(text):
    """Tokenize ``text`` and return its lemmatized content words.

    Tokens that are not purely alphabetic, or that are English stop words
    (compared case-insensitively), are dropped. The remaining tokens are
    lemmatized with the shared ``lema`` lemmatizer, called without a POS
    argument so the lemmatizer's default applies.

    Args:
        text: Raw string, e.g. a synset definition or a usage example.

    Returns:
        list[str]: Lemmatized tokens in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        # NLTK's English stop-word list is lowercase, so the lookup must be
        # case-insensitive; otherwise capitalized stop words such as "The"
        # or "It" slip through the filter.
        if token.isalpha() and token.lower() not in stop_words:
            kept.append(lema.lemmatize(token))
    return kept

In [13]:
# Build the vocabulary and cache the processed token lists on each synset record.
vocabulary = set()
for synset in synset_dataset_partial:
    # Content words of the definition (gloss).
    definition_tokens = process_text(synset['definition'])
    vocabulary.update(definition_tokens)
    # Content words of all usage examples, flattened into a single list.
    examples_tokens = []
    for example in synset['examples']:
        example_tokens = process_text(example)
        examples_tokens.extend(example_tokens)
    # Lemma names are added as-is; they are not passed through process_text.
    vocabulary.update(synset['lemmas'])
    vocabulary.update(examples_tokens)
    # Stored in place on the record so train_model can read them later.
    synset['definition_tokens'] = definition_tokens
    synset['examples_tokens'] = examples_tokens
    

In [14]:
# Assign indices in sorted order: iterating the raw set would give a different
# word -> index mapping on each kernel start because string hashing is randomized.
words_to_idx = {word: idx for idx, word in enumerate(sorted(vocabulary))}
# Reverse lookup from index back to word.
idx_to_words = {idx: word for word, idx in words_to_idx.items()}

In [15]:
# Index of each synset record, keyed by synset name; train_model uses it to
# pick the synset's row in the synset embedding table.
synset_to_idx = {
    record['name']: position
    for position, record in enumerate(synset_dataset_partial)
}
# Inverse mapping: index -> synset name.
idx_to_synset = {position: name for name, position in synset_to_idx.items()}

In [16]:
# Number of distinct vocabulary entries; sizes the word embedding table.
vocab_size = len(vocabulary)
# Number of collected synset records; sizes the synset embedding table.
synset_size = len(synset_dataset_partial)

In [17]:
# Per-part-of-speech multiplier applied to the training loss in train_model.
# Keys are the POS tags stored in each record's 'pos' field; a tag missing from
# this table falls back to 1.0 at lookup time.
CATEGORY_WEIGHTS = {
    "n": 1.0,
    "v": 0.85,
    "a": 0.8,
    "r": 0.65,
    "s": 0.8
}
# Fixed weights for each category

### Test 1: Using torch embedding model

In [18]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [19]:
class WordNetEmbeddings(nn.Module):
    """Two independent lookup tables: one for vocabulary words, one for synsets.

    Both tables produce vectors of the same length, so a word vector can be
    compared directly with a synset vector.
    """

    def __init__(self, vocab_size, synset_size, embedding_dim):
        """Create the word and synset embedding tables.

        Args:
            vocab_size: Number of rows in the word table.
            synset_size: Number of rows in the synset table.
            embedding_dim: Length of every embedding vector.
        """
        super().__init__()
        # Word table first, then synset table: creation order determines which
        # random values each table draws at initialization.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.synset_embeddings = nn.Embedding(
            num_embeddings=synset_size, embedding_dim=embedding_dim
        )

    def forward(self, word_indices, synset_indices):
        """Look up embeddings for the given word and synset indices.

        Args:
            word_indices: Integer tensor of row indices into the word table.
            synset_indices: Integer tensor of row indices into the synset table.

        Returns:
            tuple: ``(word_vectors, synset_vectors)``, each with a trailing
            dimension of ``embedding_dim``.
        """
        return (
            self.word_embeddings(word_indices),
            self.synset_embeddings(synset_indices),
        )


In [20]:
# Number of full passes over the dataset made by train_model.
EPOCHS = 10
# Step size handed to the SGD optimizer.
LEARNING_RATE = 0.01



In [21]:
# Seed torch's RNG so the embedding tables start from the same values on every run.
RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)
model = WordNetEmbeddings(vocab_size, synset_size, EMBEDDING_DIM)
# Cosine-based loss; train_model calls it with a target of 1 for every
# (word vector, synset vector) pair.
criterion = nn.CosineEmbeddingLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=0.9)



In [22]:
def train_model(model, data, epochs, *, loss_fn=None, opt=None,
                word_index=None, synset_index=None, pos_weights=None):
    """Train ``model`` so each synset vector aligns with the words describing it.

    For every entry the definition and example tokens are looked up in the word
    table, the entry's synset is looked up in the synset table, and one
    optimizer step is taken on the loss between each word vector and the
    (broadcast) synset vector, scaled by the entry's part-of-speech weight.
    The summed loss is printed once per epoch.

    Args:
        model: Module whose ``forward(word_indices, synset_indices)`` returns
            ``(word_vecs, synset_vecs)``.
        data: Iterable of dicts with keys ``"name"``, ``"pos"``,
            ``"definition_tokens"`` and ``"examples_tokens"``.
        epochs: Number of passes over ``data``.
        loss_fn: Loss called as ``loss_fn(word_vecs, synset_vecs, target)``.
            Defaults to the notebook-level ``criterion``.
        opt: Optimizer to step. Defaults to the notebook-level ``optimizer``.
        word_index: Mapping token -> row index. Defaults to ``words_to_idx``.
        synset_index: Mapping synset name -> row index. Defaults to
            ``synset_to_idx``.
        pos_weights: Mapping POS tag -> loss multiplier (1.0 when the tag is
            missing). Defaults to ``CATEGORY_WEIGHTS``.

    Returns:
        None
    """
    # Fall back to the notebook-level objects only when no override was given,
    # so the function can also be driven with explicitly supplied dependencies.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt
    word_index = words_to_idx if word_index is None else word_index
    synset_index = synset_to_idx if synset_index is None else synset_index
    pos_weights = CATEGORY_WEIGHTS if pos_weights is None else pos_weights

    for epoch in range(epochs):
        total_loss = 0.0
        for entry in data:
            all_tokens = entry["definition_tokens"] + entry["examples_tokens"]
            token_ids = [word_index[word] for word in all_tokens if word in word_index]
            if not token_ids:
                # No usable tokens: an empty index list would become a float
                # tensor, which nn.Embedding rejects, and there is nothing to
                # align the synset with anyway.
                continue

            word_indices = torch.tensor(token_ids, dtype=torch.long)
            synset_idx = torch.tensor([synset_index[entry["name"]]], dtype=torch.long)
            weight = pos_weights.get(entry["pos"], 1.0)

            word_vecs, synset_vecs = model(word_indices, synset_idx)
            # Target of 1 for every pair: the loss rewards similar directions.
            target = torch.ones(word_vecs.shape[0])
            loss = loss_fn(word_vecs, synset_vecs.expand_as(word_vecs), target) * weight

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss}")


In [None]:
# Run the training loop over the sampled synset records for EPOCHS passes.
train_model(model, synset_dataset_partial, EPOCHS)