# Word2Vec (Skipgram)

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib
import matplotlib.pyplot as plt
import time
import nltk
from nltk.corpus import reuters
import json 
import string

# Make sure the 'punkt' tokenizer data is available locally; preprocess()
# below calls nltk.word_tokenize, which needs NLTK tokenizer data.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

class Skipgram(nn.Module):
    """Skip-gram word2vec model trained with a full-softmax objective.

    Holds two (voc_size, emb_size) embedding tables: ``embedding_center`` is
    used for a word in the center position, ``embedding_outside`` for a word
    in the outside (context) position and for the softmax normaliser.
    """

    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)

    def forward(self, center, outside, all_vocabs):
        """Return the mean negative log-likelihood of ``outside`` given ``center``.

        Args:
            center: LongTensor of center-word indices, (batch_size, 1).
            outside: LongTensor of outside-word indices, (batch_size, 1).
            all_vocabs: LongTensor of the indices to normalise over,
                (batch_size, voc_size).

        Returns:
            Scalar tensor: -mean(log softmax score of the outside word).
        """
        center_embedding = self.embedding_center(center)  # (batch_size, 1, emb_size)
        # Outside words and the normaliser must come from the *outside* table.
        # Looking them up in embedding_center (as before) left
        # embedding_outside completely unused and therefore untrained.
        outside_embedding    = self.embedding_outside(outside)     # (batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_outside(all_vocabs)  # (batch_size, voc_size, emb_size)

        center_t = center_embedding.transpose(1, 2)  # (batch_size, emb_size, 1)

        # (batch_size, 1, emb_size) @ (batch_size, emb_size, 1) -> (batch_size, 1)
        outside_score = outside_embedding.bmm(center_t).squeeze(2)
        # (batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) -> (batch_size, voc_size)
        all_scores = all_vocabs_embedding.bmm(center_t).squeeze(2)

        # log(exp(s_o) / sum_w exp(s_w)) == s_o - logsumexp(s): working in log
        # space avoids the overflow (inf / inf -> NaN) of exponentiating first.
        # keepdim=True keeps the normaliser at (batch_size, 1) so it lines up
        # row-by-row with outside_score instead of broadcasting to
        # (batch_size, batch_size).
        log_normaliser = torch.logsumexp(all_scores, dim=1, keepdim=True)

        return -torch.mean(outside_score - log_normaliser)  # scalar
        
# Locations of the saved vocabulary maps, model weights, model config and
# corpus for the full-softmax skip-gram model.
word2index_path = './config_model_files/word2index.json'
index2word_path = './config_model_files/index2word.json'
model_path = './config_model_files/word2vec_model.pth'
config_path = './config_model_files/word2vec_config.json'
corpus_path = './config_model_files/corpus.txt'

# Open the JSON files as UTF-8 explicitly (as load_corpus does for the corpus)
# so the platform default encoding cannot garble non-ASCII vocabulary entries.
with open(word2index_path, 'r', encoding='utf-8') as file:
    word2index = json.load(file)  # token -> embedding row index

with open(index2word_path, 'r', encoding='utf-8') as file:
    # JSON object keys are always strings, so this mapping is keyed by the
    # index rendered as str (e.g. "12"), not by int.
    index2word = json.load(file)

# Load the model's configuration from a JSON file
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Retrieve the configuration values
voc_size = config['voc_size']  # Vocabulary size
emb_size = config['emb_size']  # Embedding size

# Build an untrained model with the saved dimensions so the stored weights fit
loaded_model = Skipgram(voc_size, emb_size)

# map_location='cpu' lets a checkpoint that was saved from a GPU be read on a
# CPU-only machine; the model itself is constructed on the CPU either way.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
loaded_model.load_state_dict(torch.load(model_path, map_location='cpu'))

# Set the model to evaluation mode (useful for inference)
loaded_model.eval()

# Confirm successful model loading
print("Model loaded successfully")


# Path of the corpus text file read by load_corpus
file_path = corpus_path  # Replace with your actual file path

def load_corpus(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Each line has leading and trailing whitespace removed; lines that are
    empty after trimming are dropped. Order follows the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        trimmed = (raw.strip() for raw in handle)
        return [entry for entry in trimmed if entry]

# Load the corpus: the list of passages that queries are ranked against
corpus = load_corpus(file_path)

def preprocess(text):
    """Lowercase ``text`` and split it into tokens with NLTK's word tokenizer."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

def get_embedding(text, model, word2index):
    """
    Converts a text input to its corresponding average embedding.

    Every token's vector is the mean of its center and outside embeddings;
    the text vector is the mean of those token vectors.

    Args:
        text: Raw string; tokenized with ``preprocess``.
        model: Module exposing ``embedding_center`` and ``embedding_outside``
            embedding tables.
        word2index: Mapping token -> embedding row index. Its '<UNK>' entry,
            when present, stands in for out-of-vocabulary tokens.

    Returns:
        1-D numpy array of length emb_size. All zeros when no token of the
        text can be mapped to an index.
    """
    unk_index = word2index.get('<UNK>')
    indices = [word2index.get(token, unk_index) for token in preprocess(text)]
    # Without an '<UNK>' entry an unseen token has no row to look up; drop it
    # rather than handing None to torch (which raises TypeError).
    indices = [index for index in indices if index is not None]

    if not indices:
        return np.zeros(model.embedding_center.weight.shape[1])

    # One batched lookup instead of one tensor per token; no_grad because
    # this is inference only and no autograd graph is needed.
    index_tensor = torch.LongTensor(indices)
    with torch.no_grad():
        token_vectors = (
            model.embedding_center(index_tensor)
            + model.embedding_outside(index_tensor)
        ) / 2  # (n_tokens, emb_size)

    return token_vectors.numpy().mean(axis=0)

def retrieve_top_passages(query, corpus, model, word2index, top_n=10):
    """
    Rank corpus passages by dot-product similarity to the query.

    The query and every passage are embedded with ``get_embedding``; the
    score of a passage is the dot product of the two vectors.

    Returns:
        List of (passage, score * 100) tuples for the ``top_n`` highest
        scoring passages, best first.
    """
    query_vec = get_embedding(query, model, word2index)
    scores = [
        np.dot(query_vec, get_embedding(passage, model, word2index))
        for passage in corpus
    ]

    # sorted() is stable, also with reverse=True, so passages with equal
    # scores stay in corpus order.
    ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    return [(corpus[position], score * 100) for position, score in ranking[:top_n]]

# Example usage with the retrieve_top_passages function
query = "taiwan had a trade trade surplus of how much billion dlrs last year .".lower()
print(query)
top_passages = retrieve_top_passages(query, corpus, loaded_model, word2index, top_n=10)
# enumerate(start=1) supplies the 1-based rank without a hand-maintained counter
for i, (passage, score) in enumerate(top_passages, start=1):
    print(i, passage, score)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shafisourov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model loaded successfully
taiwan had a trade trade surplus of how much billion dlrs last year .
1 the paper gave no further details . 4.435082897543907
2 no decisions are likely until after indonesia ' s elections on april 23 , traders said . 2.5840189307928085
3 much more serious for hong kong is the disadvantage of action restraining trade , '' he said . 1.8180981278419495
4 nainggolan said that the exchange was trying to boost overseas interest by building up contacts with end - users . 1.775369793176651
5 but other businessmen said such a short - term commercial advantage would be outweighed by further u . s . pressure to block imports . 1.6946449875831604
6 `` we are aware of the seriousness of the u . s . 1.6228081658482552
7 the analysts agreed the bank was aggressive . 1.6163814812898636
8 miti is planning to work out a revised energy supply / demand outlook through deliberations of committee meetings of the agency of natural resources and energy , the officials said . 1.510283

In [7]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib
import matplotlib.pyplot as plt
import time
import nltk
from nltk.corpus import reuters
import json 
import string

# Make sure the 'punkt' tokenizer data is available locally; preprocess()
# below calls nltk.word_tokenize, which needs NLTK tokenizer data.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

class SkipgramNeg(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two (voc_size, emb_size) embedding tables: ``embedding_center`` for
    center words and ``embedding_outside`` for outside (context) words, which
    is also used for the sampled negative words.
    """

    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()

    def forward(self, center, outside, negative):
        """Return the mean negative-sampling loss for a batch.

        Per example the objective is
            log sigmoid(u_o . v_c) + sum_k log sigmoid(-u_k . v_c)
        and the loss is minus its batch mean.

        Args:
            center: LongTensor of center-word indices, (bs, 1).
            outside: LongTensor of true outside-word indices, (bs, 1).
            negative: LongTensor of sampled negative-word indices, (bs, k).

        Returns:
            Scalar tensor with the loss.
        """
        center_embed   = self.embedding_center(center)     # (bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside)   # (bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative)  # (bs, k, emb_size)

        center_t = center_embed.transpose(1, 2)            # (bs, emb_size, 1)

        uovc = outside_embed.bmm(center_t).squeeze(2)      # (bs, 1)
        ukvc = -negative_embed.bmm(center_t).squeeze(2)    # (bs, k)

        positive_term = self.logsigmoid(uovc)              # (bs, 1)
        # Apply log-sigmoid to each negative score *before* summing over k.
        # Summing the raw scores first (as before) lets a strongly negative
        # sample cancel a strongly positive one and is not the
        # negative-sampling objective.
        negative_term = torch.sum(self.logsigmoid(ukvc), 1).reshape(-1, 1)  # (bs, 1)

        loss = positive_term + negative_term

        return -torch.mean(loss)
        
# Locations of the saved vocabulary maps, model weights, model config and
# corpus for the negative-sampling skip-gram model.
word2index_path = './config_model_files/word2index_neg_sam.json'
index2word_path = './config_model_files/index2word_neg_sam.json'
model_path = './config_model_files/word2vec_model_neg_sam.pth'
config_path = './config_model_files/word2vec_config_neg_sam.json'
corpus_path = './config_model_files/corpus_neg_sam.txt'

# Open the JSON files as UTF-8 explicitly (as load_corpus does for the corpus)
# so the platform default encoding cannot garble non-ASCII vocabulary entries.
with open(word2index_path, 'r', encoding='utf-8') as file:
    word2index = json.load(file)  # token -> embedding row index

with open(index2word_path, 'r', encoding='utf-8') as file:
    # JSON object keys are always strings, so this mapping is keyed by the
    # index rendered as str (e.g. "12"), not by int.
    index2word = json.load(file)

# Load the model's configuration from a JSON file
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Retrieve the configuration values
voc_size = config['voc_size']  # Vocabulary size
emb_size = config['emb_size']  # Embedding size

# Build an untrained model with the saved dimensions so the stored weights fit
loaded_model = SkipgramNeg(voc_size, emb_size)

# map_location='cpu' lets a checkpoint that was saved from a GPU be read on a
# CPU-only machine; the model itself is constructed on the CPU either way.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
loaded_model.load_state_dict(torch.load(model_path, map_location='cpu'))

# Set the model to evaluation mode (useful for inference)
loaded_model.eval()

# Confirm successful model loading
print("Model loaded successfully")


# Path of the corpus text file read by load_corpus
file_path = corpus_path  # Replace with your actual file path

def load_corpus(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Each line has leading and trailing whitespace removed; lines that are
    empty after trimming are dropped. Order follows the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        trimmed = (raw.strip() for raw in handle)
        return [entry for entry in trimmed if entry]

# Load the corpus: the list of passages that queries are ranked against
corpus = load_corpus(file_path)

def preprocess(text):
    """Lowercase ``text`` and split it into tokens with NLTK's word tokenizer."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

def get_embedding(text, model, word2index):
    """
    Converts a text input to its corresponding average embedding.

    Every token's vector is the mean of its center and outside embeddings;
    the text vector is the mean of those token vectors.

    Args:
        text: Raw string; tokenized with ``preprocess``.
        model: Module exposing ``embedding_center`` and ``embedding_outside``
            embedding tables.
        word2index: Mapping token -> embedding row index. Its '<UNK>' entry,
            when present, stands in for out-of-vocabulary tokens.

    Returns:
        1-D numpy array of length emb_size. All zeros when no token of the
        text can be mapped to an index.
    """
    unk_index = word2index.get('<UNK>')
    indices = [word2index.get(token, unk_index) for token in preprocess(text)]
    # Without an '<UNK>' entry an unseen token has no row to look up; drop it
    # rather than handing None to torch (which raises TypeError).
    indices = [index for index in indices if index is not None]

    if not indices:
        return np.zeros(model.embedding_center.weight.shape[1])

    # One batched lookup instead of one tensor per token; no_grad because
    # this is inference only and no autograd graph is needed.
    index_tensor = torch.LongTensor(indices)
    with torch.no_grad():
        token_vectors = (
            model.embedding_center(index_tensor)
            + model.embedding_outside(index_tensor)
        ) / 2  # (n_tokens, emb_size)

    return token_vectors.numpy().mean(axis=0)

def retrieve_top_passages(query, corpus, model, word2index, top_n=10):
    """
    Rank corpus passages by dot-product similarity to the query.

    The query and every passage are embedded with ``get_embedding``; the
    score of a passage is the dot product of the two vectors.

    Returns:
        List of (passage, score * 100) tuples for the ``top_n`` highest
        scoring passages, best first.
    """
    query_vec = get_embedding(query, model, word2index)
    scores = [
        np.dot(query_vec, get_embedding(passage, model, word2index))
        for passage in corpus
    ]

    # sorted() is stable, also with reverse=True, so passages with equal
    # scores stay in corpus order.
    ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    return [(corpus[position], score * 100) for position, score in ranking[:top_n]]

# Example usage with the retrieve_top_passages function
query = "taiwan had a trade trade surplus of how much billion dlrs last year .".lower()
print(query)
top_passages = retrieve_top_passages(query, corpus, loaded_model, word2index, top_n=10)
# enumerate(start=1) supplies the 1-based rank without a hand-maintained counter
for i, (passage, score) in enumerate(top_passages, start=1):
    print(i, passage, score)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shafisourov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model loaded successfully
taiwan had a trade trade surplus of how much billion dlrs last year .
1 officials say the infant exchange has made a good start although trading in coffee has been disappointing . 5.8266885578632355
2 the fledgling exchange currently trades coffee and rubber physicals on an open outcry system four days a week . 5.520188808441162
3 physical rubber trading was launched in 1985 , with coffee added in january 1986 . 5.410360544919968
4 annual ore capacity will be about 750 , 000 tonnes . 4.828231409192085
5 the trade ministry and exchange board are considering the introduction of futures trading later for rubber , but one official said a feasibility study was needed first . 4.7112710773944855
6 he said teams had already been to south korea and taiwan to encourage direct use of the exchange , while a delegation would also visit europe , mexico and some latin american states to encourage participation . 4.592230170965195
7 trading in either crude palm oil ( cpo ) or

In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib
import matplotlib.pyplot as plt
import time
import nltk
from nltk.corpus import reuters
import json 
import string

# Make sure the 'punkt' tokenizer data is available locally; preprocess()
# below calls nltk.word_tokenize, which needs NLTK tokenizer data.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

class Glove(nn.Module):
    """GloVe model: two embedding tables plus one scalar bias per word and role.

    ``center_embedding`` / ``center_bias`` are looked up for center words,
    ``outside_embedding`` / ``outside_bias`` for outside (context) words.
    """

    def __init__(self, voc_size, emb_size):
        super().__init__()
        # Vectors: one (voc_size, emb_size) table per role
        self.center_embedding = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)

        # Biases: one scalar per word and role, stored as (voc_size, 1) tables
        self.center_bias = nn.Embedding(voc_size, 1)
        self.outside_bias = nn.Embedding(voc_size, 1)

    def forward(self, center, outside, coocs, weighting):
        """Return the summed weighted squared error for a batch of word pairs.

        Args:
            center: LongTensor of center-word indices, (batch_size, 1).
            outside: LongTensor of outside-word indices, (batch_size, 1).
            coocs: Target value per pair. It is subtracted from the prediction
                as-is, so it should already be the log co-occurrence value.
            weighting: Per-pair weight multiplied onto the squared error.

        Returns:
            Scalar tensor: sum over the batch of
            weighting * (u_o . v_c + b_c + b_o - coocs) ** 2.
        """
        word_vecs = self.center_embedding(center)        # (batch_size, 1, emb_size)
        context_vecs = self.outside_embedding(outside)   # (batch_size, 1, emb_size)

        # (batch_size, 1, 1) -> (batch_size, 1)
        word_bias = self.center_bias(center).squeeze(1)
        context_bias = self.outside_bias(outside).squeeze(1)

        # Batched dot product between each outside vector and its center vector
        dot = torch.bmm(context_vecs, word_vecs.transpose(1, 2)).squeeze(2)

        # Prediction (dot product plus both biases) minus the target
        residual = dot + word_bias + context_bias - coocs
        weighted_sq = weighting * residual.pow(2)

        return weighted_sq.sum()

        
# Locations of the saved vocabulary maps, model weights, model config and
# corpus for the from-scratch GloVe model.
word2index_path = './config_model_files/word2index_Glove_Scratch.json'
index2word_path = './config_model_files/index2word_Glove_Scratch.json'
model_path = './config_model_files/word2vec_model_Glove_Scratch.pth'
config_path = './config_model_files/word2vec_config_Glove_Scratch.json'
corpus_path = './config_model_files/corpus_Glove_Scratch.txt'

# Open the JSON files as UTF-8 explicitly (as load_corpus does for the corpus)
# so the platform default encoding cannot garble non-ASCII vocabulary entries.
with open(word2index_path, 'r', encoding='utf-8') as file:
    word2index = json.load(file)  # token -> embedding row index

with open(index2word_path, 'r', encoding='utf-8') as file:
    # JSON object keys are always strings, so this mapping is keyed by the
    # index rendered as str (e.g. "12"), not by int.
    index2word = json.load(file)

# Load the model's configuration from a JSON file
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)

# Retrieve the configuration values
voc_size = config['voc_size']  # Vocabulary size
emb_size = config['emb_size']  # Embedding size

# Build an untrained GloVe model with the saved dimensions so the stored
# weights fit
loaded_model = Glove(voc_size, emb_size)

# map_location='cpu' lets a checkpoint that was saved from a GPU be read on a
# CPU-only machine; the model itself is constructed on the CPU either way.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
loaded_model.load_state_dict(torch.load(model_path, map_location='cpu'))

# Set the model to evaluation mode (useful for inference)
loaded_model.eval()

# Confirm successful model loading
print("Model loaded successfully")


# Path of the corpus text file read by load_corpus
file_path = corpus_path  # Replace with your actual file path

def load_corpus(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Each line has leading and trailing whitespace removed; lines that are
    empty after trimming are dropped. Order follows the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        trimmed = (raw.strip() for raw in handle)
        return [entry for entry in trimmed if entry]

# Load the corpus: the list of passages that queries are ranked against
corpus = load_corpus(file_path)

def preprocess(text):
    """Lowercase ``text`` and split it into tokens with NLTK's word tokenizer."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

def get_embedding(text, model, word2index):
    """
    Converts a text input to its corresponding average embedding.

    Every token's vector is the mean of its center and outside embeddings;
    the text vector is the mean of those token vectors.

    Args:
        text: Raw string; tokenized with ``preprocess``.
        model: Module exposing ``center_embedding`` and ``outside_embedding``
            embedding tables.
        word2index: Mapping token -> embedding row index. Its '<UNK>' entry,
            when present, stands in for out-of-vocabulary tokens.

    Returns:
        1-D numpy array of length emb_size. All zeros when no token of the
        text can be mapped to an index.
    """
    unk_index = word2index.get('<UNK>')
    indices = [word2index.get(token, unk_index) for token in preprocess(text)]
    # Without an '<UNK>' entry an unseen token has no row to look up; drop it
    # rather than handing None to torch (which raises TypeError).
    indices = [index for index in indices if index is not None]

    if not indices:
        return np.zeros(model.center_embedding.weight.shape[1])

    # One batched lookup instead of one tensor per token; no_grad because
    # this is inference only and no autograd graph is needed.
    index_tensor = torch.LongTensor(indices)
    with torch.no_grad():
        token_vectors = (
            model.center_embedding(index_tensor)
            + model.outside_embedding(index_tensor)
        ) / 2  # (n_tokens, emb_size)

    return token_vectors.numpy().mean(axis=0)

def retrieve_top_passages(query, corpus, model, word2index, top_n=10):
    """
    Rank corpus passages by dot-product similarity to the query.

    The query and every passage are embedded with ``get_embedding``; the
    score of a passage is the dot product of the two vectors.

    Returns:
        List of (passage, score * 100) tuples for the ``top_n`` highest
        scoring passages, best first.
    """
    query_vec = get_embedding(query, model, word2index)
    scores = [
        np.dot(query_vec, get_embedding(passage, model, word2index))
        for passage in corpus
    ]

    # sorted() is stable, also with reverse=True, so passages with equal
    # scores stay in corpus order.
    ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    return [(corpus[position], score * 100) for position, score in ranking[:top_n]]

# Example usage with the retrieve_top_passages function
# Alternative query to try:
# query = "taiwan had a trade surplus of how much billion dlrs last year"
query = 'what other businessmen said for commercial advantage'
print(query)
top_passages = retrieve_top_passages(query, corpus, loaded_model, word2index, top_n=10)
# enumerate(start=1) supplies the 1-based rank without a hand-maintained counter
for i, (passage, score) in enumerate(top_passages, start=1):
    print(i, passage, score)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shafisourov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Model loaded successfully
what other businessmen said for commercial advantage
1 miti is expected to lower the projection for primary energy supplies in the year 2000 to 550 mln kilolitres ( kl ) from 600 mln , they said . 12.003625929355621
2 but other businessmen said such a short - term commercial advantage would be outweighed by further u . s . pressure to block imports . 7.376070320606232
3 the fledgling exchange currently trades coffee and rubber physicals on an open outcry system four days a week . 7.233703136444092
4 the department said first quarter exports expanded to 60 . 6 billion baht from 56 . 6 billion . 7.018963992595673
5 `` we are aware of the seriousness of the u . s . 6.892098486423492
6 they said they could not say how long the disruption will go on and what effect it will have on shipping movements . 6.390486657619476
7 taiwan had a trade trade surplus of 15 . 6 billion dlrs last year , 95 pct of it with the u . s . 6.325379014015198
8 the country ' s oil import b