# Imports

In [None]:
from datasets import load_dataset_builder, load_dataset
import pandas as pd
import csv
import torch
import string
import tqdm


# Load dataset

In [None]:
# ms_marco = load_dataset_builder("ms_marco", 'v1.1')
# Load the "train" split of the ms_marco v1.1 dataset and copy it into a pandas DataFrame.
dataset = load_dataset("ms_marco", 'v1.1', split="train")
df_train = pd.DataFrame(dataset)
# Bare expression: shown as the cell output when run in a notebook.
df_train

# Tokenise

Steps taken:
- pip install sentencepiece
- prepare data in required format (csv, new line per sentence)
- run sentencepiece on corpus, to generate tokens
- run the trained sentencepiece model on the sentences, to convert them to tokens

In [None]:
# Inspect the passage_text entry stored under 'passages' for the row labelled 0.
df_train.loc[0]['passages']['passage_text']

In [None]:
# Dump every passage text to output.csv, one single-column CSV row per passage
with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    # Each entry of the passages column carries a 'passage_text' collection of strings
    for passages in df_train['passages']:
        writer.writerows([text] for text in passages['passage_text'])

In [None]:
# Read the passages back as a one-column DataFrame; no header row is expected, so the column is named explicitly.
sentence_piece_input = pd.read_csv('output.csv', header =None, names = ['sentence'])

In [None]:
# Bare expression: shows the DataFrame as the cell output.
sentence_piece_input

In [None]:
# Draw a random 10,000-passage subset to keep tokenizer training fast.
sentence_piece_input = sentence_piece_input.sample(n = 10000)

# SentencePiece trains on a raw text file with one sentence per line, so write
# only the sentence text: no index column, no header row and no CSV quoting
# (all of which would otherwise be learned as part of the corpus). Whitespace
# runs, including embedded newlines, are collapsed so that every passage stays
# on exactly one line.
with open('sentence_piece_input_10k.csv', 'w', encoding='utf-8') as corpus_file:
    for sentence in sentence_piece_input['sentence']:
        corpus_file.write(' '.join(str(sentence).split()) + '\n')

In [None]:
import sentencepiece as spm

# Define parameters for training
train_args = {
    'input': 'sentence_piece_input_10k.csv',             # Input file
    'model_prefix': 'mymodel',        # Prefix for the output model files (.model and .vocab)
    'vocab_size': 8000,              # Size of the vocabulary
    'character_coverage': 0.9995,     # Fraction of characters covered by the model. SentencePiece's documented defaults are 0.9995 for languages with rich character sets like Japanese or Chinese and 1.0 for languages with a small character set
    'model_type': 'unigram',          # Model type can be 'unigram' (default), 'bpe', 'char', or 'word'
    # Add other parameters as needed.
}

# Train the model
# The dict is flattened into one space-separated string of '--key=value' flags.
spm.SentencePieceTrainer.Train(' '.join([f'--{k}={v}' for k, v in train_args.items()]))

print("Model trained and saved as mymodel.model and mymodel.vocab!")

In [None]:
# Location of the vocabulary file written by SentencePiece training
vocab_file = 'mymodel.vocab'

# Print at most the first N entries of the vocabulary
N = 1000  # Raise or lower this to see more or fewer tokens
with open(vocab_file, 'r', encoding='utf-8') as file:
    lines_shown = 0
    for line in file:
        if lines_shown >= N:
            break
        print(line.strip())
        lines_shown += 1

In [None]:
import sentencepiece as spm

# Load the trained SentencePiece model
# `sp` is the tokenizer object that later cells call EncodeAsPieces, EncodeAsIds and GetPieceSize on.
sp = spm.SentencePieceProcessor()
sp.Load('mymodel.model')

In [None]:
# Encode every sampled sentence twice: as sub-word piece strings and as vocabulary ids
sentence_texts = sentence_piece_input['sentence'].map(str)
sentence_piece_input['tokenized'] = sentence_texts.map(sp.EncodeAsPieces)
sentence_piece_input['tokenized_ids'] = sentence_texts.map(sp.EncodeAsIds)


In [None]:
# Persist the sampled sentences together with their token pieces and ids.
# NOTE(review): the list-valued columns are presumably written in string form, so a later read_csv would need to parse them back into lists — confirm before reusing this file.
sentence_piece_input.to_csv('ms_marco_sample_tokenised.csv')

In [None]:
# Bare expression: shows the DataFrame, now including the tokenized columns, as the cell output.
sentence_piece_input

In [None]:
# Show the sampled sentences as a plain Python list.
sentence_piece_input['sentence'].to_list()

# Output token embeddings

## Run word2vec on tokenised corpus

In [None]:
# class W2VData(torch.utils.data.Dataset):
#     def __init__(self, corpus, window_size=2):
#         self.tokenized_corpus = corpus
#         self.data = []
#         self.create_data(window_size)
#         self.build_vocab()

#     def build_vocab(self):
#         # Flatten the tokenized corpus and get unique tokens
#         tokens = [token for sublist in self.tokenized_corpus for token in sublist]
#         self.vocab = set(tokens)
#         # Create token to index mapping
#         self.token2idx = {token: idx for idx, token in enumerate(self.vocab)}
#         self.idx2token = {idx: token for token, idx in self.token2idx.items()}

#     def create_data(self, window_size):
#         for tokens in self.tokenized_corpus:
#             for i, target in enumerate(tokens):
#                 context = (
#                     tokens[max(0, i - window_size) : i]
#                     + tokens[i + 1 : i + window_size + 1]
#                 )
#                 if len(context) != 2 * window_size:
#                     continue
#                 self.data.append((context, target))

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         context, target = self.data[idx]
#         # Convert tokens to indices
#         context_indices = [self.token2idx[token] for token in context]
#         target_index = self.token2idx[target]
#         return torch.tensor(context_indices), torch.tensor(target_index)
    
# class CBOW(torch.nn.Module):
#     def __init__(self, vocab_size, embedding_dim):
#         super(CBOW, self).__init__()
#         self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
#         self.linear = torch.nn.Linear(embedding_dim, vocab_size)

#     def forward(self, inputs):
#         embeds = torch.sum(self.embeddings(inputs), dim=1)
#         out = self.linear(embeds)
#         log_probs = torch.nn.functional.log_softmax(out, dim=1)
#         return log_probs
    
# data = pd.read_csv('ms_marco_tokenised.csv')
# ds = W2VData(data)
# dl = torch.utils.data.DataLoader(ds, batch_size=1, shuffle=True)

# for batch in dl:
#     print(batch)




# vocab_size = len(ds.tknz.vocab)
# lang = model.Language(torch.rand(vocab_size, 50), 7)
# loss_function = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(lang.parameters(), lr=0.001)
# torch.save(lang.state_dict(), f"./lang_epoch_0.pt")


# for epoch in range(5):
#     for sentence, target, _ in dl:
#         optimizer.zero_grad()
#         log_probs = lang(sentence)
#         loss = loss_function(log_probs, target)
#         loss.backward()
#         optimizer.step()
#         print(f"Epoch {epoch+1}/5, Loss: {loss.item()}")
#     torch.save(lang.state_dict(), f"./lang_epoch_{epoch+1}.pt")

W2V steps:
- generate CBOW table
- initialise embedding matrix and linear layer
- for each loop:
    - grab embedding vectors for context words
    - sum into one embedding vector
    - multiply by linear layer
    - softmax the result
    - calc loss against target
    - backprop
  

In [None]:
# First ten rows (by position) of the tokenised sample, kept aside as a small test set
sentence_piece_input_test = sentence_piece_input.iloc[:10]

## CBOW table

In [None]:
class W2VData(torch.utils.data.Dataset):
    """CBOW training pairs built from a tokenised corpus.

    Each item is a ``(context, target)`` pair of tensors, where ``context``
    holds the ``window_size`` token ids on either side of ``target``.

    Args:
        corpus: DataFrame with a ``tokenized_ids`` column; every cell is a
            sequence of integer token ids.
        window_size: Number of context tokens taken on each side of the target.
    """

    def __init__(self, corpus, window_size=2):
        self.corpus = corpus
        self.data = []
        self.create_data(window_size)

    def create_data(self, window_size):
        """Append every full-window ``(context, target)`` pair to ``self.data``.

        Positions closer than ``window_size`` to either end of a sequence are
        skipped, so every context has exactly ``2 * window_size`` ids.
        """
        # Iterate the one column that is needed instead of materialising a
        # Series per row with iterrows().
        for tokens in self.corpus['tokenized_ids']:
            # Normalise to a list of plain ints: with an ndarray cell, `+`
            # would add element-wise instead of concatenating the two halves.
            tokens = [int(token) for token in tokens]
            for i in range(window_size, len(tokens) - window_size):
                context = tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1]
                self.data.append((context, tokens[i]))

    def __len__(self):
        """Return the number of (context, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``(context_tensor, target_tensor)``."""
        context, target = self.data[idx]
        return torch.tensor(context), torch.tensor(target)

# Build the CBOW pairs from the tokenised sample, two context tokens on each side
dataset = W2VData(sentence_piece_input, window_size=2)


# data = []
# window_size = 2
# for index, row in sentence_piece_input.iterrows():
#     print (index)
#     tokens = row['tokenized_ids']
#     for i, target in enumerate(tokens):
#         context = (
#             tokens[max(0, i - window_size) : i]
#             + tokens[i + 1 : i + window_size + 1]
#         )
#         if len(context) != 2 * window_size:
#             continue
#         data.append((context, target))

In [None]:
# Number of (context, target) pairs in the CBOW dataset.
len(dataset)

In [None]:
# Serve the CBOW pairs in shuffled mini-batches of four.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)


## Initialise embedding and linear layer

In [None]:
class CBOW(torch.nn.Module):
    """Continuous bag-of-words model: summed context embeddings -> vocabulary log-probabilities."""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return one row of log-probabilities over the vocabulary per row of context ids."""
        context_vectors = self.embeddings(inputs)
        # Collapse the context positions (dim 1) into a single vector per example
        summed_context = context_vectors.sum(dim=1)
        vocab_scores = self.linear(summed_context)
        return torch.log_softmax(vocab_scores, dim=1)
    


## W2V for loop:

In [None]:
# Model, loss and optimiser for CBOW training
vocab_size = sp.GetPieceSize()  # one embedding row per SentencePiece piece
cbow = CBOW(vocab_size=vocab_size, embedding_dim=50)
loss_function = torch.nn.NLLLoss()  # consumes the log-probabilities CBOW.forward returns
optimizer = torch.optim.SGD(params=cbow.parameters(), lr=0.001)

In [None]:
# Number of mini-batches per epoch.
print (len(dataloader))

## Train W2V

In [None]:
# Train CBOW for two epochs; the summed batch loss is printed and a checkpoint saved after each one
for epoch in range(2):
    batch_losses = []
    progress = tqdm.tqdm(dataloader, desc=f"Epoch {epoch+1}/2", unit="batch")
    for context, target in progress:
        optimizer.zero_grad()
        loss = loss_function(cbow(context), target)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}/2, Loss: {total_loss}")
    torch.save(cbow.state_dict(), f"./cbow_epoch_{epoch+1}.pt")

We have now trained an embedding matrix with the CBOW method, giving us a (vocab_size, embedding_dim) matrix. There are two options from here:
1. Use an RNN/LSTM to convert these token embeddings into sentence embeddings, for all of our query and document sentences. Follow this up with a two-tower architecture.
2. Skip the sentence embedding step, and use the embedding matrix directly in a two-tower (RNN/LSTM) architecture. 

I'm leaning towards the latter because of time constraints, a less complex architecture, and possibly improved performance, at the cost of longer training time (I think).


# Token -> sentence embeddings

Skip this. 

# PCA(?) to reduce dimensionality of sentence embeddings?

Skipped

# Two towers -> trained two tower architecture

## Create dataset to input to two tower

At each loop, we're going to need:
1. The query
2. The sentence 
3. The document the sentence belongs to 
4. The label (0 or 1 if Bing returned the doc for the query)

In [None]:
import pandas as pd

# Reload the training split and work on a random sample of 10,000 queries.
dataset = load_dataset("ms_marco", 'v1.1', split="train")
df_train = pd.DataFrame(dataset)
df_train = df_train.sample(n = 10000)

# df_train = df_train.loc[:10,:]

new_rows = []

# Expand every query into one row per candidate passage.
# After .sample() the index labels are the original dataset positions, so
# dividing them by len(df_train) is not a progress fraction; tqdm (already
# imported at the top of the file) reports real progress instead.
for _, row in tqdm.tqdm(df_train.iterrows(), total=len(df_train), desc="Expanding passages", unit="query"):
    query = row['query']
    passages = row['passages']
    new_rows.extend(
        {
            'query': query,
            'is_selected': is_selected,
            'passage_text': passage,
        }
        for is_selected, passage in zip(passages['is_selected'], passages['passage_text'])
    )

# Convert the list of dictionaries to a DataFrame
result_df = pd.DataFrame(new_rows)



In [None]:
# Number of (query, passage) rows after expansion.
print (len(result_df))


## Tokenise the queries and passage texts

In [None]:
# Replace the raw text in both text columns with SentencePiece token ids
for text_column in ('query', 'passage_text'):
    result_df[text_column] = result_df[text_column].apply(lambda x: sp.EncodeAsIds(str(x)))


In [None]:
# Bare expression: shows the tokenised DataFrame as the cell output.
result_df

## Convert this to a dataset, then a dataloader

In [None]:
from torch.utils.data import DataLoader

class TwoTowerData(torch.utils.data.Dataset):
    """Dataset of tokenised (query, label, passage) examples for the two-tower model.

    Args:
        dataframe: DataFrame with ``query`` and ``passage_text`` columns holding
            sequences of integer token ids and an ``is_selected`` label column.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``(query_ids, label, passage_ids)`` for the row at position ``index``."""
        # Fetch the row once instead of doing a separate positional lookup per column.
        row = self.dataframe.iloc[index]

        # Force an integer dtype: an empty token list would otherwise become a
        # float32 tensor, which cannot be used to index an embedding table.
        query = torch.tensor(row['query'], dtype=torch.long)
        passage_text = torch.tensor(row['passage_text'], dtype=torch.long)
        is_selected = torch.tensor(row['is_selected'])

        return query, is_selected, passage_text
    
from torch.utils.data import DataLoader

# Wrap the tokenised (query, label, passage) rows as a dataset.
two_tower_dataset = TwoTowerData(result_df)
# Number of examples per batch for the two-tower DataLoader.
batch_size = 1000


def pad_sequence(sequences, padding_value=0):
    """Right-pad variable-length tensors into one ``(batch, max_len, ...)`` tensor.

    Thin wrapper around ``torch.nn.utils.rnn.pad_sequence`` in batch-first
    mode, replacing a hand-rolled copy that relied on the legacy
    ``Tensor.data.new`` constructor.

    Args:
        sequences: Non-empty list of tensors that may differ in their first
            dimension but share all trailing dimensions.
        padding_value: Value written to the padded positions.

    Returns:
        A tensor with the dtype and device of ``sequences[0]`` whose row ``i``
        is ``sequences[i]`` followed by ``padding_value``.

    Raises:
        RuntimeError: If ``sequences`` is empty.
    """
    return torch.nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=padding_value
    )

def collate_fn(batch):
    """Collate ``(query, label, sentence)`` samples into batch tensors.

    Queries and sentences are padded to the longest sequence in the batch;
    labels are stacked. Returns ``(padded_queries, labels, padded_sentences)``.
    """
    padded_queries = pad_sequence([sample[0] for sample in batch])
    stacked_labels = torch.stack([sample[1] for sample in batch])
    padded_sentences = pad_sequence([sample[2] for sample in batch])
    return padded_queries, stacked_labels, padded_sentences


# Shuffled batches; collate_fn pads the variable-length token sequences so they fit in one tensor.
two_tower_dataloader = DataLoader(two_tower_dataset, batch_size = batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
# Number of batches per epoch.
len(two_tower_dataloader)

In [None]:
# Sanity check: print the first three collated batches (queries, labels, sentences).
for batch_idx, batch in enumerate(two_tower_dataloader):
    print(f"Batch {batch_idx + 1}:")
    
    print (batch)
    
    # To prevent printing all batches (if you have many), you can break after a few:
    if batch_idx >= 2:  # Change this number as needed
        break

In [None]:
# NOTE(review): this loader relies on the default collate function, which
# presumably cannot batch token sequences of different lengths; collate_fn
# exists to pad them. TODO confirm whether this cell is meant to show that.
example = DataLoader(two_tower_dataset, batch_size = batch_size, shuffle=True)

# Pull a single batch. (A stray `da` token that followed this line raised
# NameError and has been removed.)
iter_obj = iter(example)
next(iter_obj)

In [None]:
# Token count of the first example's passage
two_tower_dataset[0][2].size()

In [None]:
# Token count of the second example's passage
two_tower_dataset[1][2].size()

# Define two-tower model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class TwoTowerModel(nn.Module):
    """Two LSTM towers (query and sentence) over one shared, frozen embedding table.

    Args:
        embedding_matrix: Pretrained ``(vocab_size, embedding_dim)`` weights.
        hidden_size: Hidden size of each tower's LSTM.
        output_size: Size of the vector each tower produces.
    """

    def __init__(self, embedding_matrix, hidden_size, output_size):
        super().__init__()
        embedding_dim = embedding_matrix.size(1)

        # Shared lookup table; freeze=True keeps the pretrained weights fixed
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)

        # One recurrent encoder per tower
        self.query_lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.sentence_lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

        # Projection from the final hidden state to the output vector
        self.query_dense = nn.Linear(hidden_size, output_size)
        self.sentence_dense = nn.Linear(hidden_size, output_size)

    def forward(self, query, sentence):
        """Return ``(query_vector, sentence_vector)`` for batches of token ids."""
        query_vector = self._encode(query, self.query_lstm, self.query_dense)
        sentence_vector = self._encode(sentence, self.sentence_lstm, self.sentence_dense)
        return query_vector, sentence_vector

    def _encode(self, token_ids, lstm, dense):
        """Embed ``token_ids``, run ``lstm`` and project its final hidden state with ``dense``."""
        # NOTE(review): the final hidden state is taken over the whole input,
        # so any padded positions in a batch are encoded too — confirm intent.
        _, (final_hidden, _) = lstm(self.embedding(token_ids))
        return dense(final_hidden.squeeze(0))

# Loss function
def contrastive_loss(query_vector, sentence_vector, label, margin=0.5):
    """Mean contrastive loss on the cosine similarity of paired vectors.

    Pairs with ``label == 1`` are penalised by ``relu(margin - sim) ** 2``;
    pairs with ``label == 0`` are penalised by ``sim ** 2``.
    """
    similarity = F.cosine_similarity(query_vector, sentence_vector, dim=1)

    negative_term = (1 - label) * similarity.pow(2)
    positive_term = label * F.relu(margin - similarity).pow(2)
    return (negative_term + positive_term).mean()

# Example use:
# Copy the trained CBOW embedding table so the two-tower model gets its own
# tensor. detach().clone() is the recommended way to copy a tensor; wrapping
# an existing tensor in torch.tensor(...) emits a UserWarning.
embedding_weights = cbow.embeddings.weight.detach().clone()
model = TwoTowerModel(embedding_matrix=embedding_weights, hidden_size=128, output_size=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 2

# Training loop example:
for epoch in range(epochs):
    # Count batches from 1 so the printed fraction finishes at exactly 1.0
    # (the previous counter started at 2 and ended above 1.0).
    for i, (batch_query, label, batch_sentence) in enumerate(two_tower_dataloader, start=1):
        print (i / len(two_tower_dataloader))
        optimizer.zero_grad()

        query_vector, sentence_vector = model(batch_query, batch_sentence)
        loss = contrastive_loss(query_vector, sentence_vector, label)

        loss.backward()
        optimizer.step()
        


To do: 
Cast all the queries and sentences into embedding space. 
Then feed in batches as you've got now. 

# Test performance

In [None]:
import torch
import torch.nn.functional as F

# Assuming your tokenizer and preprocessing method:
def tokenize_and_tensorize(text, tokenizer, max_length=100):
    """Encode ``text`` into a ``(1, n_tokens)`` tensor of token ids.

    Args:
        text: Input to encode; it is converted with ``str()`` first.
        tokenizer: Object exposing ``EncodeAsIds(str) -> list[int]``.
        max_length: Keep at most this many leading tokens. Pass ``None`` to
            disable truncation. (This parameter used to be accepted but ignored.)

    Returns:
        An int64 tensor with a leading batch dimension of size 1.
    """
    tokens = tokenizer.EncodeAsIds(str(text))
    if max_length is not None:
        tokens = tokens[:max_length]
    # Explicit dtype: an empty token list would otherwise produce a float tensor.
    return torch.tensor(tokens, dtype=torch.long).unsqueeze(0)  # Add batch dimension

def get_query_embedding(query, model, tokenizer):
    """Return the query-tower output of ``model`` for a single query string."""
    token_ids = tokenize_and_tensorize(query, tokenizer)
    with torch.no_grad():
        # The model takes both inputs, so the query is passed twice and only
        # the first (query-tower) output is kept.
        tower_outputs = model(token_ids, token_ids)
    return tower_outputs[0]

def compute_similarities(query_embedding, sentences, model, tokenizer):
    """Return the cosine similarity between ``query_embedding`` and each sentence.

    Every sentence is encoded with the sentence tower of ``model``; the
    fraction of sentences processed so far is printed after each step.
    """
    total = len(sentences)
    similarities = []
    with torch.no_grad():
        for position, sentence in enumerate(sentences, start=1):
            print (position/total)
            token_ids = tokenize_and_tensorize(sentence, tokenizer)
            # The same ids are passed to both towers; only the sentence-tower output is used
            sentence_embedding = model(token_ids, token_ids)[1]
            score = F.cosine_similarity(query_embedding, sentence_embedding)
            similarities.append(score.item())
    return similarities

# Switch the model to evaluation mode before scoring
model.eval()

# Test
query = "The rental fee ranges from $1,600 to $3,000 for reception and includes 5 hours of event time excluding set up and clean up time. The fee to rent the venue for a wedding ceremony ranges from $1,200 to $1,500 with reception rental. Additional hours can be arranged for a fee of $350/hr. Event time varies depending on time of event. Daytime Weddings 10:00AM-4:00PM & Evening Weddings 5:00PM-11:00PM. The rental fee ranges from $1,600 to $3,000 for reception and includes 5 hours of event time excluding set up and clean up time. The fee to rent the venue for a wedding ceremony ranges from $1,200 to $1,500 with reception rental"
tokenizer = sp
sentences = sentence_piece_input['sentence'].tolist()

query_embedding = get_query_embedding(query, model, tokenizer)
similarities = compute_similarities(query_embedding, sentences, model, tokenizer)

# Rank sentence positions from most to least similar and keep the ten best (adjust as needed)
ranked_pairs = sorted(enumerate(similarities), key=lambda pair: -pair[1])
sorted_indices = [position for position, _ in ranked_pairs]
top_matches = [sentences[position] for position in sorted_indices[:10]]

for match in top_matches:
    print(match)
    print ('-----------------------------------------------------')