# Imports

In [None]:
from datasets import load_dataset_builder, load_dataset
import pandas as pd
import csv
import torch
import string
import tqdm

# Load dataset

In [None]:
# Load the MS MARCO v1.1 *validation* split via `datasets.load_dataset` and
# wrap it in a pandas DataFrame.
# NOTE: despite its name, `df_train` holds whichever split is selected here.
dataset = load_dataset("ms_marco", 'v1.1', split="validation")
# Swap in the line below to use the training split instead.
# dataset = load_dataset("ms_marco", 'v1.1', split="train")
df_train = pd.DataFrame(dataset)

# Tokenise

Steps taken:
- pip install sentencepiece
- prepare the data in the required format (a CSV file with one sentence per line)
- train a SentencePiece model on the corpus, to learn the token vocabulary
- encode each sentence with the trained model, to convert it to tokens and token ids

In [None]:
# Helper that dumps the corpus to disk for SentencePiece training.
from tokenizer import prepare_sentencepiece_dataset

# Write the dataframe's text to a CSV file on disk, in the format expected by
# the SentencePieceTrainer.
prepare_sentencepiece_dataset(
    df_train,
    output_file='sentence_piece_input.csv',
)


In [None]:
import sentencepiece as spm
from tokenizer import train_sentencepiece

# Parameters for SentencePiece training.
# The corpus path is deliberately NOT called `input`: binding that name at
# notebook level would shadow the builtin `input()` for every later cell.
input_file = 'sentence_piece_input.csv'
model_prefix = 'mymodel'
vocab_size = 4000
character_coverage = 0.9995
model_type = 'unigram'

# Train the tokenizer on the prepared corpus.
train_sentencepiece(input_file, model_prefix, vocab_size, character_coverage, model_type)

# Derive the reported file names from `model_prefix` so the message stays
# correct if the prefix is changed.
print(f"Model trained and saved as {model_prefix}.model and {model_prefix}.vocab!")

In [None]:
import sentencepiece as spm

# Instantiate a processor and point it at the model trained earlier.
sp = spm.SentencePieceProcessor()
sp.Load('mymodel.model')

# Re-read the prepared corpus: no header row, a single 'sentence' column.
sentence_piece_input = pd.read_csv(
    'sentence_piece_input.csv',
    header=None,
    names=['sentence'],
)

# Encode every sentence twice: once as string pieces, once as integer ids.
# Values are passed through str() first, exactly as read from the CSV.
sentence_piece_input['tokenized'] = sentence_piece_input['sentence'].apply(
    lambda text: sp.EncodeAsPieces(str(text))
)
sentence_piece_input['tokenized_ids'] = sentence_piece_input['sentence'].apply(
    lambda text: sp.EncodeAsIds(str(text))
)

# Persist sentences together with both encodings.
sentence_piece_input.to_csv('ms_marco_tokenised.csv')

# Output token embeddings

## Run word2vec on tokenised corpus

W2V steps:
- generate CBOW table
- initialise embedding matrix and linear layer
- for each training step (one batch):
    - grab embedding vectors for context words
    - sum into one embedding vector
    - multiply by linear layer
    - softmax the result
    - calc loss against target
    - backprop
  

## CBOW table

In [None]:
from two_tower_datasets import W2VData
# Build the W2V CBOW dataset from the tokenised sentences dataframe.
# NOTE(review): the second argument (5) is presumably the context-window
# size — confirm against the W2VData constructor.
# NOTE: this rebinds `dataset`, which previously held the raw MS MARCO split.
dataset = W2VData(sentence_piece_input, 5)

In [None]:
# Examine number of rows in W2V CBOW data.
# Kept as the last bare expression so the notebook displays the value.
len(dataset)

In [None]:
# Wrap the CBOW dataset in a shuffling loader; a large batch size (1024) is
# used for this data loader.
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=1024,
    shuffle=True,
)

## W2V for loop:

In [None]:
from model import CBOW

# Negative log-likelihood loss; independent of the model, so set up first.
loss_function = torch.nn.NLLLoss()

# Size the model from the trained tokenizer's vocabulary
# (this overwrites the `vocab_size` used when training SentencePiece).
vocab_size = sp.GetPieceSize()

# CBOW model: vocab_size x embedding_dim, with 50 passed as the second argument.
cbow = CBOW(vocab_size, 50)

# Plain SGD over all CBOW parameters.
optimizer = torch.optim.SGD(
    cbow.parameters(),
    lr=0.001,
)

In [None]:
# How many batches does one pass over the dataloader yield?
print(len(dataloader))

## Train W2V

### Check CUDA

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): no later cell in this notebook references `device`, so the
# models are never explicitly moved to it here — confirm this is intended.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
from train import train_cbow

# Run a single epoch of CBOW training to obtain the embedding matrix.
# The trained embeddings are later handed to the two-tower model.
train_cbow(
    n_epochs=1,
    model=cbow,
    loss_function=loss_function,
    optimizer=optimizer,
    dataloader=dataloader,
)

Now we have trained an embedding matrix via the CBOW method, giving us a (vocab_size, embedding_dim) matrix. We now have two options:
1. Use an RNN/LSTM to convert these token embeddings into sentence embeddings, for all of our query and document sentences. Follow this up with a two-tower architecture.
2. Skip the sentence embedding step, and use the embedding matrix directly in a two-tower (RNN/LSTM) architecture. 

I'm leaning towards the latter because of time constraints, a less complex architecture, and possibly improved performance, at the cost of training time (I think).


# Token -> sentence embeddings

Skip this. 

# PCA(?) to reduce dimensionality of sentence embeddings?

Skipped

# Two towers -> trained two tower architecture

## Create dataset to input to two tower

At each training step, we're going to need:
1. The query
2. The sentence 
3. The document the sentence belongs to 
4. The label (1 if Bing returned the doc for the query, 0 otherwise)

In [None]:
import pandas as pd
from two_tower_datasets import two_tower_dataset

# Fetch MS MARCO again as the source for the two-tower dataset.
# NOTE: this rebinds `dataset` and `df_train`; the validation split is used.
# For the training split use: load_dataset("ms_marco", 'v1.1', split="train")
dataset = load_dataset(
    "ms_marco",
    'v1.1',
    split="validation",
)
df_train = pd.DataFrame(dataset)

# Row count of the raw dataframe, printed before it is reshaped.
print(len(df_train))

# Reshape the raw rows into the dataframe consumed by the two-tower pipeline.
result_df = two_tower_dataset(df_train)

In [None]:
# Row count of the two-tower dataframe.
print(len(result_df))

In [None]:
# Preview the first 20 rows (bare expression, so the notebook renders it).
result_df.head(20)

In [None]:
# Sum of the 'is_selected' column.
# NOTE(review): presumably a 0/1 label, making this the number of positive
# rows — confirm against two_tower_dataset.
result_df['is_selected'].sum()

## Tokenise the queries and passage texts

In [None]:
def _encode_ids_once(value):
    """Return SentencePiece ids for *value*, leaving already-encoded rows alone.

    The columns are overwritten in place, so re-running this cell would
    otherwise encode the string form of an id list (e.g. "[12, 7, 3]") and
    silently corrupt the data. A list of ints is treated as already encoded.
    """
    if isinstance(value, list) and all(isinstance(tok, int) for tok in value):
        return value
    return sp.EncodeAsIds(str(value))


# Replace the raw text in both columns with token ids.
result_df['query'] = result_df['query'].apply(_encode_ids_once)
result_df['passage_text'] = result_df['passage_text'].apply(_encode_ids_once)

## Convert this to a dataset, then a dataloader

In [None]:
from torch.utils.data import DataLoader
from two_tower_datasets import TwoTowerData, collate_fn, pad_sequence

# Wrap the tokenised dataframe in a Dataset.
# The instance is named `two_tower_data` rather than `two_tower_dataset` so
# it does not shadow the `two_tower_dataset` function imported from
# `two_tower_datasets` in an earlier cell.
two_tower_data = TwoTowerData(result_df)
batch_size = 512

# Shuffled batches, assembled by the project-specific `collate_fn`.
two_tower_dataloader = DataLoader(
    two_tower_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Number of batches per pass (bare expression, so the notebook displays it).
len(two_tower_dataloader)

# Define two-tower model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from model import TwoTowerModel, CBOW
from loss import contrastive_loss
from train import train_two_tower

# Take an independent, gradient-free copy of the trained CBOW embedding
# weights. `detach().clone()` replaces `torch.tensor(existing_tensor)`, which
# makes the same copy but emits a copy-construct UserWarning.
embedding_weights = cbow.embeddings.weight.detach().clone()

# Initialise the two-tower model on top of the CBOW embeddings.
model = TwoTowerModel(embedding_matrix=embedding_weights, hidden_size=128, output_size=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
n_epochs_two_tower = 2

# Run two-tower training with the contrastive loss.
train_two_tower(n_epochs_two_tower, model, contrastive_loss, optimizer, two_tower_dataloader)


To do: 
Cast all the queries and sentences into embedding space. 
Then feed in batches as you've got now. 

# Test performance

In [None]:
import torch
import torch.nn.functional as F
from inference import create_offline_sentence_embeddings

# Switch the model to evaluation mode before producing embeddings.
model.eval()
# Optionally persist the trained model:
# torch.save(model, 'two_tower.pth')

# The trained SentencePiece processor doubles as the tokenizer at inference.
tokenizer = sp

# Every sentence of the prepared corpus, as a plain Python list.
sentences = list(sentence_piece_input['sentence'].values)

# Pre-compute embeddings for the whole corpus.
offline_embeddings_dict = create_offline_sentence_embeddings(
    sentences,
    model,
    tokenizer,
)

In [None]:
import json


def _to_json_list(value):
    """Return *value* as a list that ``json.dump`` can serialise.

    Lists pass through unchanged and other values are wrapped in a list, as
    before. Objects exposing ``tolist()`` are converted first, because
    ``json`` cannot serialise them even when wrapped in a list.
    NOTE(review): the embedding values are presumably torch tensors or NumPy
    arrays — confirm against create_offline_sentence_embeddings.
    """
    if hasattr(value, 'tolist'):
        value = value.tolist()
    return value if isinstance(value, list) else [value]


converted_dict = {k: _to_json_list(v) for k, v in offline_embeddings_dict.items()}

# json.dump escapes non-ASCII characters by default, so the file is ASCII.
with open('offline_embeddings_dict.json', 'w', encoding='utf-8') as f:
    json.dump(converted_dict, f)

In [None]:
from inference import get_query_embedding, compute_similarities

# Text to search for (split across two literals only for line length).
query = (
    "Service Technician Salary. Service Technician average salary is $42,052, "
    "median salary is $40,000 with a salary range from $20,000 to $100,000"
)

# Embed the query, then score it against the pre-computed embeddings.
query_embedding = get_query_embedding(query, model, tokenizer)
similarities = compute_similarities(
    query_embedding,
    offline_embeddings_dict,
    model,
    tokenizer,
)

# Rank (key, score) pairs by descending score and keep the best 10
# (adjust the slice as needed).
sorted_indices = list(similarities.items())
sorted_indices.sort(key=lambda pair: pair[1], reverse=True)
top_matches = sorted_indices[:10]

for match in top_matches:
    print(match)

In [None]:
# NOTE(review): this looks like output pasted from the top-matches cell
# above: (passage text, similarity score) tuples in descending score order.
('Fromage a Raclette. Raclette is a semi-hard cheese made on both sides of the French and Swiss Alps. Valais Raclette or Fromage a Raclette, as they are traditionally called, are made using ancestral methods with unpasteurised milk of cows grazing on the alpine meadows. While Switzerland supplies 80% of Raclettes, French Raclettes are slightly softer with a smooth and creamy flavour. Raclette is also the name of a Swiss dish where the cheese is melted in front of a fire or a special machine and the melted parts are scraped onto diner’s plates. It is then served with small potatoes, gherkins, pickled onions and air dried meat called Viande des Grison', 0.8666157126426697)
('Sutton is an English-language surname of England and Ireland. One origin is from Anglo-Saxon where it is derived from sudh, suth, or suð, and tun referring to the generic placename Southtown. Note that almost every county in England contains one or more placenames bearing the prefix Sutton. The Domesday Book (1086) contains the first recorded spelling of the surname as Ketel de Sudtone; Suttuna also appeared in 1086 in records from Ely, Cambridgeshire. In 1379 tax records, the surname appears as de Sutton (of Southtown). One source refers to the origin as being Anglo-Norman, with the name itself derived as described above, from Anglo-Saxon terms. Related surnames include early variants de Sudtone (1086), Suttuna (1086), de Sutton (1379), and de Sutu', 0.7647449970245361)