In [1]:
import json
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
import torch
import tqdm

In [41]:
with open('scraped_data.json', encoding='utf-8') as fh:
    data = json.load(fh)

In [42]:
model_name = "hkunlp/instructor-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of T5Model were not initialized from the model checkpoint at hkunlp/instructor-base and are newly initialized: ['decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.8.layer.1.EncDecAttention.v.weight', 'decoder.block.6.layer.1.layer_norm.weight', 'decoder.block.7.layer.1.EncDecAttention.k.weight', 'decoder.block.7.layer.0.layer_norm.weight', 'decoder.block.10.layer.0.SelfAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.k.weight', 'decoder.block.8.layer.2.DenseReluDense.wi.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.3.layer.1.EncDecAttention.k.weight', 'decoder.block.6.layer.1.EncDecAttention.v.weight', 'decoder.block.1.layer.1.layer_norm.weight', 'decoder.block.7.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.2.DenseReluDense.wi.weight', 'decoder.block.4.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.0.

In [43]:
def split_text(text, max_tokens=512):
    paragraphs = text.split('\n\n')
    tokenized_paragraphs = []
    for paragraph in paragraphs:
        inputs = tokenizer(paragraph, return_tensors='pt', truncation=True, max_length=max_tokens, padding=True)
        tokens = inputs['input_ids'][0].shape[0]
        if tokens <= max_tokens:
            tokenized_paragraphs.append(paragraph)
        else:
            sentences = paragraph.split('. ')
            current_chunk = ""
            for sentence in sentences:
                current_chunk += sentence + ". "
                if tokenizer(current_chunk, return_tensors='pt', truncation=True, max_length=max_tokens)['input_ids'][
                    0].shape[0] > max_tokens:
                    tokenized_paragraphs.append(current_chunk.strip())
                    current_chunk = sentence + ". "
            if current_chunk:
                tokenized_paragraphs.append(current_chunk.strip())
    return tokenized_paragraphs

In [44]:
def generate_embedding(text, max_length=512):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids

    with torch.no_grad():
         outputs = model(input_ids=inputs['input_ids'], decoder_input_ids=decoder_input_ids)
         embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
         return embedding.cpu().numpy()

In [45]:
embeddings = []
metadata = []

with tqdm.tqdm(total=len(data)) as pbar:
    for doc in data:

        if not doc['markdown']:
            continue

        chunks = split_text(doc['markdown'])

        for chunk in chunks:
            
            if(len(chunk) < 100):
                continue

            embedding = generate_embedding(f"{doc['title']}: {chunk}")
            embeddings.append(embedding)

            metadata.append({
                "url": doc["url"],
                "title": doc["title"],
                "chunk": chunk
            })

        pbar.update(1)

embeddings = np.array(embeddings)


 99%|█████████▉| 96/97 [00:55<00:00,  1.74it/s]


In [46]:
embeddings

array([[ 0.17698021, -0.03525208,  0.09637368, ...,  0.04824408,
         0.4126371 , -0.03036477],
       [ 0.1773256 , -0.03054882,  0.09689938, ...,  0.04768535,
         0.4106331 , -0.03188986],
       [ 0.17503065, -0.03578071,  0.09726293, ...,  0.04858001,
         0.41190344, -0.03124303],
       ...,
       [ 0.17224975, -0.03312673,  0.08933447, ...,  0.04262705,
         0.4087742 , -0.02835188],
       [ 0.17470224, -0.0325473 ,  0.09625758, ...,  0.048962  ,
         0.4103149 , -0.03362304],
       [ 0.17319512, -0.03420646,  0.09580311, ...,  0.05018023,
         0.411121  , -0.02943579]], dtype=float32)

In [49]:
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

from faiss import write_index

write_index(index, "rag.index")

In [50]:
query = "Looking for ERV information"
query = "Good morning,  thanks for the info. Would be possible to get a reimbursement in cash instead if coupon?  Many thanks,  <FIRST_NAME_1>."
query_embedding = generate_embedding(query)

In [51]:
D, I = index.search(query_embedding.reshape(1, -1), k=2)

In [52]:
for idx in I[0]:
    doc_metadata = metadata[idx]


    print(f"Doc metadata: ", doc_metadata)


Doc metadata:  {'url': 'https://www.datasport.com/en/erv/', 'title': 'ERV - Datasport', 'chunk': 'If you prefer a cash payment, please let us know by e-mail\n(info@datasport.com) after receiving the voucher and we will refund the\nvoucher amount minus a small fee.'}
Doc metadata:  {'url': 'https://www.datasport.com/en/for-athletes/faq/', 'title': 'FAQ - frequently asked questions - questions and answers - Datasport', 'chunk': 'If you prefer a cash payment, please let us know by e-mail\n(info@datasport.com) after receiving the voucher and we will refund the\nvoucher amount minus a small fee.'}
