In [9]:
from model.twotowermodel import DocumentTower, QueryTower
from data.dataset import QueryDocumentDataset
import torch

# Dimensions shared by both towers; they have to agree with the saved
# checkpoints or load_state_dict will report mismatched shapes.
EMBEDDING_DIM = 300
HIDDEN_DIM = 128

document_model = DocumentTower(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)
query_model = QueryTower(embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)

# map_location="cpu" lets checkpoints saved from a CUDA device deserialize on
# a CPU-only host; the modules are moved to their target device later.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
document_model.load_state_dict(
    torch.load('document_model_state_dict.pth', map_location="cpu")
)
# Last expression of the cell: the notebook displays the key-matching result.
query_model.load_state_dict(
    torch.load('query_model_state_dict.pth', map_location="cpu")
)


<All keys matched successfully>

In [2]:
# Put both towers into evaluation mode before running inference.
document_model.eval()
# Last expression of the cell, so the notebook displays the returned module.
query_model.eval()

QueryTower(
  (query_encoder): RNNEncoder(
    (rnn): GRU(300, 128, batch_first=True)
  )
)

In [10]:
from data.dataset import QueryDocumentDataset
from gensim.models import Word2Vec
import numpy as np
from torch import nn

# Path of the saved gensim Word2Vec model that supplies the word vectors.
model_path = "./artifacts/word2vec-300.bin"

# NOTE: `model` is the Word2Vec model, not one of the two towers
# (those are `document_model` and `query_model`).
model = Word2Vec.load(model_path)
# Dataset wrapper created with no data; another cell calls its
# get_query_embedding method to embed a query string.
dataset_instance = QueryDocumentDataset(data=[], embedding_model=model)


In [8]:
from gensim.utils import simple_preprocess

# Every document is truncated or zero-padded to this many token vectors.
MAX_SEQ_LENGTH = 128

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('device', device)

def get_docs_embedding(docs, batch_size=64):
    """Encode raw document strings with the document tower.

    Each document is tokenized with ``simple_preprocess`` and mapped to
    word2vec vectors (tokens missing from ``model.wv`` are dropped). The
    result is truncated or zero-padded to ``MAX_SEQ_LENGTH`` rows and passed
    through ``document_model.encode_single_doc`` in batches.

    Args:
        docs: Sliceable sequence of document strings.
        batch_size: Number of documents encoded per forward pass.

    Returns:
        ``np.ndarray`` with one encoding per document, in input order.
        An empty ``docs`` yields an empty array.

    Side effects:
        Moves ``document_model`` to ``device`` and prints the output shape.
    """
    all_doc_encodings = []

    # Move the model to the same device the input batches are sent to.
    document_model.to(device)
    vector_size = model.vector_size

    # Inference only: no_grad avoids building autograd state for every batch.
    with torch.no_grad():
        for start in range(0, len(docs), batch_size):
            batch_docs = docs[start:start + batch_size]

            # Pre-zeroed (batch, MAX_SEQ_LENGTH, vector_size) buffer: rows that
            # are never written are the zero padding, and a document with no
            # in-vocabulary tokens is simply all zeros.
            batch = np.zeros(
                (len(batch_docs), MAX_SEQ_LENGTH, vector_size), dtype=np.float32
            )
            for row, doc in enumerate(batch_docs):
                vectors = [
                    model.wv[word]
                    for word in simple_preprocess(doc)
                    if word in model.wv
                ][:MAX_SEQ_LENGTH]
                if vectors:
                    batch[row, :len(vectors)] = np.stack(vectors)

            # NOTE(review): padded rows reach the encoder without a mask or
            # length information; confirm this matches how documents were
            # padded during training.
            embeddings_tensor = torch.from_numpy(batch).to(device)
            doc_encodings_batch = document_model.encode_single_doc(embeddings_tensor)
            all_doc_encodings.extend(doc_encodings_batch.detach().cpu().numpy())

    final_embeddings = np.array(all_doc_encodings)
    print(final_embeddings.shape)
    return final_embeddings


device cuda


In [9]:
import numpy as np
from data.marco import get_all_documents

# Fetch the grouped documents, then flatten the groups into one list.
documents = get_all_documents()
flattened_documents = []
for sublist in documents:
    flattened_documents.extend(sublist)
print(f"Number of documents: {len(flattened_documents)}")

# Encode every document with the document tower.
docs_embedding = get_docs_embedding(flattened_documents)

# Persist the encodings so later cells can reload them from disk.
np.save("docs_embeddings_model.npy", docs_embedding)
print(f"Number of docs_embedding: {len(docs_embedding)}")

Number of documents: 676193
(676193, 128)
Number of docs_embedding: 676193


In [5]:
from data.marco import get_all_documents
# Fetch the grouped documents and flatten them into a single list, so that
# list positions can be used as document ids.
documents = (get_all_documents())
flattened_documents = [item for sublist in documents for item in sublist]

# Last expression of the cell: display the first document as a sanity check.
flattened_documents[0]

"Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site."

In [7]:
import faiss
import numpy as np


# Memory-map the saved encodings so they are read from disk on demand.
docs_embedding = np.load("docs_embeddings_model.npy", mmap_mode='r')
print(f"Number of docs_embedding: {len(docs_embedding)}")

# Exact L2 index whose dimensionality is the width of one encoding.
dimension = docs_embedding.shape[1]
print('dimension', dimension)
index = faiss.IndexFlatL2(dimension)
print('index')

chunk_size = 5000  # rows added per call; tune to the available memory
save_interval = 500  # write a checkpoint index every 500 chunks

for chunk_number, i in enumerate(range(0, len(docs_embedding), chunk_size), start=1):
    print("Adding chunk i:", i)
    # Slicing the memory-mapped array only touches this chunk's rows.
    chunk = docs_embedding[i:i + chunk_size]
    index.add(chunk)

    if chunk_number % save_interval != 0:
        continue
    # Periodic checkpoint of the partially built index.
    faiss.write_index(index, f"temp_index_{chunk_number}.index")
    print(f"Saved index at chunk {chunk_number}")

# Final index, written once every chunk has been added.
faiss.write_index(index, f"temp_index_test.index")
print('Index added')

Number of docs_embedding: 676193
dimension 128
index
Adding chunk i: 0
Adding chunk i: 5000
Adding chunk i: 10000
Adding chunk i: 15000
Adding chunk i: 20000
Adding chunk i: 25000
Adding chunk i: 30000
Adding chunk i: 35000
Adding chunk i: 40000
Adding chunk i: 45000
Adding chunk i: 50000
Adding chunk i: 55000
Adding chunk i: 60000
Adding chunk i: 65000
Adding chunk i: 70000
Adding chunk i: 75000
Adding chunk i: 80000
Adding chunk i: 85000
Adding chunk i: 90000
Adding chunk i: 95000
Adding chunk i: 100000
Adding chunk i: 105000
Adding chunk i: 110000
Adding chunk i: 115000
Adding chunk i: 120000
Adding chunk i: 125000
Adding chunk i: 130000
Adding chunk i: 135000
Adding chunk i: 140000
Adding chunk i: 145000
Adding chunk i: 150000
Adding chunk i: 155000
Adding chunk i: 160000
Adding chunk i: 165000
Adding chunk i: 170000
Adding chunk i: 175000
Adding chunk i: 180000
Adding chunk i: 185000
Adding chunk i: 190000
Adding chunk i: 195000
Adding chunk i: 200000
Adding chunk i: 205000
Adding

In [15]:
import faiss

# Restrict FAISS to a single OpenMP thread for this search.
faiss.omp_set_num_threads(1)

# NOTE(review): IO_FLAG_ONDISK_SAME_DIR looks intended for indexes that keep
# data in separate on-disk files; the index written by this notebook is an
# IndexFlatL2, so confirm the flag is actually needed here.
index = faiss.read_index("temp_index_test.index", faiss.IO_FLAG_ONDISK_SAME_DIR)
print("Index loaded")

k = 5  # Number of nearest neighbors to retrieve

query = "how long is german measles contagious?"
query_input = dataset_instance.get_query_embedding(query)

# Inference only: no_grad avoids building autograd state for the forward pass.
with torch.no_grad():
    query_emb = query_model(query_input)

# index.search takes a 2D array of queries, so unsqueeze the single encoding
# into a batch of one. .cpu() keeps the conversion valid if the query tower
# runs on a GPU and is a no-op for CPU tensors.
query_emb_2d = query_emb.unsqueeze(0).detach().cpu().numpy()
print(query_emb.shape)

# Fail early with a readable message if the query tower and the index disagree.
assert query_emb_2d.shape[1] == index.d, (
    f"query dimension {query_emb_2d.shape[1]} != index dimension {index.d}"
)

# Search the index for the top k most similar documents.
D, I = index.search(query_emb_2d, k)  # D: distances, I: indices of the neighbors

print("Indices of nearest neighbors:", I)
print("Distances to nearest neighbors:", D)

Index loaded
torch.Size([128])
Indices of nearest neighbors: [[108595 540659  45617 353242 466727]]
Distances to nearest neighbors: [[10.550879 10.625858 10.625963 10.626002 10.626014]]


In [16]:
# I comes from index.search with a single query, so it holds one row of k ids;
# flatten it into a 1D array of document positions.
top5_documents = I.flatten()
# NOTE(review): docs_embedding is reloaded here but not read anywhere in this cell.
docs_embedding = np.load("docs_embeddings_model.npy", mmap_mode='r')


# Print the text of each retrieved document.
# Assumes flattened_documents is in the same order as the rows that were
# added to the index — TODO confirm.
for idx in top5_documents:
    print(flattened_documents[idx])



Lenders are in charge of setting their own guidelines in reference to how long a pre-approval is valid. Some lenders will set a guideline of 45 days, while others will go as long as 90 days. However, should a pre-approval letter expire because the borrower hasn't found the right house, he can obtain a new pre-approval letter once the lender has re-verified his income, debt and credit situation. Mortgage pre-approvals are letters issued by a bank after a consumer's income, debt and credit history have been reviewed. These letters provide a statement showing the total amount of a loan that a consumer is approved to obtain in a home purchase.
FHA Mortgage Job Requirements. First, the lender must look at the last two full years of employment. For this reason it would be smart to have tax returns available for the last three years. Second, HUD not only wants to know that you now have a job, it also wants some sense that FHA borrowers will have a job in the future. 
FHA Upfront Mortgage Insu

In [81]:
# Spot check: display the document stored at position 8.
flattened_documents[8]

'A rebuildable atomizer (RBA), often referred to as simply a “rebuildable,” is just a special type of atomizer used in the Vape Pen and Mod Industry that connects to a personal vaporizer. 1 The bottom feed RBA is, perhaps, the easiest of all RBA types to build, maintain, and use. 2  It is filled from below, much like bottom coil clearomizer. 3  Bottom feed RBAs can utilize cotton instead of silica for the wick. 4  The Genesis, or genny, is a top feed RBA that utilizes a short woven mesh wire.'