In [1]:
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer, util
import numpy as np
import sqlite3
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import base64
from transformers import BartForConditionalGeneration, BartTokenizer
import os
import torch
from datetime import datetime

  from tqdm.autonotebook import tqdm, trange
2024-07-15 20:10:57.473465: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-15 20:11:00.989615: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Local directories that hold the saved model artifacts.
tokenizer_load_path = 'models/tokenizer'
bart_model_load_path = 'models/bart_model'
model_load_path = 'models/sentence_transformer'


In [3]:
# Sentence-transformer encoder, loaded from disk and moved to the selected device.
model = SentenceTransformer(model_load_path)
model = model.to(device)

# BART tokenizer, loaded from its own directory.
tokenizer = BartTokenizer.from_pretrained(tokenizer_load_path)

# BART conditional-generation model, loaded from disk and moved to the selected device.
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_load_path)
bart_model = bart_model.to(device)

In [30]:

def retrieve_embeddings_from_db(conn, collection_name=None):
    """Load chunk texts and their embedding vectors from the SQLite database.

    Args:
        conn: An open ``sqlite3`` connection. The database must contain an
            ``embeddings`` table with ``chunk_text`` and ``embedding`` columns,
            where ``embedding`` is the base64 encoding of raw float32 bytes.
        collection_name: Optional collection to restrict the result to. When
            given, ``embeddings.collection_id`` is joined against
            ``collections.id`` and only rows whose
            ``collections.collection_name`` equals this value are returned.
            When ``None`` (the default) every row of ``embeddings`` is returned.

    Returns:
        A ``(chunks, embeddings)`` tuple of two parallel lists: the chunk texts
        and the decoded 1-D ``numpy.float32`` arrays. The arrays are views over
        the decoded bytes and are therefore read-only.
    """
    c = conn.cursor()
    try:
        if collection_name is None:
            c.execute('SELECT chunk_text, embedding FROM embeddings')
        else:
            # Parameterized so the collection name is never spliced into the SQL.
            c.execute(
                '''
                SELECT embeddings.chunk_text, embeddings.embedding
                FROM embeddings
                INNER JOIN collections
                ON embeddings.collection_id = collections.id
                WHERE collections.collection_name = ?
                ''',
                (collection_name,),
            )
        rows = c.fetchall()
    finally:
        c.close()

    chunks = []
    embeddings = []
    for chunk_text, emb_str in rows:
        chunks.append(chunk_text)
        embeddings.append(np.frombuffer(base64.b64decode(emb_str), dtype=np.float32))

    return chunks, embeddings

def query_with_bart_model_with_one_chunk(chunks, embeddings, query):
    """Answer ``query`` with BART using the single most similar chunk as context.

    The query is embedded with the module-level sentence-transformer ``model``
    and compared against ``embeddings`` by cosine similarity. The best-matching
    chunk is placed in the prompt ``"Question: <query> Context: <chunk>"`` and
    passed to the module-level ``tokenizer`` / ``bart_model``.

    Args:
        chunks: Sequence of chunk texts.
        embeddings: Embedding vectors parallel to ``chunks`` (for example the
            list of NumPy arrays returned by ``retrieve_embeddings_from_db``).
        query: The question to answer.

    Returns:
        The generated text, decoded with special tokens removed.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        raise ValueError("chunks must not be empty")

    # Building a tensor from a list of NumPy arrays is very slow in PyTorch;
    # stack the list into one 2-D array before it is converted.
    if isinstance(embeddings, (list, tuple)) and all(
        isinstance(emb, np.ndarray) for emb in embeddings
    ):
        embeddings = np.stack(embeddings)

    query_embedding = model.encode(query)
    similarities = util.pytorch_cos_sim(query_embedding, embeddings)[0]

    # ``similarities`` is a torch tensor, so use torch's argmax and convert the
    # result to a plain int for list indexing.
    most_similar_idx = int(torch.argmax(similarities))
    most_similar_chunk = chunks[most_similar_idx]

    input_text = f"Question: {query} Context: {most_similar_chunk}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    # Follow the device the model actually lives on instead of re-checking CUDA.
    inputs = inputs.to(bart_model.device)

    summary_ids = bart_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def connect_to_database(db_file):
    """Open the SQLite database at ``db_file`` and return the connection."""
    return sqlite3.connect(db_file)




In [22]:
def query_with_bart_model_with_three_chucks(chunks, embeddings, query, top_k=3):
    """Answer ``query`` with BART using the ``top_k`` most similar chunks as context.

    The query is embedded with the module-level sentence-transformer ``model``
    and compared against ``embeddings`` by cosine similarity. The best-matching
    chunks, most similar first, are joined with single spaces into the prompt
    ``"Question: <query> Context: <chunk> <chunk> ..."`` and passed to the
    module-level ``tokenizer`` / ``bart_model``.

    Args:
        chunks: Sequence of chunk texts.
        embeddings: Embedding vectors parallel to ``chunks`` (for example the
            list of NumPy arrays returned by ``retrieve_embeddings_from_db``).
        query: The question to answer.
        top_k: Maximum number of chunks to use as context (default 3). When
            fewer chunks are available, all of them are used.

    Returns:
        The generated text, decoded with special tokens removed.

    Raises:
        ValueError: If ``top_k`` is less than 1 or ``chunks`` is empty.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if len(chunks) == 0:
        raise ValueError("chunks must not be empty")

    # Building a tensor from a list of NumPy arrays is very slow in PyTorch;
    # stack the list into one 2-D array before it is converted.
    if isinstance(embeddings, (list, tuple)) and all(
        isinstance(emb, np.ndarray) for emb in embeddings
    ):
        embeddings = np.stack(embeddings)

    query_embedding = model.encode(query)
    similarities = util.pytorch_cos_sim(query_embedding, embeddings)[0]

    # torch.topk returns the largest values already sorted in descending order;
    # clamping k keeps this working when there are fewer than top_k chunks.
    k = min(top_k, len(similarities))
    top_k_indices = torch.topk(similarities, k=k).indices.tolist()

    closest_chunks = [chunks[idx] for idx in top_k_indices]

    # Build the prompt from the selected chunks, most similar first.
    input_text = f"Question: {query} Context: {' '.join(closest_chunks)}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    # Follow the device the model actually lives on instead of re-checking CUDA.
    inputs = inputs.to(bart_model.device)

    summary_ids = bart_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [23]:
# Open the SQLite database file; the connection is kept in `conn` for the cells below.
conn = connect_to_database("final.db")


In [31]:
# Read the chunk texts and their decoded embeddings from the database into memory.
chunks, embeddings = retrieve_embeddings_from_db(conn)
# Last expression of the cell: shows how many chunks were loaded.
len(chunks)

23842

In [25]:
# Ask the question using only the single most similar chunk as context.
query_with_bart_model_with_one_chunk(chunks,embeddings,"who is the director of icar creda in 2022?")

'Joint Secretary (Finance), ICAR, Shri G. P. Sharma, visited ICAR-CRIDA, Hyderabad after assuming charge on September 17, 2022. Shri N. V. R. Murthy, Chief Finance & Accounts Officer, ICAR briefed about Shri P. Sharma.'

In [26]:
# Ask the same question using the three most similar chunks as context.
query_with_bart_model_with_three_chucks(chunks,embeddings,"who is the director of icar creda in 2022?")

'Joint Secretary (Finance), ICAR, Shri G. P. Sharma, visited ICAR-CRIDA, Hyderabad after assuming charge on September 17, 2022. Shri N. V. R. Murthy, Chief Finance & Accounts Officer, ICAR. Dr. M. Maheshwari, Head, Division of Crop Science proposed NRM Division, ICar, Dr. G. Ravindra Chary, Director (Acting) & Project Coordinator, AICRPDA, Y. G Shadakshari, Director of Research, UASB.'