In [None]:
pip install sentence_transformers openai

In [None]:
pip install docx2txt

In [None]:
pip install pinecone-client

In [72]:
import os
from tqdm.auto import tqdm
from transformers import GPT2TokenizerFast
import openai
from sentence_transformers import SentenceTransformer, util
import docx2txt
import pinecone
# Keep any credentials that are already present in the environment (e.g. from a
# secrets manager) instead of overwriting them with a blank placeholder.
# The empty string is only used as a fallback so later os.environ.get() calls
# still find the keys; do not paste real keys into the notebook.
os.environ.setdefault('OPENAI_API_KEY', "")
os.environ.setdefault('PINECONE_API_KEY', "")

In [74]:
# Function to split long documents into smaller parts
def split_text_into_chunks(plain_text, max_chars=2000):
    """Greedily pack the lines of ``plain_text`` into chunks of roughly ``max_chars``.

    The text is split on newlines; consecutive lines are joined with a single
    space until adding the next line (plus its joining space) would push the
    buffer past ``max_chars``, at which point the buffer is emitted and a new
    one is started with that line.

    Args:
        plain_text: The document text to split.
        max_chars: Character budget per chunk. A single line longer than the
            budget is never cut, so such a chunk can exceed ``max_chars``.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings. Empty or
        whitespace-only input yields an empty list.
    """
    text_chunks = []
    current_chunk = ""

    def _flush(buffer):
        # An empty buffer occurs when the very first line already exceeds the
        # budget, or when the input holds only blank lines. Emitting "" would
        # produce a useless chunk that downstream embedding calls would reject.
        cleaned = buffer.strip()
        if cleaned:
            text_chunks.append(cleaned)

    for line in plain_text.split("\n"):
        # +1 accounts for the space appended after every line.
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
        else:
            _flush(current_chunk)
            current_chunk = line + " "
    _flush(current_chunk)
    return text_chunks

In [75]:
# Extract the text of the source .docx document (path under /kaggle/input).
input_text = docx2txt.process('/kaggle/input/query-doc/DataLaw.docx')
# print(input_text)  # uncomment to inspect the extracted text

In [76]:
chunks = split_text_into_chunks(input_text, max_chars=2000)

In [77]:
len(chunks)

29

In [78]:
# Vector dimension passed to pinecone.create_index below; it has to match the
# length of the embeddings stored in the index (ada model, per the heading below).
dim = 1536

# Initialize the Pinecone index with the embedding dimension of the ada model

In [79]:

# Configure the Pinecone client with the API key read from the environment and
# a fixed environment name.
pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment="us-east1-gcp") #Todo: Initialization of vector database module

# Create index if it wasn't already created

In [80]:
# Create the "legal-data" index (cosine metric, `dim` dimensions) only when it
# is not already listed, so re-running this cell does not try to create it twice.
if "legal-data" not in pinecone.list_indexes():
    pinecone.create_index("legal-data", dimension=dim, metric="cosine")

In [81]:
active_indexes = pinecone.list_indexes()

In [82]:
active_indexes

['legal-data']

In [83]:
index = pinecone.Index('legal-data')

# Define embedding using OpenAI text embedding ada 002

In [84]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    The text is submitted as a one-element batch and the embedding of that
    single item is returned.
    """
    single_line = " ".join(text.split("\n"))
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']

In [87]:
def addData(corpusData):
    """Embed every chunk in ``corpusData`` and upsert it into the Pinecone index.

    Vector IDs are consecutive integers (as strings) starting at the index's
    current ``total_vector_count``. Each chunk is embedded with
    ``get_embedding`` and upserted on its own, with the original text stored
    in the vector's metadata under the ``'context'`` key.
    """
    first_free_id = index.describe_index_stats()['total_vector_count']
    for offset, chunk in enumerate(tqdm(corpusData)):
        record = (
            str(first_free_id + offset),
            get_embedding(chunk),   # embedding of the original chunk text
            {'context': chunk},     # original text kept as metadata
        )
        index.upsert(vectors=[record])

# Insert data into the Pinecone index

In [88]:
# Note: addData numbers new vectors starting at the index's current
# total_vector_count, so re-running this cell inserts the same chunks again
# under new IDs.
addData(chunks)

  0%|          | 0/29 [00:00<?, ?it/s]

In [97]:
# Match the input string against the data already stored in the vector database.

def find_match(query,k):
    """Return the stored context texts most similar to ``query``.

    Args:
        query: Natural-language query string; it is embedded with
            ``get_embedding`` before the lookup.
        k: Maximum number of matches requested from the index (``top_k``).

    Returns:
        A list with the ``'context'`` metadata of every returned match, in the
        order the index returned them. The list holds fewer than ``k`` items
        when the index returns fewer matches.
    """
    query_em = get_embedding(query)
    result = index.query(query_em, top_k=k, includeMetadata=True)

    # Iterate over what was actually returned rather than range(k): indexing
    # by position raised IndexError whenever fewer than k matches came back.
    return [match['metadata']['context'] for match in result['matches']]

# Define OpenAI model params and specifications

In [98]:
# Hand the API key from the environment to the openai client.
openai.api_key = os.environ.get('OPENAI_API_KEY')
# Model name placed into COMPLETIONS_API_PARAMS for the completion request.
COMPLETIONS_MODEL = "text-davinci-003"

In [99]:
# NOTE(review): MAX_SECTION_LEN is not referenced by any other cell visible
# here — confirm whether a cap on the context length was intended.
MAX_SECTION_LEN = 500
# Delimiter string, presumably meant to introduce each context section in the
# prompt — verify against create_prompt.
SEPARATOR = "\n* "

# GPT-2 tokenizer, used in this cell only to measure the separator in tokens.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

# Last expression of the cell: displays the measured token count.
f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [100]:
# Keyword arguments that generate_answer forwards to openai.Completion.create.
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    # Upper bound on the length of the generated answer, in tokens.
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

# Prompt engineering to reduce hallucinations by injecting context to queries

In [105]:
def create_prompt(context,query):
    """Build the completion prompt from retrieved context sections and the question.

    Args:
        context: Iterable of context strings (e.g. the list returned by
            ``find_match``). A single string is treated as one section.
        query: The user's question.

    Returns:
        The prompt text: instruction header, then every context section
        introduced by ``SEPARATOR``, then the question and an open ``A:`` line.
    """
    header = """Using the provided context, answer the question as truthfully possible and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    # A bare string would otherwise be iterated character by character.
    if isinstance(context, str):
        context = [context]
    # Prefix each section with the separator so that neighbouring chunks are
    # not glued together ("...end of chunk.Start of next...") in the prompt.
    sections = "".join(SEPARATOR + section for section in context)
    return header + sections + "\n\n Q: " + query + "\n A:"
    
  

In [106]:
def generate_answer(prompt):
    """Send ``prompt`` to the completion endpoint and return the answer text.

    The request uses the settings in ``COMPLETIONS_API_PARAMS``. The text of
    the first returned choice is stripped of leading and trailing spaces and
    newlines before it is returned.
    """
    completion = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    first_choice = completion["choices"][0]
    return first_choice["text"].strip(" \n")

In [107]:
def user_query(query, k=10):
    """Answer ``query`` using context retrieved from the vector index.

    Ties the pipeline together: retrieve the closest stored chunks, build the
    prompt from them, and ask the completion model.

    Args:
        query: The user's question.
        k: Number of context chunks to request from ``find_match``.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The answer text produced by ``generate_answer``.
    """
    context = find_match(query, k)
    generated_prompt = create_prompt(context, query)
    return generate_answer(generated_prompt)


In [108]:
user_query("how does the land act help me")

'The Land Act helps you by regulating the protection and management of agricultural land by laying down its classification, use and cultivation, agricultural land transactions and lease arrangements, agricultural operations and common pasture. It also helps you by providing funds for drawing up and implementing agricultural land policy measures, and by allowing you to carry out beekeeping, hunting and the recreational gathering of fruits of wild plants, herbaceous wild plants, mushrooms and wild animals on non-arable agricultural land owned or leased by you or otherwise allotted to you.'