In [None]:
pip install sentence_transformers openai

In [None]:
pip install docx2txt

In [None]:
pip install pinecone-client

In [47]:
import os
from tqdm.auto import tqdm
from transformers import GPT2TokenizerFast
import openai
from sentence_transformers import SentenceTransformer, util
import docx2txt
import pinecone
os.environ['OPENAI_API_KEY'] = ""
os.environ['PINECONE_API_KEY'] = ""

In [48]:
#This is for embedding. In here, one LM model from huggingface used.


model = SentenceTransformer('all-MiniLM-L6-v2')


In [49]:
#Function to split long documents in to smaller parts
def split_text_into_chunks(plain_text, max_chars=2000):
    text_chunks = []
    current_chunk = ""
    for line in plain_text.split("\n"):
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
        else:
            text_chunks.append(current_chunk.strip())
            current_chunk = line + " "
    if current_chunk:
        text_chunks.append(current_chunk.strip())
    return text_chunks

In [50]:

input_text = docx2txt.process('/kaggle/input/query-doc/DataLaw.docx')
#print(my_text)

In [51]:
chunks = split_text_into_chunks(input_text, max_chars=2000)

In [52]:
len(chunks)

29

In [53]:
dim = 384

In [54]:

pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment="us-east1-gcp") #Todo: Initialization of vector database module

In [55]:
if "legal-data" not in pinecone.list_indexes():
    pinecone.create_index("legal-data", dimension=dim, metric="cosine")

In [56]:
active_indexes = pinecone.list_indexes()

In [57]:
active_indexes

['legal-data']

In [58]:
index = pinecone.Index('legal-data')

In [59]:
def addData(corpusData):
    id  = index.describe_index_stats()['total_vector_count']
    for i in tqdm(range(len(corpusData))):
        chunk=corpusData[i]
        chunkInfo=(str(id+i),
                model.encode(chunk).tolist(), #We are using the model to encode the original chunk of text.
                {'context': chunk}) #In metadata we are storing the original text here as context. 
        index.upsert(vectors=[chunkInfo])

# Insert data to pinecone index

In [60]:
addData(chunks)

  0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [61]:
#This function is responsible for matching the input string with alread existing data on vector database.

def find_match(query,k):
    query_em = model.encode(query).tolist()
    result = index.query(query_em, top_k=k, includeMetadata=True)
    
    return [result['matches'][i]['metadata']['context'] for i in range(k)]

In [62]:
openai.api_key = os.environ.get('OPENAI_API_KEY')
COMPLETIONS_MODEL = "text-davinci-003"

In [63]:
MAX_SECTION_LEN = 500
SEPARATOR = "\n* "

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
separator_len = len(tokenizer.tokenize(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [64]:
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 300,
    "model": COMPLETIONS_MODEL,
}

In [65]:
def create_prompt(context,query):
    #Todo: Should be generated with the context/contexts we find by doing semantaic search
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    return header + "".join(context) + "\n\n Q: " + query + "\n A:"
    
  

In [66]:
def generate_answer(prompt):
    #Todo: Pass the generated prompt and pass it to gpt-3 to get answers.
    response = openai.Completion.create(
                prompt=prompt,
                **COMPLETIONS_API_PARAMS
            )

    return response["choices"][0]["text"].strip(" \n")

In [67]:
def user_query(query):
    #Todo: Make all the things together.
    context = find_match(query, 10)
    generated_prompt = create_prompt(context, query)
    answers = generate_answer(generated_prompt)
    return answers


In [68]:
user_query("how does the land act help me")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'The Land Act helps you by providing regulations for the management of agricultural land in spatial planning procedures, as well as rules for the payment of land use conversion compensation when building on agricultural land. It also provides conditions and criteria that must be met by investors in order to build on agricultural land, and rules for the use of auxiliary agriculture and forest facilities, beehives, and sheds.'