In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
!pip install langchain llama-index

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [3]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [4]:
import os
import openai
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

os.environ['OPENAI_API_KEY'] = <openai-key>

In [5]:
from sentence_transformers import SentenceTransformer

m2 = SentenceTransformer('sentence-transformers/LaBSE')

In [6]:
# Language Detection Model from NLLB

import fasttext
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)



In [7]:
with open('dataset.txt', 'r') as f:
    data = f.read()
    
data = data.replace('\n\n','\n')

In [8]:
data = data.split('---')

for i in range(0, len(data)):
    if i==4:
        data[i] = data[i].replace('\n**', '\n###').replace('**','')
    elif i==3:
        data[i] = data[i].replace('**','')
    else:
        data[i] = data[i].replace('**','')

In [9]:
ques_ans = dict()
for i in range(0, len(data)):
    temp = data[i]
    temp = temp.split('\n###')
    
    for j in range(1, len(temp)):
        tp = temp[j].split('\n')
        ques_ans[tp[0]] = " ".join(tp[1:])

In [10]:
content = str()
for key, value in ques_ans.items():
    tmp = key + ' ' + value
    content = content + tmp + "\n"
    
print(content)

 What is Pan card? The PAN card is a unique ten-digit alphanumeric identification number that is issued by the Income Tax Department of India to track the tax-related transactions of individuals and entities. The PAN card is mandatory for any financial transaction in India, including opening a bank account, buying or selling property, and filing income tax returns.
 Who needs a Pan card? All individuals/non-individuals (including foreign citizens/entities) earning taxable income in India must have a PAN card.
 Types of PAN cards In India, two types of PAN cards are available: e-PAN card and physical PAN card. 1. e-PAN card: An e-PAN card is a digitally-signed PAN card issued in electronic format. It contains the same PAN details as a physical PAN card but is available in a digital format. It can be downloaded online and used as a valid identification document for various purposes. The e-PAN card is usually issued in a PDF format. 2. Physical PAN card: A physical PAN card is a laminated

In [11]:
text_splitter = CharacterTextSplitter(separator='\n', chunk_size=300, chunk_overlap=128, length_function=len)

chunks = text_splitter.split_text(content)

Created a chunk of size 367, which is longer than the specified 300
Created a chunk of size 877, which is longer than the specified 300
Created a chunk of size 529, which is longer than the specified 300
Created a chunk of size 820, which is longer than the specified 300
Created a chunk of size 510, which is longer than the specified 300
Created a chunk of size 397, which is longer than the specified 300
Created a chunk of size 357, which is longer than the specified 300
Created a chunk of size 643, which is longer than the specified 300
Created a chunk of size 361, which is longer than the specified 300
Created a chunk of size 811, which is longer than the specified 300
Created a chunk of size 363, which is longer than the specified 300
Created a chunk of size 491, which is longer than the specified 300
Created a chunk of size 302, which is longer than the specified 300
Created a chunk of size 302, which is longer than the specified 300
Created a chunk of size 955, which is longer tha

In [12]:
embeddings = OpenAIEmbeddings()

vectorStore = FAISS.from_texts(chunks, embeddings)
vectorStore.save_local("faiss_doc_idx")

In [13]:
docs = vectorStore.similarity_search("WHow long does it usually take to receive the PAN card after applying?")

In [14]:
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from langchain.chains.question_answering import load_qa_chain

llm = OpenAI(temperature=0)
chain = load_qa_chain(llm, chain_type="refine")

with get_openai_callback() as cb:
    response = chain.run(input_documents=docs, question=chain)

In [15]:
response

"\n\nIf you don't have an Aadhaar card, the PAN card will be issued in 3 weeks after payment is made to ABC. Updation/correction in the PAN card can be done by generating the reissue request for the Updation/ Correction of the PAN CARD through the ABC app. Navigate to Services > NRI PAN Card > PAN Card Correction and request reissue the required PAN card and make the payment. Our team will reachout to you for the required documents. The duration to complete the correction process for your PAN card can vary, but it generally takes around 2-3 weeks. For NRIs, the PAN Aadhaar linking process takes upto 6 to 7 days. Form 49AA is the application form for the allotment of Permanent Account Number for Foreign residents and entities incorporated outside India. Do you want to start the process here instead? Click the button below."

In [16]:
from evaluate import load

# Evaluating the generated text with Word Error Rate (used as for transcription) and ROUGE Score for Translation Tasks.
wer = load("wer")
rouge = load("rouge")

## Wrapping the Procedure in a function for Inference

In [17]:
from datasets import load_dataset

test_dict = load_dataset('csv', data_files='test_data.csv')

Found cached dataset csv (/home/ec2-user/.cache/huggingface/datasets/csv/default-9a0891180e715608/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
test_dict

DatasetDict({
    train: Dataset({
        features: ['Question', 'Ideal Answer'],
        num_rows: 34
    })
})

In [19]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_output(example):
    
    question = example['Question']
    
    language = model.predict(question)[0][0].split('__')[-1]
    template = """I want you to act as a question answering bot which uses the context mentioned and answer in a concise manner and doesn't make stuff up.
            You will answer question based on the context - {context}.
            You will create content in""" + str(language) + """language.
            Question: {question}
            Answer:
            """
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
    qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorStore.as_retriever(), chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

    result = qa_chain({"query": question})
    example['retrieve_answer'] = result['result']
                                           
    rg_score = rouge.compute(predictions=[example['retrieve_answer']], references=[example['Ideal Answer']])
    example['rouge1'] = rg_score['rouge1']
    example['rouge2'] = rg_score['rouge2']
    example['rougeL'] = rg_score['rougeL']
    example['rougeLsum'] = rg_score['rougeLsum']
    
    example['wer_score'] = wer.compute(predictions=[example['retrieve_answer']], references=[example['Ideal Answer']])
    
    gold_answer = m2.encode(example['Ideal Answer'])
    predicted_answer = m2.encode(example['retrieve_answer'])
    
    example['cosine_similarity'] = cosine_similarity([gold_answer, predicted_answer])[0][0]
    
    return example

In [21]:
test_dict = test_dict.map(retrieve_output)



Map:   0%|          | 0/34 [00:00<?, ? examples/s]



In [22]:
test_dict

DatasetDict({
    train: Dataset({
        features: ['Question', 'Ideal Answer', 'retrieve_answer', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'wer_score', 'cosine_similarity'],
        num_rows: 34
    })
})

## Evaluation Metrics

### Average Cosine Similarity

In [23]:
import numpy as np

# Higher Cosine Similarity shows better retrieved results based on semantics of the content.
np.average(test_dict['train']['cosine_similarity'])

1.0000000017530777

### Average WER Score

In [24]:
# Word by Word Matching of results is not good enough
np.average(test_dict['train']['wer_score'])

0.6036692157657393

### Average Rouge Scores

In [25]:
# ROUGE-N (N-gram) scoring - 1-gram
np.average(test_dict['train']['rouge1'])

0.6546129584515485

In [26]:
# ROUGE-N (N-gram) scoring - 2-gram
np.average(test_dict['train']['rouge2'])

0.529883599668883

In [27]:
# ROUGE-L (Longest Common Subsequence) scoring - Sentence Level
np.average(test_dict['train']['rougeL'])

0.6035913122179466

In [28]:
# ROUGE-L (Longest Common Subsequence) scoring - Summary Level
np.average(test_dict['train']['rougeLsum'])

0.6216551599635091

## Gradio Chatbot

In [32]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage
import openai
import gradio as gr

def predict(message, history):
    history_langchain_format = []
        
    for human, ai in history:
        history_langchain_format.append(HumanMessage(content=human))
        history_langchain_format.append(AIMessage(content=ai))
    
    #history_langchain_format.append(HumanMessage(content=message))
    
    language = model.predict(message)[0][0].split('__')[-1]
    template = """I want you to act as a question answering bot which uses the context mentioned and answer in a concise manner and doesn't make stuff up.
            You will answer question based on the context - {context}.
            You will create content in""" + str(language) + """language.
            Question: {question}
            Answer:
            """
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
    qa_chain = RetrievalQA.from_chain_type(llm,retriever=vectorStore.as_retriever(), chain_type_kwargs={"prompt": QA_CHAIN_PROMPT})

    result = qa_chain({"query": message})
    
    history_langchain_format.append(HumanMessage(content=message))
    history_langchain_format.append(AIMessage(content=result['result']))
    
    return result['result']

gr.ChatInterface(predict,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Ask me a question related to PAN Services", container=False, scale=7),
    title="DocumentQABot",
    theme="soft",
    examples=["What is the cost/fees of a PAN card?", "How long does it usually take to receive the PAN card after applying?"],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",).launch(share=True) 

Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://f19daabe3133824e73.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


