In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
!pip install sentence-transformers emoji

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [3]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [4]:
!pip install evaluate jiwer rouge_score

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [5]:
from sentence_transformers import SentenceTransformer

m2 = SentenceTransformer('sentence-transformers/LaBSE')

In [7]:
import openai

openai.api_key = <openai-key>

### Dataset Cleaning and Preprocessing Steps

In [8]:
with open('dataset.txt', 'r') as f:
    data = f.read()
    
data = data.replace('\n\n','\n')

In [9]:
data = data.split('---')

for i in range(0, len(data)):
    if i==4:
        data[i] = data[i].replace('\n**', '\n###').replace('**','')
    elif i==3:
        data[i] = data[i].replace('**','')
    else:
        data[i] = data[i].replace('**','')

In [10]:
ques_ans = dict()
for i in range(0, len(data)):
    temp = data[i]
    temp = temp.split('\n###')
    
    for j in range(1, len(temp)):
        tp = temp[j].split('\n')
        ques_ans[tp[0]] = " ".join(tp[1:])

In [11]:
import pandas as pd

qa_dict = dict()
qa_dict['title'] = []
qa_dict['text'] = []

for key, value in ques_ans.items():
    qa_dict['title'].append(key)
    qa_dict['text'].append(value)
    
qa_df = pd.DataFrame.from_dict(qa_dict)

In [12]:
from datasets import Dataset

# You can load a Dataset object this way
dataset = Dataset.from_pandas(qa_df)

dataset

Dataset({
    features: ['title', 'text'],
    num_rows: 50
})

In [13]:
import re
import pandas as pd
import emoji
def get_re_compiler_list_words(words_list):
    compiled_str = r''
    for i, word in enumerate(words_list):
        if i == len(words_list) - 1:
            compiled_str = compiled_str + r'\b' + word + r'\b'
        else:
            compiled_str = compiled_str + r'\b' + word + r'\b | '
    r = re.compile(compiled_str, flags=re.I | re.X)

    return r


def get_compiler_removed_stopwords():
    stopword_list = ['follow', 'channel', 'news', 'source', 'breaking', 'tv', 'watch', 'video', 'support', 'subscribe',
                     'share', 'link', 'comment', 'download', 'free', 'post', 'click', 'online', 'tube', 'call', 'plz',
                     'donate', 'help', 'shared', 'pls', 'sms', 'likes', 'copy', 'following', 'retweet', 'website',
                     'comments', 'notification', 'updates', 'play', 'pay', 'msg', 'quotes', 'block', 'posts', 'rekoooo',
                     'sent', 'from', 'here', 'android']
    r = get_re_compiler_list_words(stopword_list)

    return r

stop_words_regex = get_compiler_removed_stopwords()
url_pattern = re.compile(r'(https?://[^\s]+)')
text_compile1 = re.compile("&#39;")
text_compile2 = re.compile('[-_[\]{}~\':;"’‘()–\n\r<>@&*+!?•°.,\\\/%=^$|#“”]+')
ptrn = re.compile('[^a-z ]')

def get_emoji_regexp():
    # Sort emoji by length to make sure multi-character emojis are
    # matched first
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
    return re.compile(pattern)

def remove_emoji(text):
    outp = get_emoji_regexp().sub(u'', str(text))
    return outp

def remove_mentions_and_hashtag(text):
    
    outp = " ".join(filter(lambda x: x[0] not in ['#', '@'], text.split()))
    return outp

def remove_unnecessary_text(example):
    text = example['title']
    outp = url_pattern.sub('', text)
    outp = stop_words_regex.sub('', outp)
    outp = remove_mentions_and_hashtag(remove_emoji(outp))
    outp = text_compile1.sub('', outp)
    outp = text_compile2.sub('', outp)
    
    example['clean_title'] = outp

    return example

In [14]:
import numpy as np
def sent_check2(example):
    
    output = np.zeros([768,], dtype=np.float32)
    try:
        sent_embed = m2.encode(example['clean_title'])
        output = sent_embed
    except:
        print("Error")
        
    example['embeddings'] = output
    
    return example

In [15]:
dataset = dataset.map(remove_unnecessary_text)
dataset = dataset.map(sent_check2)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [16]:
# adding FAISS Index for Retrieval via Embeddings
dataset.add_faiss_index(column='embeddings')

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'text', 'clean_title', 'embeddings'],
    num_rows: 50
})

In [17]:
# Language Detection Model from NLLB

import fasttext
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)



In [18]:
question = "How can an NRI apply for a new PAN card?"

question_embedding = m2.encode(question)
scores, retrieved_examples = dataset.get_nearest_examples('embeddings', question_embedding, k=10)

In [19]:
model.predict(question)[0][0].split('__')[-1]

'eng_Latn'

In [20]:
language = model.predict(question)[0][0].split('__')[-1]
updated_message = [
        {"role": "system", "content": "I want you to act as a question answering bot which uses the context mentioned and answer in a concise manner and doesn't make stuff up."
                                      "You will answer question based on the context - {}"
                                      "You will create content in {} language"},
        {"role": "user", "content": "Now I want you to answer this question {}."},
    ]

updated_message[0]['content'] = updated_message[0]['content'].format(retrieved_examples['text'], language)
updated_message[1]['content'] = updated_message[1]['content'].format(question)

completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=updated_message,
                                                      timeout=240, max_tokens=400, n=1, stop=None, temperature=0)

In [21]:
completion['choices'][0]['message']['content']

'To apply for a new PAN card as an NRI, follow these steps:\n1. Visit the ABC app.\n2. Navigate to Services > NRI Pan Card > Apply New PAN.\n3. Select the required form of PAN card and proceed with the payment.\n4. Our team will contact you to request the necessary documents, including your passport (any country) or OCI card, passport size photograph, and overseas address proof with zip code (such as Indian NRO/NRE account statement, overseas bank statement, or utility bill).'

In [22]:
from evaluate import load
wer = load("rouge")
wer_score = wer.compute(predictions=['To apply for a new PAN card as an NRI, follow these steps:\n1. Visit the ABC app.\n2. Navigate to Services > NRI Pan Card > Apply New PAN.\n3. Select the required form of PAN card and proceed with the payment.\n4. Our team will contact you to request the necessary documents, including your passport (any country) or OCI card, passport size photograph, and overseas address proof with zip code.\n5. Submit the requested documents to our team.\n6. Your PAN card application will be processed, and the card will be delivered to your overseas address, including your Canadian address if applicable.'], references=["""Here are the steps for *PAN CARD* processing. 

- Visit SBNRI app
- Navigate to Services > NRI Pan Card > Apply New PAN
- Select the required form of PAN card and proceed with the payment
- Our team will get in touch with you to ask for the following documents:
    - Passport(Any Country) / OCI Card
    - Passport Size Photograph
    - Overseas address proof with zip code (Supporting documents - Indian NRO/NRE Account statement or Overseas bank statement or Utility bill)"""])

In [23]:
wer_score

{'rouge1': 0.5909090909090909,
 'rouge2': 0.3908045977011495,
 'rougeL': 0.5568181818181819,
 'rougeLsum': 0.5568181818181819}

In [24]:
# Evaluating the generated text with Word Error Rate (used as for transcription) and ROUGE Score for Translation Tasks.
wer = load("wer")
rouge = load("rouge")

## Wrapping the Procedure in a function for Inference

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_output(example):
    
    question = example['Question']

    question_embedding = m2.encode(question)
    scores, retrieved_examples = dataset.get_nearest_examples('embeddings', question_embedding, k=10)
    
    language = model.predict(question)[0][0].split('__')[-1]
    updated_message = [
            {"role": "system", "content": "I want you to act as a question answering bot which uses the context mentioned and answer in a concise manner and doesn't make stuff up."
                                          "You will answer question based on the context - {}"
                                          "You will create content in {} language"},
            {"role": "user", "content": "Now I want you to answer this question {}."},
        ]

    updated_message[0]['content'] = updated_message[0]['content'].format(retrieved_examples['text'], language)
    updated_message[1]['content'] = updated_message[1]['content'].format(question)

    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=updated_message,
                                                          timeout=240, max_tokens=400, n=1, stop=None, temperature=0)
    example['retrieve_answer'] = completion['choices'][0]['message']['content']
    
    rg_score = rouge.compute(predictions=[example['retrieve_answer']], references=[example['Ideal Answer']])
    example['rouge1'] = rg_score['rouge1']
    example['rouge2'] = rg_score['rouge2']
    example['rougeL'] = rg_score['rougeL']
    example['rougeLsum'] = rg_score['rougeLsum']
    
    example['wer_score'] = wer.compute(predictions=[example['retrieve_answer']], references=[example['Ideal Answer']])
    
    gold_answer = m2.encode(example['Ideal Answer'])
    predicted_answer = m2.encode(example['retrieve_answer'])
    
    example['cosine_similarity'] = cosine_similarity([gold_answer, predicted_answer])[0][0]
    
    return example

In [26]:
from datasets import load_dataset

test_dict = load_dataset('csv', data_files='test_data.csv')



  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
test_dict

DatasetDict({
    train: Dataset({
        features: ['Question', 'Ideal Answer', 'retrieve_answer', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'wer_score', 'cosine_similarity'],
        num_rows: 34
    })
})

In [29]:
test_dict = test_dict.map(retrieve_output)

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [31]:
test_dict

DatasetDict({
    train: Dataset({
        features: ['Question', 'Ideal Answer', 'retrieve_answer', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum', 'wer_score', 'cosine_similarity'],
        num_rows: 34
    })
})

## Evaluation Metrics

### Average Cosine Similarity

In [32]:
# Higher Cosine Similarity shows better retrieved results based on semantics of the content.
np.average(test_dict['train']['cosine_similarity'])

1.0000000017530777

### Average WER Score

In [33]:
# Word by Word Matching of results is not good enough
np.average(test_dict['train']['wer_score'])

0.7302871304448888

### Average Rouge Scores

In [34]:
# ROUGE-N (N-gram) scoring - 1-gram
np.average(test_dict['train']['rouge1'])

0.6106952600891115

In [35]:
# ROUGE-N (N-gram) scoring - 2-gram
np.average(test_dict['train']['rouge2'])

0.4811727442217518

In [36]:
# ROUGE-L (Longest Common Subsequence) scoring - Sentence Level
np.average(test_dict['train']['rougeL'])

0.5594129935022545

In [37]:
# ROUGE-L (Longest Common Subsequence) scoring - Summary Level
np.average(test_dict['train']['rougeLsum'])

0.5823378441220609