In [1]:
import wikipedia
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [2]:
# Silence every Python warning for the rest of the session. This keeps cell
# output tidy but also hides deprecation notices, so narrow or remove the
# filter when debugging.
import warnings
warnings.filterwarnings("ignore")

In [3]:
def get_contents(topic):
    """Fetch the text content of the Wikipedia page for ``topic``.

    Args:
        topic: Title (or search phrase) of the page to look up.

    Returns:
        The page content as a string, or ``None`` when ``topic`` is empty or
        blank, when no page matches, or when the title is ambiguous (the
        candidate titles are printed in that case).
    """
    # An empty or whitespace-only topic can never resolve to a page; return
    # early instead of sending a request that cannot succeed.
    if not topic or not topic.strip():
        return None
    try:
        return wikipedia.page(topic).content
    except wikipedia.exceptions.PageError:
        # No page matches the requested title.
        return None
    except wikipedia.exceptions.DisambiguationError as e:
        # The title maps to several pages; show the candidates to the user.
        print(f'Topic provided is ambiguous, please select other specific options :  {e.options}')
        return None

# Ask the user which topic to research and fetch the matching article text.
topic = input('Enter a topic to research')

document = get_contents(topic)

if not document:
    print('No result to publish')
    # Calling exit() inside a notebook shuts the kernel down. Raising
    # SystemExit instead halts this cell (and any "Run All") while keeping
    # the session alive; when run as a plain script it still exits cleanly.
    raise SystemExit

Enter a topic to research mango plant


In [4]:
# Load the tokenizer for the same checkpoint that is used for embedding, so
# chunk sizes are measured in that model's tokens.

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")

def create_chunks(document, chunk_size= 256, chunk_overlap=20):
    """Split ``document`` into overlapping chunks measured in tokens.

    The text is tokenized with the module-level ``tokenizer``; consecutive
    windows of ``chunk_size`` tokens are converted back to strings, each
    window starting ``chunk_size - chunk_overlap`` tokens after the previous.

    Args:
        document: Text to split.
        chunk_size: Maximum number of tokens per chunk (must be positive).
        chunk_overlap: Number of tokens shared by consecutive chunks (must be
            smaller than ``chunk_size``).

    Returns:
        List of chunk strings; empty when the document yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than "
            f"chunk_size ({chunk_size})"
        )

    tokens = tokenizer.tokenize(document)
    total = len(tokens)
    stride = chunk_size - chunk_overlap  # distance between window starts

    chunks = []
    for start in range(0, total, stride):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
        # Stop once a window reaches the end of the token list so the tail is
        # not emitted again as a shorter duplicate chunk.
        if start + chunk_size >= total:
            break
    return chunks

# Chunk the fetched article and report how many chunks were produced.
# NOTE: the variable is spelled `chuncks`; later cells refer to it by that name.
chuncks= create_chunks(document)
print(f'Number of chunks created {len(chuncks)}')

Token indices sequence length is longer than the specified maximum sequence length for this model (4366 > 512). Running this sequence through the model will result in indexing errors


Number of chunks created 19


In [5]:
# Load the sentence-embedding model and embed every chunk; `embeddings` ends
# up with one row per chunk.
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings = embedding_model.encode(chuncks)

In [6]:
# Quick look at the raw embedding matrix (large arrays are abbreviated when printed).
print(embeddings)

[[ 0.04770076  0.01964167 -0.01560034 ...  0.00756312  0.02471893
  -0.02613712]
 [ 0.03630906  0.01245931 -0.01416473 ...  0.00426883  0.01575447
  -0.04536134]
 [ 0.03126928  0.01449801 -0.01261422 ...  0.02213684  0.01304369
  -0.03824551]
 ...
 [ 0.00998881  0.02058469 -0.02952743 ...  0.05728513 -0.02873346
  -0.05945873]
 [ 0.05490751  0.00127059 -0.0093817  ... -0.01307335 -0.00080198
  -0.05304766]
 [ 0.02953332  0.03127516 -0.01317424 ...  0.02744339  0.01183814
  -0.03318902]]


In [7]:
# Shape is (number of chunks, embedding dimension).
embeddings.shape

(19, 768)

In [None]:
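# embeddings.shape is (number of chunks, embedding dimension), e.g. (47, 768)
# number of chunks = 47 (varies with the topic; the run shown above produced 19)
# length of each chunk's embedding vector = 768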

In [8]:
# Build a flat L2-distance FAISS index over the chunk embeddings.
# `num_vec` is the length of each embedding vector (the second axis of
# `embeddings`), not a count of vectors.
num_vec= embeddings.shape[1]
index = faiss.IndexFlatL2(num_vec)
index.add(np.array(embeddings))

In [9]:
# Embed the user's question with the same model used for the chunks.
query = input('Ask a question based on topic')
query_embedding = embedding_model.encode([query])

# Retrieve up to 3 nearest chunks, but never ask for more neighbours than the
# index holds: FAISS pads missing results with the label -1, which would
# silently select the last chunk when used to index a Python list.
k = min(3, index.ntotal)
distances, indices = index.search(np.array(query_embedding),k)

Ask a question based on topic different name for mango


In [10]:
# L2-based distances from the query to its k nearest chunks (lower = closer).
distances

array([[0.47468457, 0.5476579 , 0.6402631 ]], dtype=float32)

In [11]:
# Positions in `chuncks` of the k nearest chunks, one row per query.
indices

array([[ 0, 17,  7]], dtype=int64)

In [12]:
# Row for the single query: the chunk positions used for retrieval.
indices[0]

array([ 0, 17,  7], dtype=int64)

In [13]:
# Collect the text of the retrieved chunks. FAISS reports -1 for result slots
# it could not fill (fewer than k vectors in the index); skip those so a
# negative label never wraps around to the last chunk.
relevant_chunks = [chuncks[i] for i in indices[0] if i >= 0]

In [14]:
# Show each retrieved chunk beneath a '---' divider line.
for ch in relevant_chunks:
    print('---', ch, sep='\n')

---
a mango is an edible stone fruit produced by the tropical tree mangifera indica. it originated from the region between northwestern myanmar, bangladesh, and northeastern india. m. indica has been cultivated in south and southeast asia since ancient times resulting in two types of modern mango cultivars : the " indian type " and the " southeast asian type ". other species in the genus mangifera also produce edible fruits that are also called " mangoes ", the majority of which are found in the malesian ecoregion. worldwide, there are several hundred cultivars of mango. depending on the cultivar, mango fruit varies in size, shape, sweetness, skin color, and flesh color, which may be pale yellow, gold, green, or orange. mango is the national fruit of india, pakistan and the philippines, while the mango tree is the national tree of bangladesh. = = etymology = = the english word mango ( plural mangoes or mangos ) originated in the 16th century from the portuguese word manga, from the mal

In [15]:
# Load the extractive question-answering model and wrap it in a pipeline.
ans_model_name = "deepset/roberta-base-squad2"
qa_tokenizer = AutoTokenizer.from_pretrained(ans_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(ans_model_name)
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)


# Merge the retrieved chunks into a single context string. Join with a space
# so the last word of one chunk is not fused onto the first word of the next.

context = ' '.join(relevant_chunks)
answer = qa_pipeline(question=query, context=context)
print(answer)

{'score': 0.010328400880098343, 'start': 1116, 'end': 1131, 'answer': 'malayalam manna'}


In [16]:
# Pull just the answer text out of the pipeline's result dict.
answer['answer']

'malayalam manna'