# Get content

In [16]:
import wikipedia
# Fetch the plain-text body of the Wikipedia article; the cleaning step
# below consumes `content`.
content = wikipedia.page("Natural_language_processing").content
# Preview only the opening of the article: echoing the whole string would
# flood the notebook with the full page text and bury the narrative.
content[:500]



# Cleaning/Preprocessing

In [17]:
import re

def clean_text(text):
    """Strip bracketed markers and normalise whitespace in ``text``.

    Steps, in order:
      1. Remove every ``[...]`` span (shortest match; a span cannot cross a
         newline because ``.`` does not match line breaks here).
      2. Collapse each run of whitespace into a single space.
      3. Trim leading and trailing whitespace.

    Returns the cleaned string.
    """
    without_brackets = re.compile(r"\[.*?\]").sub("", text)
    single_spaced = re.compile(r"\s+").sub(" ", without_brackets)
    return single_spaced.strip()

# Clean the fetched article text; later cells chunk and embed `cleaned_data`.
cleaned_data = clean_text(content)
# Print just the first 500 characters as a quick sanity check of the cleaning.
print("Cleaned text:\n", cleaned_data[:500])


Cleaned text:
 Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics. Typically data is collected in text corpora, using either rule-based, statistical or neural-based approaches in machine learning and deep learn


In [18]:
# Total number of characters in the cleaned article text.
len(cleaned_data)

30954

# Knowledge base using sentence embeddings

In [15]:
from sentence_transformers import SentenceTransformer
import faiss # Facebook AI Similarity Seacrh
import numpy as np

CHUNK_SIZE = 500  # characters per chunk (fixed-width, non-overlapping windows)

# Sentence encoder used to embed the article chunks and, later, the questions.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Slice the cleaned text into consecutive CHUNK_SIZE-character windows.
chunks = [
    cleaned_data[start:start + CHUNK_SIZE]
    for start in range(0, len(cleaned_data), CHUNK_SIZE)
]

# One embedding per chunk: an (N, D) array where N = number of chunks and
# D = dimensionality of each embedding vector.
embeddings = model.encode(chunks)

dimension = embeddings.shape[1]  # extract D
# Flat FAISS index that ranks vectors by L2 (Euclidean) distance; a flat index
# stores the vectors as-is and compares the query against all of them.
index = faiss.IndexFlatL2(dimension)
# FAISS only accepts float32 input, so coerce explicitly rather than relying
# on the encoder's default output dtype.
index.add(np.asarray(embeddings, dtype=np.float32))
print(f"Indexed {len(chunks)} chunks.")


Indexed 62 chunks.


# Basic version

In [19]:
from transformers import pipeline
import torch

# Extractive question-answering pipeline backed by the
# `deepset/roberta-base-squad2` checkpoint.
# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise;
# a hard-coded device=0 makes this cell fail on CPU-only machines.
qa_device = 0 if torch.cuda.is_available() else -1
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", device=qa_device)

In [21]:
def find_closest_chunk(question):
    """Return the single indexed chunk nearest to ``question`` in embedding space.

    The question is embedded with the module-level ``model``, the module-level
    FAISS ``index`` is searched for its one nearest neighbour, and the matching
    entry of the module-level ``chunks`` list is returned.
    """
    query_vector = np.array(model.encode([question]))
    _distances, neighbour_ids = index.search(query_vector, 1)
    # Search results are 2-D (one row per query): take the only hit of the
    # only query.
    best_id = neighbour_ids[0][0]
    return chunks[best_id]

def answer_question(question):
    """Answer ``question`` extractively from its closest article chunk.

    Retrieves one chunk via ``find_closest_chunk``, runs ``qa_pipeline`` on it,
    and returns the value stored under the ``'answer'`` key of the result.
    """
    prediction = qa_pipeline(
        question=question,
        context=find_closest_chunk(question),
    )
    return prediction['answer']

# Smoke test of the retrieve-then-extract flow with one sample question.
question = "what is natural language processing?"
answer = answer_question(question)
# Show the question and the extracted answer together.
print(f"Question: {question}\nAnswer: {answer}")


Question: what is natural language processing?
Answer: providing computers with the ability to process data encoded in natural language


# Proper RAG

## Query processing

In [22]:
TOP_K = 5  # number of passages retrieved as context for the generator

user_query = "What is Natural Language Processing?"

# Embed the query; `model` must be the SentenceTransformer that embedded the
# chunks so that query and chunk vectors live in the same space.
query_embedding = model.encode([user_query])
D, I = index.search(query_embedding, k=TOP_K)  # D: distances, I: chunk positions

# FAISS pads the result with -1 when the index holds fewer than k vectors.
# Skip those entries: in Python, chunks[-1] would silently return the last
# chunk instead of signalling "no result".
relevant_passages = [chunks[i] for i in I[0] if i >= 0]

## Contextual generation

In [23]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-base')
# Keep the generator under its own name: rebinding `model` here would replace
# the SentenceTransformer that the retrieval code calls `.encode()` on, so
# re-running any retrieval cell afterwards would fail.
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Prompt: the question followed by the retrieved passages joined into a single
# context string.
input_text = f"question: {user_query} context: {' '.join(relevant_passages)}"
# Several 500-character passages can exceed the 512-token input length that
# t5-base is configured for, so truncate explicitly instead of overflowing.
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
# generate() otherwise falls back to a very short default output length, which
# can cut the answer off mid-sentence.
outputs = t5_model.generate(**inputs, max_new_tokens=64)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:  54%|#####4    | 482M/892M [00:00<?, ?B/s]

In [None]:
# Display the answer string decoded from the T5 output.
answer