# Simple Wikipedia based RAG Model

Scrapes Wikipedia for context used to tailor answers by a small LLM. Fit to run on CPUs/mobiles.

Built by Pratham Shah.

In [None]:
import stanza
import requests
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import Ollama

# setup stanza pipeline
stanza.download('en', verbose=False)
nlp = stanza.Pipeline('en', processors='tokenize,ner')


In [None]:
#using POS tagging to identify core title of sentnece
def get_focus(text):
    doc = nlp(text)
    entities = [ent.text for ent in doc.ents if ent.type in ('PERSON', 'ORG', 'LOC', 'MISC', 'EVENT', 'WORK_OF_ART')]
    return entities or [text]

#get relevant wikipedia content about that title
def search(query, limit=3):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': query,
        'format': 'json',
        'srlimit': limit
    }
    resp = requests.get(url, params=params).json()
    return [item['title'] for item in resp['query']['search']]

#get extracts from that title
def wikipedia_content(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'prop': 'extracts',
        'explaintext': True,
        'titles': title,
        'format': 'json'
    }
    resp = requests.get(url, params=params).json()
    pages = resp['query']['pages']
    page = next(iter(pages.values()))
    return page.get('extract', '')


In [None]:
#gets and splits wikipedia data into chunks
def get_docs_for_query(query):
    entities = get_focus(query)
    docs = []
    splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=300)
    for entity in entities:
        for title in search(entity):
            content = wikipedia_content(title)
            if content:
                for chunk in splitter.split_text(content):
                    docs.append(Document(page_content=chunk, metadata={"source": title}))
    return docs

#FAISS index for each chunk
def build_faiss_index(documents):
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_documents(documents, embeddings)


In [None]:
#fetches knowledge on each query to update chunks and FAISS
def update_vectorstore(chain, query):
    docs = get_docs_for_query(query)
    if docs:
        vectorstore = build_faiss_index(docs)
        chain.retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    return chain, docs


In [None]:
def input_q(query):
    #get docs from wikipedia
    docs = get_docs_for_query(query)
    
    if not docs:
        print("No documents found.")
        return

    # build FAISS index
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embeddings)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    
    # run model
    llm = Ollama(model="phi3")
    context_docs = retriever.invoke(query)
    context_text = "\n\n".join([doc.page_content for doc in context_docs])

    prompt = f"Answer this question based only on the context below:\n\nContext:\n{context_text}\n\nQuestion: {query}"
    answer = llm.invoke(prompt)

    # output
    print("\n=============== Answer:\n", answer)
    print("=============== Top Sources:")
    for doc in context_docs:
        print(f"- {doc.metadata['source']}")
    print(context_text)


In [12]:
input_q("what is the total land area of New York City?")

Created a chunk of size 3448, which is longer than the specified 2000
Created a chunk of size 2158, which is longer than the specified 2000
Created a chunk of size 2293, which is longer than the specified 2000
Created a chunk of size 2312, which is longer than the specified 2000
Created a chunk of size 2142, which is longer than the specified 2000
Created a chunk of size 3713, which is longer than the specified 2000
Created a chunk of size 2368, which is longer than the specified 2000
Created a chunk of size 2494, which is longer than the specified 2000
Created a chunk of size 2724, which is longer than the specified 2000
Created a chunk of size 2244, which is longer than the specified 2000
Created a chunk of size 2793, which is longer than the specified 2000
Created a chunk of size 2096, which is longer than the specified 2000
Created a chunk of size 2452, which is longer than the specified 2000
Created a chunk of size 2092, which is longer than the specified 2000
Created a chunk of s


 The borough-wide total land area of New York City amounts to approximately 302.6 square miles (784 km²). This includes all five boroughs combined, with each one coextensive with its respective county in terms of administrative boundaries within the city's municipal limits.
- New York City
- Brooklyn
- New York City
New York, often called New York City (NYC), is the most populous city in the United States. It is located at the southern tip of New York State on one of the world's largest natural harbors. The city comprises five boroughs, each coextensive with its respective county. The city is the geographical and demographic center of both the Northeast megalopolis and the New York metropolitan area, the largest metropolitan area in the United States by both population and urban area. New York is a global center of finance and commerce, culture, technology, entertainment and media, academics and scientific output, the arts and fashion, and, as home to the headquarters of the United Na