In [None]:
#!pip install -q langchain langchain-community chromadb sentence-transformers openai pypdf

!pip install -q --upgrade langchain langchain-community chromadb sentence-transformers openai "requests<=2.32.4"

In [None]:
import os
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
#from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

In [None]:
from google.colab import files
uploaded = files.upload()

Saving population_state.csv to population_state (8).csv


In [None]:
#!pip install -q langchain langchain-community chromadb sentence-transformers openai


In [None]:
import pandas as pd

csv_path = "population_state.csv"
df = pd.read_csv(csv_path)

df.head()


Unnamed: 0,state,date,sex,age,ethnicity,population
0,Johor,1/1/2020,female,0-4,bumi_malay,97.9
1,Johor,1/1/2020,female,0-4,chinese,42.3
2,Johor,1/1/2020,female,0-4,indian,9.5
3,Johor,1/1/2020,female,0-4,other_citizen,0.6
4,Johor,1/1/2020,female,0-4,other_noncitizen,0.9


##Create a “text” column from your dataset

In [None]:
df["text"] = df.apply(
    lambda r: f"In {r['state']} on {r['date']}, the {r['sex']} population aged {r['age']} "
              f"with ethnicity {r['ethnicity']} is {r['population']}.",
    axis=1
)


In [None]:
df

Unnamed: 0,state,date,sex,age,ethnicity,population,text
0,Johor,1/1/2020,female,0-4,bumi_malay,97.9,"In Johor on 1/1/2020, the female population ag..."
1,Johor,1/1/2020,female,0-4,chinese,42.3,"In Johor on 1/1/2020, the female population ag..."
2,Johor,1/1/2020,female,0-4,indian,9.5,"In Johor on 1/1/2020, the female population ag..."
3,Johor,1/1/2020,female,0-4,other_citizen,0.6,"In Johor on 1/1/2020, the female population ag..."
4,Johor,1/1/2020,female,0-4,other_noncitizen,0.9,"In Johor on 1/1/2020, the female population ag..."
...,...,...,...,...,...,...,...
15355,W.P. Putrajaya,1/1/2025,male,85+,bumi_malay,0.0,"In W.P. Putrajaya on 1/1/2025, the male popula..."
15356,W.P. Putrajaya,1/1/2025,male,85+,chinese,0.0,"In W.P. Putrajaya on 1/1/2025, the male popula..."
15357,W.P. Putrajaya,1/1/2025,male,85+,indian,0.0,"In W.P. Putrajaya on 1/1/2025, the male popula..."
15358,W.P. Putrajaya,1/1/2025,male,85+,other_citizen,0.0,"In W.P. Putrajaya on 1/1/2025, the male popula..."


##to apply the LangChain Document loader

In [None]:
from langchain.schema import Document

TEXT_COL = "text"

df = df.dropna(subset=[TEXT_COL])

documents = []
for _, row in df.iterrows():
    content = row[TEXT_COL]
    metadata = {
        "state": row["state"],
        "date": row["date"],
        "sex": row["sex"],
        "age": row["age"],
        "ethnicity": row["ethnicity"],
        "population": row["population"]
    }
    documents.append(Document(page_content=content, metadata=metadata))

len(documents)


15360

##Split into chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len
)

docs = text_splitter.split_documents(documents)
print(f"Created {len(docs)} chunks")


Created 16328 chunks


##Create embeddings + Chroma vector store

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory="/content/chroma_myeg_csv"
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [None]:

num_vectors = vectorstore._collection.count()
print("Number of vectors in Chroma:", num_vectors)


Number of vectors in Chroma: 32656


In [None]:

data = vectorstore._collection.get(
    limit=3,
    include=["embeddings", "metadatas", "documents"]
)

embs = data["embeddings"]
docs_text = data["documents"]
metas = data["metadatas"]

print("Number of returned embeddings:", len(embs))
print("Dimension of each embedding:", len(embs[0]))
print("First 5 values of first embedding:", embs[0][:5])

print("\nSample doc text:", docs_text[0])
print("Sample metadata:", metas[0])


Number of returned embeddings: 3
Dimension of each embedding: 384
First 5 values of first embedding: [ 0.04103943  0.06359239  0.06796595 -0.00530133 -0.0693754 ]

Sample doc text: In Johor on 1/1/2020, the female population aged 0-4 with ethnicity bumi_malay is 97.9.
Sample metadata: {'ethnicity': 'bumi_malay', 'age': '0-4', 'date': '1/1/2020', 'sex': 'female', 'state': 'Johor', 'population': 97.9}


In [None]:
sample_vec = embeddings.embed_query("test sentence about Johor population")
print("Embedding dimension:", len(sample_vec))
print("First 5 values:", sample_vec[:5])


Embedding dimension: 384
First 5 values: [0.008339829742908478, 0.1021864265203476, 0.030118180438876152, 0.005961963441222906, -0.030810521915555]


In [None]:
query = "female population age 0-4 in Johor"
results = retriever.get_relevant_documents(query)

print(f"Top {len(results)} results:")
for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print("Text:", doc.page_content)
    print("Metadata:", doc.metadata)


Top 5 results:

Result 1:
Text: In Johor on 1/1/2021, the female population aged 0-4 with ethnicity indian is 8.9.
Metadata: {'ethnicity': 'indian', 'age': '0-4', 'population': 8.9, 'date': '1/1/2021', 'state': 'Johor', 'sex': 'female'}

Result 2:
Text: In Johor on 1/1/2021, the female population aged 0-4 with ethnicity indian is 8.9.
Metadata: {'age': '0-4', 'ethnicity': 'indian', 'sex': 'female', 'state': 'Johor', 'population': 8.9, 'date': '1/1/2021'}

Result 3:
Text: In Johor on 1/1/2020, the female population aged 0-4 with ethnicity indian is 9.5.
Metadata: {'date': '1/1/2020', 'age': '0-4', 'ethnicity': 'indian', 'state': 'Johor', 'sex': 'female', 'population': 9.5}

Result 4:
Text: In Johor on 1/1/2020, the female population aged 0-4 with ethnicity indian is 9.5.
Metadata: {'age': '0-4', 'date': '1/1/2020', 'ethnicity': 'indian', 'state': 'Johor', 'sex': 'female', 'population': 9.5}

Result 5:
Text: In Johor on 1/1/2023, the female population aged 35-39 with ethnicity indian is 

In [None]:
query = "female population age 0-4 in Johor"

results = vectorstore.similarity_search_with_score(query, k=10)

for i, (doc, score) in enumerate(results, 1):
    print(f"\nResult {i} | score={score:.4f}")
    print("Text:", doc.page_content)
    print("Metadata:", doc.metadata)



Result 1 | score=0.1798
Text: In Johor on 1/1/2021, the female population aged 0-4 with ethnicity indian is 8.9.
Metadata: {'sex': 'female', 'population': 8.9, 'ethnicity': 'indian', 'date': '1/1/2021', 'age': '0-4', 'state': 'Johor'}

Result 2 | score=0.1798
Text: In Johor on 1/1/2021, the female population aged 0-4 with ethnicity indian is 8.9.
Metadata: {'date': '1/1/2021', 'ethnicity': 'indian', 'state': 'Johor', 'sex': 'female', 'population': 8.9, 'age': '0-4'}

Result 3 | score=0.1879
Text: In Johor on 1/1/2020, the female population aged 0-4 with ethnicity indian is 9.5.
Metadata: {'population': 9.5, 'sex': 'female', 'date': '1/1/2020', 'state': 'Johor', 'ethnicity': 'indian', 'age': '0-4'}

Result 4 | score=0.1879
Text: In Johor on 1/1/2020, the female population aged 0-4 with ethnicity indian is 9.5.
Metadata: {'population': 9.5, 'age': '0-4', 'date': '1/1/2020', 'sex': 'female', 'ethnicity': 'indian', 'state': 'Johor'}

Result 5 | score=0.1966
Text: In Johor on 1/1/2023, the

In [None]:
query = "female population age 0-4 in Johor"

results = vectorstore.similarity_search(
    query,
    k=5,
    filter={"ethnicity": "chinese"}
)

for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print("Text:", doc.page_content)
    print("Metadata:", doc.metadata)



Result 1:
Text: In Johor on 1/1/2022, the female population aged 0-4 with ethnicity chinese is 31.7.
Metadata: {'population': 31.7, 'date': '1/1/2022', 'age': '0-4', 'ethnicity': 'chinese', 'state': 'Johor', 'sex': 'female'}

Result 2:
Text: In Johor on 1/1/2022, the female population aged 0-4 with ethnicity chinese is 31.7.
Metadata: {'population': 31.7, 'ethnicity': 'chinese', 'age': '0-4', 'date': '1/1/2022', 'sex': 'female', 'state': 'Johor'}

Result 3:
Text: In Johor on 1/1/2023, the female population aged 0-4 with ethnicity chinese is 25.7.
Metadata: {'population': 25.7, 'date': '1/1/2023', 'sex': 'female', 'state': 'Johor', 'ethnicity': 'chinese', 'age': '0-4'}

Result 4:
Text: In Johor on 1/1/2023, the female population aged 0-4 with ethnicity chinese is 25.7.
Metadata: {'state': 'Johor', 'date': '1/1/2023', 'sex': 'female', 'ethnicity': 'chinese', 'population': 25.7, 'age': '0-4'}

Result 5:
Text: In Johor on 1/1/2024, the female population aged 0-4 with ethnicity chinese is 

In [None]:
#using free LLM

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

hf_pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


LLM ready ✅


In [None]:
def rag_answer(query, top_k: int = 5):
    #To retrieve relevant chunks from Chroma
    docs = retriever.get_relevant_documents(query)[:top_k]
    context = "\n".join(d.page_content for d in docs)

    #To build a prompt for the LLM
    prompt = f"""
You are a helpful assistant answering questions about Malaysian population statistics.

Use ONLY the information in the context.
If the answer is not in the context, say you don't know.

Context:
{context}

Question: {query}

Answer in a short, clear paragraph.
"""

    #To call the LLM
    out = hf_pipe(prompt)[0]["generated_text"]

    return out, docs


In [None]:
answer, sources = rag_answer("What is the female population aged 0-4 in Johor in 2020?")
print("Answer:", answer)
print("\nSources metadata:")
for d in sources:
    print(d.metadata)


Answer: 97.9

Sources metadata:
{'age': '0-4', 'state': 'Johor', 'population': 7.2, 'date': '1/1/2023', 'ethnicity': 'indian', 'sex': 'female'}
{'population': 7.2, 'ethnicity': 'indian', 'state': 'Johor', 'sex': 'female', 'age': '0-4', 'date': '1/1/2023'}
{'age': '0-4', 'ethnicity': 'bumi_malay', 'sex': 'female', 'state': 'Johor', 'date': '1/1/2020', 'population': 97.9}
{'age': '0-4', 'ethnicity': 'bumi_malay', 'sex': 'female', 'population': 97.9, 'state': 'Johor', 'date': '1/1/2020'}
{'state': 'Johor', 'age': '0-4', 'date': '1/1/2021', 'ethnicity': 'indian', 'population': 8.9, 'sex': 'female'}


In [None]:
def chat():
    print("Population RAG chatbot. Type 'exit' to quit.\n")
    while True:
        q = input("You: ")
        if q.lower() in ["exit", "quit"]:
            break
        answer, _ = rag_answer(q)
        print("\nBot:", answer, "\n")

chat()


Population RAG chatbot. Type 'exit' to quit.

You: most population or religion in selangor 

Bot: In Selangor on 1/1/2021, the female population aged 45-49 with ethnicity indian is 22.7. In Selangor on 1/1/2021, the female population aged 45-49 with ethnicity indian is 22.7. In Selangor on 1/1/2021, the female population aged 35-39 with ethnicity indian is 33.6. In Selangor on 1/1/2021, the female population aged 35-39 with ethnicity indian is 33.6. 

You: exit


In [None]:
import gradio as gr

def gradio_chat(message, history):
    answer, _ = rag_answer(message)
    return answer

demo = gr.ChatInterface(
    fn=gradio_chat,
    title="🇲🇾 Population RAG Chatbot",
    description="Ask about population by state / sex / age / ethnicity."
)

demo.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()


--------


Running on public URL: https://9d74adee0ecfbaa228.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


