In [2]:
import requests
import json
from bs4 import BeautifulSoup

In [3]:
# Page whose anchor tags are harvested by get_links() in the next cells.
URL = 'https://pratham.org/'

In [4]:
def get_links(url, timeout=30):
    """Collect the absolute URLs of the links found on a page.

    Only hrefs that start with "/" or "http" are kept; every other href
    (bare relative paths, ``mailto:``, ``#fragment`` and so on) is skipped.

    Args:
        url: Address of the page to download and scan.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot hang the notebook.

    Returns:
        list[str]: One URL per kept anchor, in document order. Duplicates
        are not removed.

    Raises:
        requests.RequestException: If the page cannot be fetched or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    # Do not harvest links from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    all_links = []

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith("/"):
            # urljoin resolves against the page URL: no doubled slash when
            # `url` ends in "/", and root-relative hrefs stay root-relative
            # even when `url` itself points below the site root.
            section_url = urljoin(url, href)
        elif href.startswith('http'):
            section_url = href
        else:
            continue
        all_links.append(section_url)
    return all_links

# Harvest the links on the landing page; get_links does not de-duplicate,
# so the printed list may contain repeats.
all_links = get_links(URL)
print(all_links)

['https://pratham.org/', 'https://pratham.org/about/', 'https://pratham.org/about/board/', 'https://pratham.org/about/leadership/', 'https://pratham.org/about/partners/', 'https://pratham.org/about/teaching-at-the-right-level/', 'https://pratham.org/about/recognition/', 'https://pratham.org/about/news/', 'https://pratham.org/about/legal-financial-information/', 'https://pratham.org/programs/education/', 'https://pratham.org/about/hamara-gaon/', 'https://pratham.org/programs/education/early-childhood-education/', 'https://pratham.org/programs/education/elementary/', 'https://pratham.org/slider/second-chance-slider/', 'https://pratham.org/programs/education/digital-initiatives/', 'https://pratham.org/programs/education/vocational-training/', 'https://pratham.org/programs/pratham-council-for-vulnerablechildren/', 'https://pratham.org/programs/education/aser/', 'https://pratham.org/get-involved/job-opportunities/', 'https://pratham.org/get-involved/internships/', 'https://prathamopenschool

In [5]:
# Drop duplicate links. NOTE: all_links is rebound from a list to a set here,
# so the order in which later cells iterate over it is arbitrary.
all_links = set(all_links)
# Number of distinct links (shown as the cell output).
len(all_links)

49

In [6]:
def extract_single_link(url, timeout=30):
    """Download one page and pull out its heading and paragraph text.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up. Without
            it a single unresponsive host would block the whole crawl.

    Returns:
        dict: ``{'headers': [...], 'paragraphs': [...]}`` where 'headers'
        holds the stripped text of every h1/h2/h3 element and 'paragraphs'
        the stripped text of every p element. Both lists are empty when the
        request fails; the failure is printed, not raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: report the failure and let the crawl continue.
        print(f"Error fetching {url}: {e}")
        return {'headers': [], 'paragraphs': []}

    soup = BeautifulSoup(response.content, 'html.parser')

    content = {
        'headers': [header.get_text(strip=True) for header in soup.find_all(['h1', 'h2', 'h3'])],
        'paragraphs': [para.get_text(strip=True) for para in soup.find_all('p')],
    }

    return content
    

def extract_all_links(links):
    """Scrape every URL in ``links`` with extract_single_link.

    Args:
        links: Iterable of page URLs.

    Returns:
        list: One result per URL, in the order ``links`` is iterated.
    """
    return [extract_single_link(page_url) for page_url in links]


In [7]:
# Scrape every collected link in turn. Pages that fail to download are
# reported by extract_single_link and contribute an entry with empty lists.
all_content = extract_all_links(all_links)
print(all_content)

Error fetching http://www.pif.org.in: 403 Client Error: ModSecurity Action for url: http://www.pif.org.in/
Error fetching https://www.pratham.org/get-involved/donate: Exceeded 30 redirects.
Error fetching https://prathamopenschool.org/: HTTPSConnectionPool(host='prathamopenschool.org', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)')))
[{'headers': ['Elementary Years', 'METHODOLOGY', 'FEATURED VIDEO', 'IMPLEMENTATION'], 'paragraphs': ['More than 95% of 6-14 year-old children in India are enrolled in schools. However, ASER and other surveys show that a significant proportion of these children complete primary schooling without acquiring foundational reading and arithmetic skills, and therefore, are unable to cope with subsequent curricular expectations. One of Pratham’s key goals is to enable such children acquire basic reading an

In [17]:
# redundant_items = {"About Us", "About Pratham", "Board", "Leadership", "Partners",
#     "Programs", "Education", "Vocational Training", "Annual Status of Education Report", "Pratham Council For Vulnerable Children",
#     "Covid Response", "Learning Readiness & Catch Up","Lessons from the Pandemic", "Get Involved", "Job Opportunities", "Internships"}

def clean_data(all_content, min_length=40):
    """Return a copy of the scraped pages with short paragraphs removed.

    The input is left untouched, so the raw scrape stays available and the
    cell can be re-run with a different threshold.

    Args:
        all_content: Iterable of dicts that each have a 'paragraphs' list
            of strings (the shape produced by extract_single_link).
        min_length: Minimum number of characters a paragraph must have to
            be kept.

    Returns:
        list[dict]: One new dict per input dict. 'paragraphs' is a new,
        filtered list; all other keys are carried over unchanged (shallow
        copy).
    """
    return [
        {**data, 'paragraphs': [para for para in data['paragraphs'] if len(para) >= min_length]}
        for data in all_content
    ]

# Filter out short paragraphs (clean_data keeps those of 40+ characters).
all_content_filtered = clean_data(all_content)

In [21]:
def combine_all_paragraphs(all_content_filtered):
    """Flatten the per-page paragraph lists into a single list.

    Args:
        all_content_filtered: Iterable of dicts that each have a
            'paragraphs' list.

    Returns:
        list: Every paragraph of every page, page by page, in the original
        order.
    """
    return [
        paragraph
        for page in all_content_filtered
        for paragraph in page['paragraphs']
    ]

# Flat list of paragraph strings; this is the corpus that gets embedded and searched below.
all_paragraphs = combine_all_paragraphs(all_content_filtered)

In [25]:
# Displays the whole corpus as the cell output (can be long).
all_paragraphs

['More than 95% of 6-14 year-old children in India are enrolled in schools. However, ASER and other surveys show that a significant proportion of these children complete primary schooling without acquiring foundational reading and arithmetic skills, and therefore, are unable to cope with subsequent curricular expectations. One of Pratham’s key goals is to enable such children acquire basic reading and arithmetic skills, quickly and sustainably, so that they can meaningfully benefit from further education opportunities.',
 'Pratham has developed theCAMaLpedagogy, which stands forCombined Activities for Maximised Learning, to help children “catch up”. CAMaL is a child-centered pedagogy that builds on the Teaching at the Right Level (TaRL) approach, pioneered by Pratham, that uses instructions and activities tailored and aligned to the learning level of the child.',
 'At Pratham, we believe that a combination of activities helps children engage and learn. For example, reading aloud, parti

In [30]:
!pip install sentence-transformers faiss-cpu

Collecting sentence-transformers
  Using cached sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.8.0.post1 sentence-transformers-3.0.1


In [32]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Sentence-embedding model. The global `model` is also what generate_response
# uses to embed incoming questions.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the whole corpus (presumably one row per paragraph, in input order —
# later cells index all_paragraphs with the row ids FAISS returns).
all_paragraph_embeddings = model.encode(all_paragraphs)

# Cast to a float32 matrix before handing it to FAISS.
embedding_matrix = np.array(all_paragraph_embeddings).astype('float32')

# Flat L2 index sized to the embedding width (number of columns).
index = faiss.IndexFlatL2(embedding_matrix.shape[1])

# Store every paragraph vector; the global `index` is queried by later cells.
index.add(embedding_matrix)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

In [35]:
# Embed the question with the same model that embedded the paragraphs.
query = model.encode(['give me proper residential address of pratham'])[0]
# Search takes a 2-D batch, hence the wrapping list. D and I are the search
# results for the k=5 nearest stored vectors (distances and row ids).
D, I = index.search(np.array([query]), k=5)

# Map the row ids of the single query back to paragraph text.
for idx in I[0]:
    print(all_paragraphs[idx])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

For more information contactinfo@pratham.org
For more information, write to:info@pratham.org
Address:Pratham, House No - 2626, Phase - 1,Urban Estate, Dugri, Ludhiana - 141013, Punjab
Address:Pratham, Hut No 18 Custodian colony,Majeed Bagh, Haiderpur, Srinagar,Jammu & Kashmir, PIN-190005
Address:Pratham, B4/59, 1st Floor,Safdarjung Enclave, New Delhi 110029


In [None]:
def create_sentence_embeddings_with_faiss(all_paragraphs):
    """Embed ``all_paragraphs`` and load the vectors into a new FAISS index.

    Args:
        all_paragraphs: Texts to embed with the 'all-MiniLM-L6-v2' model.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per embedded text. The
        embedding model itself is not returned.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    vectors = np.array(encoder.encode(all_paragraphs)).astype('float32')

    # The index dimensionality is the width of the embedding matrix.
    dimension = vectors.shape[1]
    paragraph_index = faiss.IndexFlatL2(dimension)
    paragraph_index.add(vectors)

    return paragraph_index

In [47]:
import os

# SECURITY: the key that used to be hardcoded here has been published with
# the notebook and must be revoked and replaced. Supply the key through the
# environment (or the platform's secret store) instead of the source.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set; export it in the environment before running this cell."
    )

In [39]:
import google.generativeai as genai

In [77]:
# Authenticate the SDK and select the Gemini model.
genai.configure(api_key=GEMINI_API_KEY)
model_gemini = genai.GenerativeModel(model_name='gemini-1.5-flash')
# One chat session, started with empty history; generate_response sends
# every prompt through this same `chat` object.
chat = model_gemini.start_chat(history=[])

In [84]:
def generate_response(query, k=5):
    """Answer ``query`` with Gemini, using the closest scraped paragraphs as context.

    Relies on the notebook globals ``model`` (sentence encoder), ``index``
    (FAISS index over ``all_paragraphs``), ``all_paragraphs`` and ``chat``
    (Gemini chat session).

    Args:
        query: The user's question.
        k: How many nearest paragraphs to retrieve as context.

    Returns:
        str: The text of the model's reply.
    """
    query_emb = model.encode([query])[0]
    _, neighbour_ids = index.search(np.array([query_emb]), k=k)

    # FAISS pads the id row with -1 when the index holds fewer than k vectors;
    # without the filter a -1 would silently select all_paragraphs[-1].
    retrieved_info = "\n".join(all_paragraphs[idx] for idx in neighbour_ids[0] if idx >= 0)
    prompt = f'''
    User: Act as a bot assistant to Pratham.org. You will be provided with content and a question.
    Based on the content information about Pratham you need to answer the question below:

    Content: {retrieved_info}
    Question: {query}
    
    Model:
    '''
    # The whole reply is needed before returning, so a plain (non-streaming)
    # call replaces the earlier stream=True + resolve() pair.
    response = chat.send_message(prompt)
    return response.text
    

In [85]:
# Smoke test: send one question through retrieval and the Gemini chat.
print(generate_response("what is pratham"))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Pratham is a non-governmental organization (NGO) based in India that focuses on improving the quality of education. 

Here's what the content tells us about Pratham:

* **Mission:** To improve the quality of education in India.
* **History:** Founded in 1995 to help children in the slums of Mumbai.
* **Scope:** Grown to reach children and youth across the country.
* **Approach:** Develops high-quality, low-cost, and replicable programs to address gaps in the education system.
* **Partnerships:** Works with state governments on Early Childhood Education initiatives in Punjab, Himachal Pradesh, Andhra Pradesh, and Haryana. 

Pratham is dedicated to making a positive impact on education in India.  



In [2]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.40.0-py3-none-any.whl.metadata (15 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.2.0 (from gradio)
  Downloading gradio_client-1.2.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Downloading python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.5.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit==0.12.0 (from gradio)
  Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)
Collecting typer<1.0,>=0.12 (from gradio)
  Downloading typer-0.12.3-py3-none-any.whl.metadata (15 kB)
Collecting urllib3~=2.0 (from gradio)
  Downloading urllib3-2.2.2-py3-none-any.whl.metadata (6.4 kB)
Collecting shellingham>=

In [3]:
import time

import gradio as gr

# Minimal chat UI: a history pane, an input box and a button that clears both.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Append the user message and the bot reply to the chat history.

        Args:
            message: Text submitted from the textbox.
            chat_history: List of (user, bot) message pairs shown so far.

        Returns:
            tuple: An empty string (clears the textbox) and the updated
            history.
        """
        # Placeholder reply: a fixed string, not yet produced by the model.
        bot_message = 'wallahi'
        chat_history.append((message, bot_message))
        # `time` was used here without being imported, which made every
        # submit fail with NameError; it is now imported at the top of the cell.
        time.sleep(2)
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    demo.launch()

Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Running on public URL: https://e686fd979c49fc017f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
