In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import spacy
from transformers import pipeline
import re

base_url = 'https://www.pratham.org/'

sections = {
    'About Us': '/about',
    'Board': 'about/board',
    'Leadership': 'about/leadership',
    'Partners': 'about/partners',
    'Teaching at the right level': 'about/teaching-at-the-right-level/',
    'Recognition': 'about/recognition',
    'News': 'about/news',
    'Humara gaon': 'about/hamara-gaon',
    'Early Childhood Education': 'programs/education/early-childhood-education',
    'Elementary': 'programs/education/elementary',
    'Beyond Elementary': 'programs/education/beyond-elementary',
    'Digital-initiatives': 'programs/education/digital-initiatives',
    'Education': 'programs/education',
    'Vocational Training': 'programs/education/vocational-training',
    'Annual Status of Education Report': 'programs/education/aser',
    'Pratham Council For Vulnerable Children': 'programs/pratham-council-for-vulnerablechildren/',
    'Covid Response': 'covid-19-response/',
    'Learning Readiness & Catch Up': 'covid-19-response/prathams-community-based-campaign-for-learning-readiness-catch-up/',
    'Job Opportunities': 'get-involved/job-opportunities/',
    'Internships': 'get-involved/internships',
}


def scrape_section(section_name, section_url):
    url = base_url + section_url
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = ' '.join([p.get_text(strip=True) for p in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
    return content


data = {section: scrape_section(section, url) for section, url in sections.items()}

contact_info = {
    'Delhi Office': {
        'Address': 'B- 4/59, Safdarjung Enclave, 1st Floor, New Delhi - 110 029',
        'Email': 'info@pratham.org',
        'Contact Number': '01141651638'
    },
    'Mumbai Office': {
        'Address': 'Y.B. Chavan Center, 4th Floor, Gen. J. Bhosale Marg, Nariman Point, Mumbai, Maharashtra - 400021',
        'Email': 'info@pratham.org',
        'Contact Number': '022 22819561, 022 22819562'
    }
}

data.update(contact_info)


df = pd.DataFrame(list(data.items()), columns=['Section', 'Content'])
df.to_csv('pratham_data.csv', index=False)


In [None]:
import pandas as pd
import spacy
from transformers import pipeline
import re
import logging

logging.basicConfig(level=logging.ERROR)

# Load the CSV data
df = pd.read_csv('pratham_data.csv')

# Clean the text data
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = text.strip()  # Remove leading and trailing whitespace
    return text

df['Content'] = df['Content'].apply(clean_text)

# Initialize the SpaCy model for keyword extraction
nlp = spacy.load('en_core_web_sm')

# Function to extract keywords from text
def extract_keywords(text, num_keywords=10):
    doc = nlp(text)
    keywords = [chunk.text.lower() for chunk in doc.noun_chunks]
    return keywords[:num_keywords]

df['Keywords'] = df['Content'].apply(extract_keywords)

# Initialize the Hugging Face pipeline for text generation
generator = pipeline('text-generation', model='distilgpt2')

# Function to generate a response based on the user's query
def generate_response(query):
    # Find relevant sections based on keyword matching
    relevant_sections = df[df['Keywords'].apply(lambda x: any(word in query.lower() for word in x))]

    if not relevant_sections.empty:
        # Take the first relevant section as context
        context = relevant_sections.iloc[0]['Content']
        max_context_length = 1000
        if len(context) > max_context_length:
            context = context[:max_context_length]
    else:
        context = "Sorry, I don't have information on that topic."

    # Generate response using the transformer model
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    response = generator(prompt, max_new_tokens=150, num_return_sequences=1, temperature=0.2, top_k=50)[0]['generated_text']

    # Extract and clean the response
    response_parts = response.split('Answer:')
    if len(response_parts) > 1:
        response_text = response_parts[1].strip()
    else:
        response_text = response.strip()

    response_text = re.sub(r'\bQuestion:.*$', '', response_text, flags=re.IGNORECASE).strip()

    if "?" in response_text or len(response_text) < 10:
        response_text = "I'm sorry, I don't have a clear answer to that question."

    return response_text

# Function to handle the chatbot interaction
def chat_bot():
    print("Welcome to the Pratham Chatbot! Type 'exit' to end the chat.")
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            print("Goodbye!")
            break
        response = generate_response(user_input)
        print(f"Bot: {response}")

if __name__ == "__main__":
    chat_bot()


Welcome to the Pratham Chatbot! Type 'exit' to end the chat.
You: What is the mission of Pratham?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot: We are a non-governmental organization that aims to improve the quality of education in India and to improve the quality of education in India. We are a non-governmental organization that aims to improve the quality of education in India and to improve the quality of education in India. We are a non-governmental organization that aims to improve the quality of education in India and to improve the quality of education in India. We are a non-governmental organization that aims to improve the quality of education in India and to improve the quality of education in India. We are a non-governmental organization that aims to improve the quality of education in India and to improve the quality of education in India. We are a non-governmental organization that aims to improve the quality of education
You: What is Humara gaon program by Pratham?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot: Humara gaon is a program that aims to improve the quality of education in India and to improve the quality of education in India. Humara gaon is a program that aims to improve the quality of education in India and to improve the quality of education in India. Humara gaon is a program that aims to improve the quality of education in India and to improve the quality of education in India. Humara gaon is a program that aims to improve the quality of education in India and to improve the quality of education in India. Humara gaon is a program that aims to improve the quality of education in India and to improve the quality of education in India. Humara gaon is a program that aims to improve the quality of education
You: Tell me more about Pratham


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot: I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and I am a member of the Indian National Congress (INPC) and
