In [1]:
import requests
import json
import csv
import time
from pathlib import Path

# Base URL for slug extraction
base_url_slugs = "https://api.myscheme.gov.in/search/v4/schemes"
# Base URL for detailed data
base_url_data = "https://www.myscheme.gov.in/_next/data/Pgr1-v_XYCcKuy3LqoxeR/en/schemes/"

# Headers
headers_slugs = {
    "x-api-key": "tYTy5eEhlu9rFjyxuCr7ra7ACp4dv1RH8gWuHTDc",
    "Accept": "application/json",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
}
headers_data = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Fetch slugs for the first 10 pages
start_from = 0
page_size = 10
max_pages = 100
all_slugs = []

while start_from < (max_pages * page_size):
    params = {
        "lang": "en",
        "q": "[]",
        "keyword": "",
        "sort": "",
        "from": start_from,
        "size": page_size
    }
    url = f"{base_url_slugs}?lang={params['lang']}&q={params['q']}&keyword={params['keyword']}&sort={params['sort']}&from={params['from']}&size={params['size']}"

    response = requests.get(url, headers=headers_slugs)
    if response.status_code == 200:
        data = response.json()
        page_slugs = [item['fields']['slug'] for item in data['data']['hits']['items'] if 'slug' in item['fields']]
        all_slugs.extend(page_slugs)
        print(f"Extracted {len(page_slugs)} slugs from page {start_from // page_size + 1}")
    else:
        print(f"Error fetching slugs: {response.status_code}, {response.text}")
        break
    start_from += page_size
    time.sleep(1)

# List to store structured data
schemes_data = []

# Fetch and process data for each slug
for slug in all_slugs:
    url = f"{base_url_data}{slug}.json"
    try:
        response = requests.get(url, headers=headers_data, timeout=10)
        response.raise_for_status()
        json_data = response.json()

        # Extract required fields with fallback values
        target_beneficiaries = json_data["pageProps"]["schemeData"]["en"]["basicDetails"].get("targetBeneficiaries")
        if target_beneficiaries is None:
            target_beneficiaries = []
        scheme_data = {
            "Scheme Name": json_data["pageProps"]["schemeData"]["en"]["basicDetails"]["schemeName"],
            "Ministries/Departments": json_data["pageProps"]["schemeData"]["en"]["basicDetails"].get("nodalMinistryName") or
                                     json_data["pageProps"]["schemeData"]["en"]["basicDetails"].get("nodalDepartmentName", {}).get("label", "N/A") or "N/A",
            "Target Beneficiaries": ", ".join([b["label"] for b in target_beneficiaries]) or "N/A",
            "Eligibility Criteria": json_data["pageProps"]["schemeData"]["en"].get("eligibilityCriteria", {}).get("eligibilityDescription_md", "N/A"),
            "Description & Benefits": (json_data["pageProps"]["schemeData"]["en"]["schemeContent"].get("briefDescription", "") + "\n" +
                                      json_data["pageProps"]["schemeData"]["en"]["schemeContent"].get("benefits_md", "")) or "N/A",
            "Application Process": json_data["pageProps"]["schemeData"]["en"].get("applicationProcess", [{}])[0].get("process_md", "N/A"),
            "Tags": ", ".join([tag for tag in json_data["pageProps"]["schemeData"]["en"]["basicDetails"].get("tags", []) if tag is not None]) or "N/A"
        }
        schemes_data.append(scheme_data)
        print(f"Successfully processed {slug}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch data for {slug}: {e}")
    except (KeyError, IndexError) as e:
        print(f"Error parsing data for {slug}: {e}")
    time.sleep(1)  # Avoid rate limiting

# Save to JSON file
output_json_path = Path("schemes_data.json")
with output_json_path.open("w", encoding="utf-8") as json_file:
    json.dump(schemes_data, json_file, indent=4, ensure_ascii=False)
print(f"JSON data saved to {output_json_path}")

# Save to CSV file
output_csv_path = Path("schemes_data.csv")
with output_csv_path.open("w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["Scheme Name", "Ministries/Departments", "Target Beneficiaries", "Eligibility Criteria",
                  "Description & Benefits", "Application Process", "Tags"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(schemes_data)
print(f"CSV data saved to {output_csv_path}")

Extracted 10 slugs from page 1
Extracted 10 slugs from page 2
Extracted 10 slugs from page 3
Extracted 10 slugs from page 4
Extracted 10 slugs from page 5
Extracted 10 slugs from page 6
Extracted 10 slugs from page 7
Extracted 10 slugs from page 8
Extracted 10 slugs from page 9
Extracted 10 slugs from page 10
Extracted 10 slugs from page 11
Extracted 10 slugs from page 12
Extracted 10 slugs from page 13
Extracted 10 slugs from page 14
Extracted 10 slugs from page 15
Extracted 10 slugs from page 16
Extracted 10 slugs from page 17
Extracted 10 slugs from page 18
Extracted 10 slugs from page 19
Extracted 10 slugs from page 20
Extracted 10 slugs from page 21
Extracted 10 slugs from page 22
Extracted 10 slugs from page 23
Extracted 10 slugs from page 24
Extracted 10 slugs from page 25
Extracted 10 slugs from page 26
Extracted 10 slugs from page 27
Extracted 10 slugs from page 28
Extracted 10 slugs from page 29
Extracted 10 slugs from page 30
Extracted 10 slugs from page 31
Extracted 10 slug

In [2]:
import os
print(os.listdir('/kaggle/working/'))


['.virtual_documents', 'schemes_data.csv', 'schemes_data.json']


In [5]:
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
import sys

# Function to clean text
def clean_text(text):
    if pd.isna(text):
        return "Not Available"
    soup = BeautifulSoup(str(text), "html.parser")
    return soup.get_text(separator="\n").strip()

# Load dataset
try:
    df = pd.read_csv('/kaggle/input/d1ddddd/schemes_data.csv')
except FileNotFoundError:
    print("Error: scheme.csv not found. Using sample data.")
    

# Create chunks with metadata
chunks = []
chunk_metadata = []
fields = [col for col in df.columns if col != "Scheme Name"]
for index, row in df.iterrows():
    scheme_name = str(row["Scheme Name"]).strip()
    for field in fields:
        chunk = f"Scheme: {scheme_name}\n{field}: {clean_text(row[field])}"
        chunks.append(chunk)
        chunk_metadata.append({"scheme_name": scheme_name, "field": field})

# Generate embeddings
embedding_model = SentenceTransformer('BAAI/bge-small-en')
embeddings = embedding_model.encode(chunks, convert_to_tensor=True, show_progress_bar=False).cpu().numpy()

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Load language model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Function to detect requested field
def detect_field(query):
    query_lower = query.lower()
    if "scheme" in query_lower or "schemes" in query_lower:
        return "Scheme Name"
    elif "eligibility" in query_lower or "criteria" in query_lower:
        return "Eligibility Criteria"
    elif "description" in query_lower or "benefits" in query_lower:
        return "Description & Benefits"
    elif "application" in query_lower or "process" in query_lower:
        return "Application Process"
    elif "ministry" in query_lower or "department" in query_lower:
        return "Ministries/Departments"
    elif "beneficiaries" in query_lower:
        return "Target Beneficiaries"
    elif "tags" in query_lower:
        return "Tags"
    return "Scheme Name"  # Default

# Function to generate answer
def generate_answer(query):
    target_field = detect_field(query)
    query_embedding = embedding_model.encode([query], convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
    k = 10
    distances, indices = index.search(query_embedding, k)
    
    # Filter relevant chunks
    results = []
    scheme_names = set()
    for i in indices[0]:
        chunk = chunks[i]
        metadata = chunk_metadata[i]
        # Apply filters based on query (e.g., disabled and Kerala)
        if ("disabled" in query.lower() or "differently abled" in query.lower()) and "kerala" in query.lower():
            if not (("disabled" in chunk.lower() or "differently abled" in chunk.lower()) and "kerala" in chunk.lower()):
                continue
        elif "disabled" in query.lower() or "differently abled" in query.lower():
            if not ("disabled" in chunk.lower() or "differently abled" in chunk.lower()):
                continue
        elif "kerala" in query.lower():
            if "kerala" not in chunk.lower():
                continue
        
        if target_field == "Scheme Name":
            if metadata["scheme_name"] not in scheme_names:
                scheme_names.add(metadata["scheme_name"])
                results.append(metadata["scheme_name"])
        elif metadata["field"] == target_field:
            content = chunk.split(f"{target_field}: ")[1] if f"{target_field}: " in chunk else chunk
            results.append(content)
    
    # Generate answer
    if target_field == "Scheme Name":
        answer = ", ".join(results) if results else "No relevant schemes found."
    else:
        context = "\n\n".join(results) if results else "No relevant information found."
        input_text = f"Return only the {target_field} for schemes matching the question. Do not include other details.\nQuestion: {query}\nContext: {context}"
        input_ids = tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True)
        output = model.generate(input_ids, max_length=150, num_beams=5, early_stopping=True)
        answer = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return answer

# Main: Get query and output answer
if len(sys.argv) > 1:
    query = " ".join(sys.argv[1:])
else:
    query = "What schemes are available for disabled students in Kerala?"  # Default for testing

answer = generate_answer(query)
print(answer)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Modernisation And Removal Of Obsolescence, MANAGE Internship Programme, Prime Minister's Employment Generation Programme, Junior Research Fellowship, Biotechnology Ignition Grant Scheme, Coaching Schemes For Sc/st/obc (non-creamy Layer) & Minority Students For Universities, Airavata Scheme, Prime Minister's Scholarship Scheme for Railway Protection Force, Krushy Aranya Protsaha Yojane (kapy)


In [4]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer
import sys

# Function to clean text
def clean_text(text):
    if pd.isna(text):
        return "Not Available"
    soup = BeautifulSoup(str(text), "html.parser")
    return soup.get_text(separator="\n").strip()

# Load dataset
try:
    df = pd.read_csv('schemes_data.csv')
    print("Dataset loaded successfully. Columns:", df.columns)
except FileNotFoundError:
    print("Error: schemes_data.csv not found. Please ensure the file is in the working directory.")
    sys.exit(1)

# Create chunks with metadata
chunks = []
chunk_metadata = []
fields = [col for col in df.columns if col != "Scheme Name"]
for index, row in df.iterrows():
    scheme_name = str(row["Scheme Name"]).strip()
    for field in fields:
        chunk = f"Scheme: {scheme_name}\n{field}: {clean_text(row[field])}"
        chunks.append(chunk)
        chunk_metadata.append({"scheme_name": scheme_name, "field": field})

# Generate embeddings
embedding_model = SentenceTransformer('BAAI/bge-small-en')
embeddings = embedding_model.encode(chunks, convert_to_tensor=True, show_progress_bar=False).cpu().numpy()

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Load language model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Function to detect requested field
def detect_field(query):
    query_lower = query.lower()
    if "scheme" in query_lower or "schemes" in query_lower:
        return "Scheme Name"
    elif "eligibility" in query_lower or "criteria" in query_lower:
        return "Eligibility Criteria"
    elif "description" in query_lower or "benefits" in query_lower:
        return "Description & Benefits"
    elif "application" in query_lower or "process" in query_lower:
        return "Application Process"
    elif "ministry" in query_lower or "department" in query_lower:
        return "Ministries/Departments"
    elif "beneficiaries" in query_lower:
        return "Target Beneficiaries"
    elif "tags" in query_lower:
        return "Tags"
    return "Scheme Name"

# Function to generate answer
def generate_answer(query):
    target_field = detect_field(query)
    query_embedding = embedding_model.encode([query], convert_to_tensor=True, show_progress_bar=False).cpu().numpy()
    k = 10
    distances, indices = index.search(query_embedding, k)
    
    results = []
    scheme_names = set()
    for i in indices[0]:
        chunk = chunks[i]
        metadata = chunk_metadata[i]
        chunk_lower = chunk.lower()
        is_disabled = "disabled" in chunk_lower or "differently abled" in chunk_lower or "pwd" in chunk_lower
        is_kerala = "kerala" in chunk_lower
        if ("disabled" in query.lower() or "differently abled" in query.lower()) and "kerala" in query.lower():
            if not (is_disabled and is_kerala):
                continue
        elif "disabled" in query.lower() or "differently abled" in query.lower():
            if not is_disabled:
                continue
        elif "kerala" in query.lower():
            if not is_kerala:
                continue
        
        if target_field == "Scheme Name":
            if metadata["scheme_name"] not in scheme_names:
                scheme_names.add(metadata["scheme_name"])
                results.append(metadata["scheme_name"])
        elif metadata["field"] == target_field:
            content = chunk.split(f"{target_field}: ")[1] if f"{target_field}: " in chunk else chunk
            results.append(content)
    
    answer = ", ".join(results) if results else "No relevant schemes found."
    return answer

# Main: Get query and output answer
if len(sys.argv) > 1:
    query = " ".join(sys.argv[1:])
else:
    query = "What schemes are available for disabled students in Kerala?"

answer = generate_answer(query)
print(answer)

Dataset loaded successfully. Columns: Index(['Scheme Name', 'Ministries/Departments', 'Target Beneficiaries',
       'Eligibility Criteria', 'Description & Benefits', 'Application Process',
       'Tags'],
      dtype='object')
Modernisation And Removal Of Obsolescence, Development Of Playfield Under CMSGUY, MANAGE Internship Programme, Prime Minister's Employment Generation Programme, National Bamboo Mission, SERB International Research Experience, India Afghanistan Fellowship, Junior Research Fellowship, Biotechnology Ignition Grant Scheme, India Africa Fellowship


In [9]:
pip install pandas sentence-transformers faiss-cpu transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.11.0->sentence-transformers)
  Downloading nvid

In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load your CSV data
df = pd.read_csv('/kaggle/input/d1ddddd/schemes_data.csv')

# Combine relevant columns into a single text chunk per row
def make_chunk(row):
    return f"{row['Scheme Name']}. {row['Description & Benefits']} Eligibility: {row['Eligibility Criteria']}"

df['chunk'] = df.apply(make_chunk, axis=1)
chunks = df['chunk'].tolist()

# Use a small embedding model
embedder = SentenceTransformer('BAAI/bge-small-en-v1.5')  # Or 'all-MiniLM-L6-v2'

# Compute embeddings
embeddings = embedder.encode(chunks, convert_to_numpy=True)
dimension = embeddings.shape[1]

# Build FAISS index
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
def retrieve(query, top_n=3):
    query_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_emb, top_n)
    return [chunks[i] for i in I[0]]


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Example with a small local model (adjust as needed)
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

def generate_answer(question, context_chunks):
    context = "\n".join(context_chunks)
    prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=150)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [13]:
question = "What schemes are available for farmers in Maharashtra?"

# Retrieve top relevant chunks
top_chunks = retrieve(question, top_n=3)

# Generate answer using LLM
answer = generate_answer(question, top_chunks)
print(answer)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Context:
Krushy Aranya Protsaha Yojane (kapy). The Karnataka Forest Department launched 'Krushi Aranya Protsaha Yojane (KAPY)' in 2011-12 to encourage farmers and the public to increase forest and tree cover by providing subsidized seedlings and monetary incentives for planting
 Eligibility: 
1. This scheme is open to farmers belonging to all communities.
1. Applicant must have Pahani of the land where planting is being proposed.
1. Registration should be done before the commencement of the rainy season (by the end of May).
1. The following species of trees are not eligible for payment of incentive amount- Eucalyptus, Acacia, Silver Oak (if planted in coffee estate), Casuarina, Cassia siamea (Seemetangadi), Gliricidia, Sesbania, Erythrina, Rubber, Subabul, Coconut, Arecanut, Orange, all types of citrus species and grafted mango.
1. The following documents are required while applying for this scheme - Copy of Aadhaar Card, Passport-sized photograph of the applicant, Pahani of the land w