In [1]:
import json
import torch
import gradio as gr
import re
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer, util

# --- Dataset -----------------------------------------------------------------
# The JSON file holds a list of records; each record is read below through its
# 'article', 'title' and 'description' keys.
with open("/home/likitha/indian_constitution_full_qa.json", mode="r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

print(f" Loaded {len(data)} articles from dataset.")

# --- Search corpus -----------------------------------------------------------
# One text entry per record, in the same order as `data`, so that an index into
# the corpus (or into its embeddings) is also an index into `data`.
corpus = []
for record in data:
    corpus.append(f"Article {record['article']} - {record['title']}. {record['description']}")

# --- Models ------------------------------------------------------------------
print(" Loading models...")
QA_CHECKPOINT = "deepset/bert-base-cased-squad2"  # shared by the QA model and its tokenizer
qa_model = AutoModelForQuestionAnswering.from_pretrained(QA_CHECKPOINT)
qa_tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
encoder = SentenceTransformer("all-MiniLM-L6-v2")

# --- Corpus embeddings -------------------------------------------------------
# Encoded once up front so each query only has to embed the question itself.
corpus_embeddings = encoder.encode(corpus, convert_to_tensor=True)
print(" Corpus embeddings ready.")

# QA Answer Function
def get_answer(question, context):
    """Extract an answer to ``question`` from ``context`` with the QA model.

    The question/context pair is tokenized (truncated to 512 tokens), run
    through ``qa_model``, and the tokens between the highest-scoring start
    and end positions are decoded back to text.

    Args:
        question: The user's question.
        context: The passage the answer should be extracted from.

    Returns:
        The decoded answer text, or a fixed "no clear answer" message when
        either input is empty, the predicted span is inverted, or the span
        decodes to an empty string.
    """
    no_answer = "⚠️ No clear answer found."

    # Nothing to read or nothing to ask: skip the model call entirely.
    if not question or not context:
        return no_answer

    # Calling the tokenizer directly is the current API for encoding a
    # (question, context) pair; it returns the same fields as encode_plus.
    inputs = qa_tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = qa_model(**inputs)

    start = int(torch.argmax(outputs.start_logits))
    end = int(torch.argmax(outputs.end_logits)) + 1  # +1: slice end is exclusive
    if start >= end:
        return no_answer

    answer = qa_tokenizer.decode(inputs["input_ids"][0][start:end], skip_special_tokens=True).strip()
    # A span made up only of special tokens (e.g. start == end == 0) passes the
    # check above but decodes to "" because special tokens are skipped; report
    # that as "no answer" rather than showing a blank result.
    return answer or no_answer

# Chatbot Logic 
# "article 21" -> ("21", ""), "article 21a" -> ("21", "a"); applied to lower-cased text.
_ARTICLE_REF_RE = re.compile(r"article\s+(\d+)([a-z]*)")


def _find_article(number, suffix=""):
    """Return the dataset record for article ``number`` (+ optional letter ``suffix``).

    An exact match on the lettered id (e.g. "21a") is tried first when a suffix
    was given; otherwise, or if that fails, the first record whose id converts
    to the integer ``number`` is returned. Ids that are not plain integers are
    skipped instead of raising. Returns ``None`` when nothing matches.
    """
    if suffix:
        wanted = f"{number}{suffix}"
        for item in data:
            if str(item["article"]).strip().lower().lstrip("0") == wanted:
                return item

    for item in data:
        try:
            if int(item["article"]) == number:
                return item
        except (TypeError, ValueError):
            # Non-numeric id (e.g. one carrying a letter suffix): not a numeric match.
            continue
    return None


def chatbot(user_question):
    """Answer a question about the Constitution dataset.

    If the question names an article ("article 21", "Article 21A"), that
    article's description is used as the QA context. Otherwise the corpus entry
    most similar to the question (cosine similarity of sentence embeddings) is
    used.

    Args:
        user_question: Free-text question from the UI.

    Returns:
        A pair of strings: the article heading with the extracted answer, and
        the context text shown alongside it.
    """
    # Blank input has no meaningful nearest article; ask for a question instead.
    if not user_question or not user_question.strip():
        return "⚠️ Please enter a question.", ""

    # Check for direct article number
    match = _ARTICLE_REF_RE.search(user_question.lower())
    if match:
        item = _find_article(int(match.group(1)), match.group(2))
        if item is not None:
            context = item["description"]
            title = f"{item['article']} - {item['title']}"
            answer = get_answer(user_question, context)
            return f"📘 {title}\n\n🧠 Answer: {answer}", f"📚 Context:\n\n{context}"

    # Fallback to semantic search (also reached when the named article is not in the dataset)
    question_embedding = encoder.encode(user_question, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(question_embedding, corpus_embeddings)[0]
    best_match_idx = torch.argmax(scores).item()

    best_item = data[best_match_idx]
    best_context = best_item["description"]
    article_title = f"{best_item['article']} - {best_item['title']}"
    # The QA model reads the full corpus entry (heading + description); only the
    # description is displayed as context.
    answer = get_answer(user_question, corpus[best_match_idx])

    return f" {article_title}\n\n🧠 Answer: {answer}", f"📚 Context:\n\n{best_context}"

# Build the Gradio UI (one question box in, two text panels out) and start serving it.
question_box = gr.Textbox(lines=2, placeholder="Ask a question about the Indian Constitution...")
demo = gr.Interface(
    fn=chatbot,
    inputs=question_box,
    outputs=["text", "text"],  # matches the two strings returned by chatbot()
    title="🇮🇳 Indian Constitution Chatbot (with Semantic Search + Article Match)",
    description="Ask legal questions related to the Constitution of India. Now finds exact Articles or uses semantic search if needed.",
)
demo.launch()


  from .autonotebook import tqdm as notebook_tqdm


 Loaded 465 articles from dataset.
 Loading models...


Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 Corpus embeddings ready.
* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


