<a href="https://colab.research.google.com/github/radhika004/ISL_translator/blob/master/book_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Display a standalone integer progress-bar widget with its value set to 10.
# No visible later cell uses this widget — presumably it is only a check that
# ipywidgets render in this runtime (TODO confirm; otherwise the cell can go).
import ipywidgets as widgets
widgets.IntProgress(value=10)

IntProgress(value=10)

In [None]:
!pip install pdfplumber ftfy

import pdfplumber, json, re, os
from ftfy import fix_text

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_si

In [None]:
# EDIT: path to your PDF file.
# This is the only input of the notebook; every later cell derives from it.
# "/content" is presumably the Colab runtime's upload directory, so the PDF
# has to be uploaded there first (TODO confirm for non-Colab runs).
pdf_path = "/content/Indian_Law_For_A_Common_Man.pdf"

def clean_text(s):
    """Normalize a piece of text extracted from the PDF.

    Steps, in order:
      1. Repair broken Unicode with ftfy's ``fix_text``.
      2. Remove explicit page labels of the form "Page 12" (case-insensitive).
      3. Collapse every run of whitespace (newlines included) into a single
         space and trim both ends.

    The label removal runs *before* whitespace normalization so that deleting
    a label cannot leave a double space or a trailing blank in the result.

    Args:
        s: Raw text (str).

    Returns:
        The cleaned, single-line string; may be empty.
    """
    s = fix_text(s)
    # Only "Page N" labels are removed; a bare page number such as "123" is
    # indistinguishable from ordinary numbers in the text and is left alone.
    s = re.sub(r'\bPage\s*\d+\b', '', s, flags=re.I)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

# Collect paragraph-level chunks from every page of the PDF.
paras = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Fall back to "" when the page yields no text at all.
        raw_text = page.extract_text() or ""

        # Split on blank lines BEFORE cleaning: clean_text() collapses all
        # whitespace (newlines included) into single spaces, so splitting its
        # output on blank lines could never find a paragraph break.
        # "\n\s*\n" also treats whitespace-only lines as blank.
        for chunk in re.split(r'\n\s*\n', raw_text):
            chunk = clean_text(chunk)
            if len(chunk) < 40:   # skip tiny junk (also drops empty pages)
                continue
            paras.append(chunk)

In [None]:

def split_long(para, max_len=800):
    """Split a paragraph into chunks of at most ``max_len`` characters.

    A paragraph that already fits is returned unchanged as a one-element list.
    Otherwise it is cut at sentence boundaries (whitespace following ``.``,
    ``!``, ``?`` or ``。``) and consecutive sentences are packed greedily,
    joined by a single space, into chunks that do not exceed ``max_len``.

    A single sentence that is itself longer than ``max_len`` (for example text
    without any sentence punctuation) is first wrapped at whitespace so that
    the length limit still holds; only a word longer than ``max_len`` is cut
    mid-word. Whitespace inside such an oversized sentence is normalized to
    spaces by the wrapping step.

    Args:
        para: Paragraph text.
        max_len: Maximum chunk length in characters. Values below 1 disable
            the wrapping of oversized sentences.

    Returns:
        List of chunk strings in original order.
    """
    import textwrap  # local import keeps this cell self-contained

    if len(para) <= max_len:
        return [para]

    sents = re.split(r'(?<=[。.!?])\s+', para)

    # Break up sentences that could never fit into a chunk on their own.
    pieces = []
    for s in sents:
        if max_len >= 1 and len(s) > max_len:
            pieces.extend(textwrap.wrap(s, width=max_len, break_on_hyphens=False))
        else:
            pieces.append(s)

    # Greedy packing: extend the current chunk while the next piece (plus the
    # joining space) still fits, otherwise start a new chunk.
    buckets, cur = [], ""
    for s in pieces:
        if len(cur) + len(s) + 1 <= max_len:
            cur = (cur + " " + s).strip()
        else:
            if cur:
                buckets.append(cur)
            cur = s
    if cur:
        buckets.append(cur)
    return buckets

# Flatten: each extracted paragraph contributes the chunk(s) that
# split_long() returns for it, in document order.
final_paras = [chunk for para in paras for chunk in split_long(para)]


In [None]:
# Write the corpus as JSON Lines: one {"paragraph": ...} object per line,
# keeping non-ASCII characters as-is.
out_path = "corpus_paragraphs.jsonl"
with open(out_path, "w", encoding="utf-8") as jsonl_file:
    for para in final_paras:
        record = json.dumps({"paragraph": para}, ensure_ascii=False)
        jsonl_file.write(record + "\n")

print(f"Saved {len(final_paras)} paragraphs to {out_path}")

Saved 450 paragraphs to corpus_paragraphs.jsonl


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import json


In [None]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from tqdm import tqdm

# Load model ONCE
# The same checkpoint serves both the question and the answer prompts below.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" / torch_dtype="auto" leave device placement and weight
# precision to the library; the recorded run reports "Device set to use cuda:0".
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")

# Single text2text pipeline, reused for every generation call in this cell.
qg_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Read paragraphs
# Load the chunks back from the JSONL corpus: one JSON object per line, of
# which only the "paragraph" field is kept.
with open("corpus_paragraphs.jsonl", "r", encoding="utf-8") as corpus_file:
    paragraphs = [json.loads(record)["paragraph"] for record in corpus_file]


batch_size = 8

# The recorded run of this cell produced exactly one entry per paragraph
# (450 entries for 450 paragraphs): asking for "3 questions" / "2 variants" in
# one generation and splitting the text on "\n" never yielded more than one
# item — presumably because the T5 tokenizer cannot emit newlines (TODO
# confirm). Instead, each prompt now asks for ONE item and several sampled
# generations are requested per prompt.
questions_per_paragraph = 3
answers_per_question = 2
final_dataset = []


def _distinct_texts(generations):
    """Return the distinct, non-empty generated lines in first-seen order.

    ``generations`` is the pipeline result for ONE prompt: a list of
    ``{"generated_text": ...}`` dicts, or a single such dict when only one
    sequence was returned. Leading/trailing "-" bullets and blanks are
    stripped from every line.
    """
    if isinstance(generations, dict):
        generations = [generations]
    texts = []
    for gen in generations:
        for line in gen["generated_text"].split("\n"):
            text = line.strip("- ").strip()
            # Sampling can repeat itself; keep each text once.
            if text and text not in texts:
                texts.append(text)
    return texts


for i in tqdm(range(0, len(paragraphs), batch_size), desc="Generating Questions"):
    batch = paragraphs[i:i+batch_size]

    prompts = [f"Generate a legal question that can be answered using only the paragraph:\n\n{p}" for p in batch]
    outputs = qg_pipe(
        prompts,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        num_return_sequences=questions_per_paragraph,
    )

    # Process each batch output
    for paragraph, out in zip(batch, outputs):
        questions = _distinct_texts(out)
        if not questions:
            # Nothing usable was generated; do not call the model with an
            # empty prompt list.
            continue

        ans_prompts = [
            f"Answer the question based ONLY on the paragraph:\nParagraph: {paragraph}\nQuestion: {q}"
            for q in questions
        ]

        ans_outputs = qg_pipe(
            ans_prompts,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            num_return_sequences=answers_per_question,
        )

        for q, ans_out in zip(questions, ans_outputs):
            for ans in _distinct_texts(ans_out):
                final_dataset.append({"question": q, "answer": ans})

# Save final dataset
# The whole list is serialized as one pretty-printed JSON array
# (2-space indent, non-ASCII characters kept as-is).
with open("generated_qna.json", "w", encoding="utf-8") as qna_file:
    qna_json = json.dumps(final_dataset, ensure_ascii=False, indent=2)
    qna_file.write(qna_json)

print(f"✅ QnA dataset saved with {len(final_dataset)} entries")


Device set to use cuda:0
Generating Questions: 100%|██████████| 57/57 [19:38<00:00, 20.67s/it]

✅ QnA dataset saved with 450 entries





In [None]:
# Save final dataset
# Writes final_dataset to the same file as the save step that closes the
# generation cell — presumably kept so the export can be repeated without
# re-running the (slow) generation loop.
qna_path = "generated_qna.json"
with open(qna_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(final_dataset, ensure_ascii=False, indent=2))

print(f"✅ QnA dataset saved with {len(final_dataset)} entries")


✅ QnA dataset saved with 450 entries
