<a href="https://colab.research.google.com/github/sathu0622/25-26J-438-AI-Powered-LMS-for-Visually-Impaired-Students/blob/AI-Powered-System-for-Voice-Based-Resource-Type-Summarization-of-Historical-Content-for-VIS/Fuction_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the model files loaded later in this
# notebook are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ============================================================
# 1️⃣ INSTALL DEPENDENCIES
# ============================================================
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pytesseract pdf2image opencv-python pillow transformers torch tensorflow textblob


Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://cli.github.com/packages stable InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.6 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64

In [None]:
# ============================================================
# 2️⃣ IMPORTS
# ============================================================
import cv2
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import json
import os
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, T5ForConditionalGeneration
import re

# Tell pytesseract which Tesseract executable to invoke.
# NOTE(review): presumably where `apt-get install tesseract-ocr` places the binary — confirm on the target image.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# ============================================================
# 3️⃣ OCR EXTRACTION
# ============================================================
def ocr_image(image_path):
    """Run English Tesseract OCR on an image file and return the raw text.

    Args:
        image_path: Path to an image file readable by OpenCV.

    Returns:
        The string returned by ``pytesseract.image_to_string``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an opaque OpenCV error inside cvtColor.
    if img is None:
        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")
        raise ValueError(f"Could not decode image file: {image_path}")

    # OpenCV loads pixels in BGR order; convert to RGB before OCR.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return pytesseract.image_to_string(img_rgb, lang="eng")

def ocr_pdf(pdf_path):
    """OCR every page of a PDF and return the page texts joined by spaces.

    Each page is rendered to a PNG inside a private temporary directory and
    passed through ``ocr_image``. The directory (and every page image in it)
    is removed when the function exits, including when OCR raises.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A single string: the OCR text of each page, joined with ``" "``.
    """
    import tempfile

    pages = convert_from_path(pdf_path)
    text_pages = []
    # A temporary directory avoids the hard-coded /content scratch path, name
    # collisions between runs, and leftover files after a failure.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for i, page in enumerate(pages):
            temp_img = os.path.join(tmp_dir, f"temp_page_{i}.png")
            page.save(temp_img, "PNG")
            text_pages.append(ocr_image(temp_img))
    return " ".join(text_pages)

def extract_text(input_path):
    """Return OCR text for ``input_path``, dispatching on its extension.

    Paths ending in ``.pdf`` (case-insensitive) go through ``ocr_pdf``;
    every other path is treated as an image and goes through ``ocr_image``.
    """
    is_pdf = input_path.lower().endswith(".pdf")
    if is_pdf:
        reader = ocr_pdf
    else:
        reader = ocr_image
    return reader(input_path)

# ============================================================
# 4️⃣ TRANSFORMER-BASED OCR + GRAMMAR CORRECTION
# ============================================================
# Model identifier passed to from_pretrained for the grammar-correction model.
grammar_model_name = "prithivida/grammar_error_correcter_v1"
# Tokenizer and seq2seq model are loaded once at cell execution and shared by
# correct_text. No .to(device) is applied here, so the model stays on the
# device it was loaded on.
grammar_tokenizer = AutoTokenizer.from_pretrained(grammar_model_name)
grammar_model = AutoModelForSeq2SeqLM.from_pretrained(grammar_model_name)

def correct_text(text, max_chunk_tokens=256):
    """Correct OCR/grammar errors in ``text`` with the grammar seq2seq model.

    The text is split on sentence boundaries and regrouped into chunks of at
    most ``max_chunk_tokens`` tokens, so that a long document is not silently
    cut off by the tokenizer's ``truncation=True``. Each chunk is corrected
    independently and the results are joined with single spaces.

    Args:
        text: Raw text to correct.
        max_chunk_tokens: Token budget per chunk sent to the model. A single
            sentence longer than the budget is still sent as one chunk (and
            may be truncated at the tokenizer's ``max_length``).

    Returns:
        The corrected text, or ``""`` when ``text`` is empty or whitespace.
    """
    # Nothing to correct: avoid asking the model to generate from an empty prompt.
    if not text or not text.strip():
        return ""

    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    # Greedily pack consecutive sentences into chunks under the token budget.
    chunks = []
    current = ""
    for sentence in sentences:
        candidate = f"{current} {sentence}" if current else sentence
        if current and len(grammar_tokenizer.encode(candidate)) > max_chunk_tokens:
            chunks.append(current)
            current = sentence
        else:
            current = candidate
    if current:
        chunks.append(current)

    corrected_chunks = []
    for chunk in chunks:
        inputs = grammar_tokenizer(chunk, return_tensors="pt", max_length=1024, truncation=True)
        outputs = grammar_model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=1024,
        )
        corrected_chunks.append(grammar_tokenizer.decode(outputs[0], skip_special_tokens=True))
    return " ".join(corrected_chunks)

# ============================================================
# 5️⃣ RESOURCE TYPE DETECTION MODEL
# ============================================================
# Target size images are resized to before being fed to the classifier.
IMG_SIZE = (224, 224)
# Labels indexed by the argmax of the classifier output.
# NOTE(review): presumably this order matches the label order used in training — confirm.
class_names = ['Books', 'Magazine', 'Newspapers']
# Keras model file stored on the mounted Google Drive; loaded once and shared
# by predict_resource_type.
type_model_path = "/content/drive/MyDrive/Image/book_magazine_newspaper_model_super_finetuned2.keras"
type_model = load_model(type_model_path)

def predict_resource_type(img_path):
    """Classify an image as one of ``class_names``.

    The image is loaded at ``IMG_SIZE``, scaled by 1/255, wrapped in a batch
    of one and passed to ``type_model``.

    Returns:
        A ``(label, confidence)`` tuple: the lower-cased class name at the
        argmax of the prediction, and the maximum prediction value as a float.
    """
    pil_img = image.load_img(img_path, target_size=IMG_SIZE)
    pixels = image.img_to_array(pil_img) / 255.0
    batch = np.expand_dims(pixels, 0)
    scores = type_model.predict(batch)
    best_label = class_names[np.argmax(scores)]
    confidence = float(np.max(scores))
    return best_label.lower(), confidence

# ============================================================
# 6️⃣ T5 SUMMARIZATION MODEL
# ============================================================
# Fine-tuned T5 checkpoint directory on the mounted Google Drive.
summ_model_path = "/content/drive/MyDrive/t5_summarization_model/checkpoint-1715"
# Tokenizer and model are loaded once and shared by summarize_text.
summ_tokenizer = T5Tokenizer.from_pretrained(summ_model_path)
summ_model = T5ForConditionalGeneration.from_pretrained(summ_model_path)

# Use the GPU when one is available; summarize_text moves its inputs to the
# same device.
device = "cuda" if torch.cuda.is_available() else "cpu"
summ_model.to(device)

def get_prefix(type_name):
    """Return the summarization prompt prefix for a resource type.

    Recognized types are ``"newspapers"``, ``"magazine"`` and ``"books"``;
    any other value gets the generic ``"summarize: "`` prefix.
    """
    prefix_table = (
        ("newspapers", "summarize: short summary: "),
        ("magazine", "summarize: medium summary: "),
        ("books", "summarize: long summary in detail: "),
    )
    for known_type, prefix in prefix_table:
        if type_name == known_type:
            return prefix
    return "summarize: "

def summarize_text(text, source_type):
    """Summarize ``text`` with the T5 model, using a type-specific prefix.

    Args:
        text: The text to summarize.
        source_type: Resource type passed to ``get_prefix`` to choose the
            prompt prefix.

    Returns:
        The decoded summary string, or ``""`` when ``text`` is empty or
        whitespace-only (the model is not called in that case).
    """
    # Without this guard the model would be asked to summarize the bare prefix.
    if not text or not text.strip():
        return ""

    input_text = get_prefix(source_type) + text

    # A single sequence needs no padding; padding it to 1024 tokens only adds
    # masked positions the encoder still has to process.
    inputs = summ_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    ).to(device)

    output_ids = summ_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=300,
        num_beams=4,
        early_stopping=True
    )
    return summ_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ============================================================
# 7️⃣ SPLIT NEWSPAPER INTO ARTICLES
# ============================================================
def split_into_articles(text):
    """Group the lines of ``text`` into article blocks.

    The text is split on runs of one or two newlines. A non-blank piece that
    starts with at least two uppercase letters or digits is treated as a
    headline and opens a new block; any other piece is appended to the
    current block with a single space. Blank pieces are ignored.

    Returns:
        A list of block strings, in order of appearance.
    """
    headline = re.compile(r'^[A-Z0-9]{2,}')
    articles = []
    pieces = []  # stripped lines belonging to the block being built

    for raw_piece in re.split(r'\n{1,2}', text):
        piece = raw_piece.strip()
        if not piece:
            continue
        if headline.match(piece) is None:
            pieces.append(piece)
            continue
        # Headline: close the block in progress (if any) and start a new one.
        if pieces:
            articles.append(" ".join(pieces))
        pieces = [piece]

    if pieces:
        articles.append(" ".join(pieces))
    return articles

# ============================================================
# 8️⃣ FULL PIPELINE (NO ATHLETES/MEDALS)
# ============================================================
def _predict_pdf_resource_type(pdf_path):
    """Classify a PDF by rendering only its first page to a temporary JPEG."""
    import tempfile

    # Render page 1 only instead of converting the whole document again.
    first_page = convert_from_path(pdf_path, first_page=1, last_page=1)[0]
    # The temporary directory is removed even if classification raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_img = os.path.join(tmp_dir, "temp_detect.jpg")
        first_page.save(temp_img, "JPEG")
        return predict_resource_type(temp_img)


def process_document(input_path):
    """Run the full OCR -> classify -> correct -> summarize pipeline.

    Args:
        input_path: Path to an image or a ``.pdf`` file.

    Returns:
        A JSON string (``ensure_ascii=False``, ``indent=2``) with the keys
        ``resource_type``, ``confidence``, ``extracted_text`` and
        ``summaries``. For newspapers, ``extracted_text`` is the corrected
        articles joined by blank lines and ``summaries`` holds one entry per
        article; otherwise there is a single summary of the whole text.
    """
    # Step 1: Extract raw text
    raw_text = extract_text(input_path)

    # Step 2: Detect resource type (PDFs are classified from their first page)
    if input_path.lower().endswith(".pdf"):
        resource_type, conf = _predict_pdf_resource_type(input_path)
    else:
        resource_type, conf = predict_resource_type(input_path)

    # Step 3: Split if newspaper. This is done on the raw OCR text because
    # split_into_articles relies on newline boundaries.
    # NOTE(review): the corrected text appears to come back as a single line
    # with no newlines, which would make splitting it a no-op — confirm.
    if resource_type == "newspapers":
        raw_articles = split_into_articles(raw_text)
    else:
        raw_articles = [raw_text]

    # Step 4: Correct OCR + grammar per article
    articles = [correct_text(art) for art in raw_articles]
    corrected_text = "\n\n".join(articles)

    # Step 5: Summaries (blank articles are skipped rather than summarized)
    summaries = [summarize_text(art, resource_type) for art in articles if art.strip()]

    # Step 6: FINAL JSON (clean)
    final_output = {
        "resource_type": resource_type,
        "confidence": conf,
        "extracted_text": corrected_text,
        "summaries": summaries
    }

    return json.dumps(final_output, ensure_ascii=False, indent=2)

# ============================================================
# 9️⃣ RUN EXAMPLE
# ============================================================
# Example input: an image file in the Colab working directory.
input_file = "/content/page_151.jpg"
# process_document returns a JSON string, which is printed as-is.
result = process_document(input_file)
print(result)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
{
  "resource_type": "newspapers",
  "confidence": 0.6855810284614563,
  "extracted_text": "The President said that the Bank of Botswana, during their 50th Anniversary, unveiled a commemorative P50 note bearing the names of the athletes, and that the team was a living example of resilience. “And so, on that rainy night in Tokyo, when others feared the",
  "summaries": [
    "During their 50th anniversary, the Bank of Botswana unveiled a commemorative P50 note bearing the names of the athletes. The team was a living example of resilience, and on that rainy night in Tokyo, others feared the rain."
  ]
}
