<a href="https://colab.research.google.com/github/ray-islam/generativeAI/blob/main/Simple_LLM_Report_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Environment reset: remove numpy, the Hugging Face packages, and
# scipy / scikit-learn / pandas, then purge pip's cache.
# NOTE(review): scipy, scikit-learn and pandas are uninstalled here but the
# install cell that follows does not reinstall them; confirm that nothing
# needed later in the session depends on them.
!pip -q uninstall -y numpy
!pip -q uninstall -y transformers tokenizers safetensors accelerate
!pip -q uninstall -y scipy scikit-learn pandas
!pip -q cache purge

[0m

In [10]:
# Install pinned versions of numpy, the Hugging Face stack (transformers,
# tokenizers, accelerate, safetensors) and pypdf, bypassing pip's cache.
# NOTE(review): the version-check cell further down printed numpy 1.26.4,
# not the 2.1.3 pinned here; confirm the pin actually takes effect.
!pip -q install --no-cache-dir "numpy==2.1.3"
!pip -q install --no-cache-dir "transformers==4.44.2" "tokenizers==0.19.1" "accelerate==0.33.0" "safetensors>=0.4.3" "pypdf==4.2.0"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m236.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.1 requires pandas>=0.25.0, which is not installed.
tsfresh 0.21.1 requires scikit-learn>=0.22.0, which is not installed.
tsfresh 0.21.1 requires scipy>=1.14.0; python_version >= "3.10", which is not installed.
tensorflow-decision-forests 1.12.0 requires pandas, which is not installed.
pointpats 2.5.2 requires pandas!=1.5.0,>=1.4, which is not installed.
pointpats 2.5.2 requires scipy>=1.10, which is not installed.
cmdstanpy 1.3.0 requires pandas, which is not installed.
cvxpy 1.6.7 requires scipy>=1.11.0, which is not installed.
datasets 4.0.0 requires pa

In [None]:
# Send signal 9 to the current process, i.e. kill this kernel; presumably
# done so the runtime restarts and picks up the packages installed above.
# NOTE(review): this stops a "Run all"; the cells below have to be run after
# the restart. `sys` is imported but not used in this cell.
import os, sys
os.kill(os.getpid(), 9)

In [1]:
import numpy as np
import transformers
import tokenizers
import accelerate

# Report the versions that were actually imported after the reinstall.
for label, module in (
    ("numpy:", np),
    ("transformers:", transformers),
    ("tokenizers:", tokenizers),
    ("accelerate:", accelerate),
):
    print(label, module.__version__)


numpy: 1.26.4
transformers: 4.44.2
tokenizers: 0.19.1
accelerate: 0.33.0


In [2]:
import re, time, textwrap
from datetime import datetime
from pathlib import Path

import torch
from pypdf import PdfReader
from transformers import pipeline, AutoTokenizer


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [3]:
def clean_text(s: str) -> str:
    """Flatten raw text into a single tidy line.

    NUL characters become spaces, every run of whitespace is collapsed to
    one space, and leading/trailing whitespace is removed.
    """
    without_nuls = s.replace("\x00", " ")
    single_spaced = re.sub(r"\s+", " ", without_nuls)
    return single_spaced.strip()

def read_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    A page whose ``extract_text()`` result is falsy (``None`` or ``""``)
    contributes an empty string, so the page count is preserved.
    """
    document = PdfReader(pdf_path)
    return "\n".join(page.extract_text() or "" for page in document.pages)

def read_txt_text(txt_path: str) -> str:
    """Read a text file as UTF-8, silently dropping undecodable bytes."""
    source = Path(txt_path)
    with source.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

def find_files(base_dir="/content", exts=(".pdf", ".txt")):
    """Recursively list the files under ``base_dir`` with a wanted extension.

    Matching is case-insensitive, so ``REPORT.PDF`` is found for ``".pdf"``.
    Only regular files are returned; a directory whose name happens to end
    in one of the extensions is ignored, since it cannot be read as a
    document.

    Args:
        base_dir: Directory to search (searched recursively).
        exts: Filename endings to accept, e.g. ``(".pdf", ".txt")``.

    Returns:
        A sorted list of ``pathlib.Path`` objects, each listed once.
    """
    wanted = tuple(ext.lower() for ext in exts)
    base = Path(base_dir)
    # One walk over the tree; str.endswith accepts a tuple of suffixes.
    return sorted(
        candidate
        for candidate in base.rglob("*")
        if candidate.is_file() and candidate.name.lower().endswith(wanted)
    )

def choose_file_interactive(base_dir="/content"):
    """List the .pdf/.txt files under ``base_dir`` and let the user pick one.

    The candidates are printed with 1-based numbers and the user is asked
    to type one of them.

    Returns:
        The chosen path as a string.

    Raises:
        FileNotFoundError: No .pdf/.txt file exists under ``base_dir``.
        ValueError: The reply is not an integer, or is outside the list.
    """
    candidates = find_files(base_dir=base_dir, exts=(".pdf", ".txt"))
    if not candidates:
        raise FileNotFoundError(f"No .pdf/.txt found under {base_dir}. Upload in the Files panel.")

    print("Found files:\n")
    for position, candidate in enumerate(candidates, start=1):
        print(f"{position:>2}. {candidate}")

    reply = input("\nEnter file number to summarize: ")
    choice = int(reply.strip())
    if not (1 <= choice <= len(candidates)):
        raise ValueError("Invalid selection.")
    return str(candidates[choice - 1])


In [4]:
# Summarization checkpoint, fetched from the Hugging Face Hub on first use.
MODEL_ID = "facebook/bart-large-cnn"

# transformers pipelines take a CUDA device index, or -1 to run on CPU.
cuda_available = torch.cuda.is_available()
device = 0 if cuda_available else -1
print("CUDA available:", cuda_available)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

summarizer = pipeline(
    "summarization",
    model=MODEL_ID,
    tokenizer=tokenizer,
    device=device
)

# The tokenizer loads without a usable maximum length (a run of this notebook
# logged "Asking to truncate to max_length but no maximum length is
# provided ... Default to no truncation"), which turns `truncation=True` in
# the summarizer calls into a no-op. Cap it at the model's positional limit
# so over-long inputs are truncated instead of overflowing the model.
# Side effect: encoding a whole document without truncation may now log a
# "sequence length is longer than the specified maximum" warning; that is
# only informational.
max_positions = getattr(summarizer.model.config, "max_position_embeddings", None)
if max_positions is not None:
    tokenizer.model_max_length = min(tokenizer.model_max_length, max_positions)
    # Normally the same object as `tokenizer`; set explicitly to be safe.
    summarizer.tokenizer.model_max_length = tokenizer.model_max_length


CUDA available: False


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [5]:
def chunk_by_tokens(text: str, tokenizer, max_input_tokens: int = 900, overlap_tokens: int = 60):
    """Split ``text`` into overlapping chunks of at most ``max_input_tokens`` tokens.

    The text is cleaned with ``clean_text``, encoded without special tokens,
    cut into windows that each start ``max_input_tokens - overlap_tokens``
    tokens after the previous one, and every window is decoded back to a
    stripped string.

    Args:
        text: Raw document text.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and ``decode(ids, skip_special_tokens=True)``.
        max_input_tokens: Maximum number of tokens per chunk (>= 1).
        overlap_tokens: Tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap_tokens < max_input_tokens``.

    Returns:
        A list of chunk strings; empty when the cleaned text is empty.

    Raises:
        ValueError: The window arguments would stall the loop (overlap at
            least as large as the window, or a non-positive window) or skip
            tokens (negative overlap).
    """
    if max_input_tokens < 1:
        raise ValueError("max_input_tokens must be at least 1.")
    if not 0 <= overlap_tokens < max_input_tokens:
        raise ValueError("overlap_tokens must satisfy 0 <= overlap_tokens < max_input_tokens.")

    text = clean_text(text)
    if not text:
        return []

    ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(ids)
    stride = max_input_tokens - overlap_tokens  # > 0 thanks to the checks above

    chunks = []
    for start in range(0, total, stride):
        end = min(start + max_input_tokens, total)
        piece = tokenizer.decode(ids[start:end], skip_special_tokens=True)
        chunks.append(piece.strip())
        if end == total:
            # This window reached the last token; a further window would
            # only repeat the overlap.
            break
    return chunks

def summarize_chunks(summarizer, chunks, max_len=160, min_len=60, sleep_s=0.0, min_chars=200):
    """Summarize each chunk with ``summarizer`` and collect the results.

    Args:
        summarizer: Callable invoked as ``summarizer(text, max_length=...,
            min_length=..., do_sample=False, truncation=True)`` and returning
            a sequence whose first item has a ``"summary_text"`` key.
        chunks: Iterable of chunk strings.
        max_len: Passed to the summarizer as ``max_length``.
        min_len: Passed to the summarizer as ``min_length``.
        sleep_s: Seconds to pause after each summarized chunk; a falsy value
            disables the pause.
        min_chars: Chunks shorter than this many characters are skipped
            rather than summarized. Defaults to 200.

    Returns:
        The stripped summaries of the chunks that were not skipped, in order.
    """
    chunks = list(chunks)  # accept any iterable; the total is needed for progress output
    total = len(chunks)
    summaries = []
    for index, chunk in enumerate(chunks, 1):
        if len(chunk) < min_chars:
            # Report the skip so dropped content is visible in the log.
            print(f"- Chunk {index}/{total} skipped (under {min_chars} characters).")
            continue
        result = summarizer(
            chunk,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True
        )[0]["summary_text"]
        summaries.append(result.strip())
        print(f"✓ Chunk {index}/{total} summarized.")
        if sleep_s:
            time.sleep(sleep_s)
    return summaries

def final_compress(summarizer, text: str, tokenizer, max_input_tokens=700):
    """Run one more summarization pass over already-summarized text.

    The text is cleaned, split with ``chunk_by_tokens`` (40-token overlap),
    each piece is summarized with ``summarize_chunks`` (max 200 / min 80),
    and the results are joined and cleaned.

    Args:
        summarizer: Summarizer callable forwarded to ``summarize_chunks``.
        text: Text to compress, typically the joined chunk summaries.
        tokenizer: Tokenizer forwarded to ``chunk_by_tokens``.
        max_input_tokens: Maximum tokens per piece for this pass.

    Returns:
        The compressed text. ``""`` when the cleaned input is empty. When
        ``summarize_chunks`` yields nothing (it skips short pieces), the
        cleaned input is returned unchanged instead of an empty string.
    """
    text = clean_text(text)
    if not text:
        return ""
    chunks = chunk_by_tokens(text, tokenizer, max_input_tokens=max_input_tokens, overlap_tokens=40)
    finals = summarize_chunks(summarizer, chunks, max_len=200, min_len=80)
    if not finals:
        # Every piece was skipped as too short; the input is the best
        # summary available, so keep it rather than returning nothing.
        return text
    return clean_text(" ".join(finals))


In [7]:
from google.colab import files

# ---- 1. Choose and load the document ---------------------------------------
path = choose_file_interactive("/content")
print("\nSelected:", path)

# Dispatch on the (case-insensitive) extension of the selected file.
if path.lower().endswith(".pdf"):
    raw_text = read_pdf_text(path)
elif path.lower().endswith(".txt"):
    raw_text = read_txt_text(path)
else:
    raise ValueError("Pick a .pdf or .txt")

raw_text = clean_text(raw_text)
if not raw_text:
    # Without this guard the rest of the cell would write empty files and
    # "summarize" nothing but the instruction prompt.
    raise ValueError(f"No text could be extracted from {path}; nothing to summarize.")
print(f"\nLoaded {len(raw_text):,} characters.")
print(raw_text[:800] + ("..." if len(raw_text) > 800 else ""))

# ---- 2. Summarize the document chunk by chunk -------------------------------
CHUNK_INPUT_TOKENS = 900
chunks = chunk_by_tokens(raw_text, tokenizer, max_input_tokens=CHUNK_INPUT_TOKENS, overlap_tokens=60)
print("\nTotal chunks:", len(chunks))

chunk_summaries = summarize_chunks(summarizer, chunks, max_len=160, min_len=60)
if not chunk_summaries:
    # summarize_chunks skips chunks shorter than 200 characters, so a very
    # short document produces no summaries at all.
    raise ValueError("The document is too short to summarize: every chunk was skipped.")

intermediate = "\n\n".join(chunk_summaries)
print("\n--- Intermediate (first 800 chars) ---\n")
print(intermediate[:800] + ("..." if len(intermediate) > 800 else ""))

# ---- 3. Compress the chunk summaries into one executive summary -------------
final_summary = final_compress(summarizer, intermediate, tokenizer, max_input_tokens=700)
if not final_summary:
    # The second pass produced nothing; keep the first-pass summaries
    # instead of an empty result.
    final_summary = clean_text(intermediate)

print("\n======= EXECUTIVE SUMMARY =======\n")
print(textwrap.fill(final_summary, 100))

# ---- 4. Headline and bullets -------------------------------------------------
# NOTE(review): the summarization pipeline treats this whole prompt as text to
# condense; it is not known to follow the embedded instruction. Confirm the
# output really is a headline plus bullets, or use an instruction-tuned model.
prompt = (
    "HEADLINE AND BULLETS:\n"
    + final_summary
    + "\n\nWrite a short headline and 5 bullet points with key findings."
)
headline_bullets = summarizer(prompt, max_length=220, min_length=120, do_sample=False, truncation=True)[0]["summary_text"]

# ---- 5. Save the three artifacts and offer them for download ----------------
# The files land in the current working directory, named after the source
# file plus a timestamp so repeated runs do not overwrite each other.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
base = Path(path).stem

out_exec = f"{base}_executive_summary_{stamp}.txt"
out_chunks = f"{base}_chunk_summaries_{stamp}.txt"
out_hb = f"{base}_headline_bullets_{stamp}.txt"

Path(out_exec).write_text(final_summary, encoding="utf-8")
Path(out_chunks).write_text(intermediate, encoding="utf-8")
Path(out_hb).write_text(headline_bullets, encoding="utf-8")

files.download(out_exec)
files.download(out_chunks)
files.download(out_hb)


Found files:

 1. /content/AI-CS-Detailed-Technical-Workshop-Report-2020.pdf

Enter file number to summarize: 1

Selected: /content/AI-CS-Detailed-Technical-Workshop-Report-2020.pdf

Loaded 76,708 characters.
Artificial Intelligence and Cybersecurity : A Detailed Technical Workshop Report A 2019 WORKSHOP REPORT The Networking & Information Technology R &D Program June 2020 NITRD Artificial Intelligence and Cybersecurity: 2019 Workshop Report i Table of Contents Executive Summary ................................ ................................ ................................ ............................ ii Introduction ................................ ................................ ................................ ................................ ......1 Security of AI ................................ ................................ ................................ ................................ .....1 Specification and Verification of AI Systems ................................ 

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Total chunks: 17
✓ Chunk 1/17 summarized.
✓ Chunk 2/17 summarized.
✓ Chunk 3/17 summarized.
✓ Chunk 4/17 summarized.
✓ Chunk 5/17 summarized.
✓ Chunk 6/17 summarized.
✓ Chunk 7/17 summarized.
✓ Chunk 8/17 summarized.
✓ Chunk 9/17 summarized.
✓ Chunk 10/17 summarized.
✓ Chunk 11/17 summarized.
✓ Chunk 12/17 summarized.
✓ Chunk 13/17 summarized.
✓ Chunk 14/17 summarized.
✓ Chunk 15/17 summarized.
✓ Chunk 16/17 summarized.
✓ Chunk 17/17 summarized.

--- Intermediate (first 800 chars) ---

The National Information Technology and Networking Research and Development (NITRD) Program’s Art ificial Intelligence Research and. Development ( R&D ) and Cyber Security and Information Assurance Interagency Working Groups (IWG ) held a workshop1 to assess the research challenges and opportunities at the intersection of cybersecurity and artificial intelligence (AI) Technology is at an inflection point in history. AI investments must advance the theory and practice of secure AI -enabled system constru

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>