In [1]:
!pip install pdfplumber python-docx Pillow transformers -q
!apt-get install -y poppler-utils tesseract-ocr -qq
!pip install pytesseract -q



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hSelecting previously unselected package poppler-utils.
(Reading database ... 126319 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0

In [None]:
import pytesseract
from PIL import Image
import docx
import pdfplumber
import io
import json
import base64

from transformers import pipeline
from google.colab import files


In [None]:
# Build the summarization pipeline once; generate_metadata() reuses this object.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)


In [None]:
def extract_text(file_name, file_type):
    """Extract plain text from a file, choosing the reader by extension.

    Args:
        file_name: Path of the file to read.
        file_type: File extension such as ".pdf". Matching is
            case-insensitive and a missing leading dot is tolerated,
            so ".PDF" and "pdf" both select the PDF reader.

    Returns:
        The extracted text as a string, or None when the extension is not
        one of .pdf, .docx, .png, .jpg, .jpeg or .txt.
    """
    # Normalise once so every branch below can compare against one form.
    ext = (file_type or "").strip().lower()
    if ext and not ext.startswith("."):
        ext = "." + ext

    if ext == ".pdf":
        with pdfplumber.open(file_name) as pdf:
            # `or ""` guards against a page that yields no text.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)

    if ext == ".docx":
        # Only the entries of `doc.paragraphs` are read.
        document = docx.Document(file_name)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    if ext in (".png", ".jpg", ".jpeg"):
        # Context manager closes the underlying image file after OCR.
        with Image.open(file_name) as img:
            return pytesseract.image_to_string(img)

    if ext == ".txt":
        # utf-8-sig drops a leading BOM if present; errors="replace" keeps a
        # file with stray non-UTF-8 bytes readable instead of raising.
        with open(file_name, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()

    return None


In [None]:
# Characters trimmed from both ends of a token before it is used as a keyword.
_KEYWORD_STRIP_CHARS = ".,;:!?\"'()[]{}<>"

# Very common English words that carry no topical meaning as keywords.
_KEYWORD_STOPWORDS = frozenset({
    "a", "an", "and", "are", "as", "at", "be", "been", "but", "by", "for",
    "from", "had", "has", "have", "he", "her", "his", "i", "in", "into", "is",
    "it", "its", "not", "of", "on", "or", "she", "that", "the", "their",
    "they", "this", "to", "was", "we", "were", "which", "who", "will", "with",
    "you",
})


def generate_metadata(text):
    """Summarise `text` and derive simple metadata from the summary.

    Args:
        text: Document text. Only the first 3000 characters are sent to the
            module-level `summarizer`.

    Returns:
        A dict with the keys "title" (the summary up to its first period),
        "summary", "keywords" (at most 10 lowercase words in order of first
        appearance in the summary), "author" and "date" (both "Unknown").
    """
    short_text = text[:3000]  # Reduce input for summarizer
    # truncation=True lets the pipeline cut the input to the model's limit
    # instead of failing when the character slice is still too many tokens.
    result = summarizer(
        short_text,
        max_length=120,
        min_length=30,
        do_sample=False,
        truncation=True,
    )
    summary = result[0]['summary_text'].strip()

    # Ordered de-duplication: iterating a set would make both the order and
    # the choice of the 10 keywords vary from run to run.
    keywords = []
    seen = set()
    for token in summary.lower().split():
        word = token.strip(_KEYWORD_STRIP_CHARS)
        if not word or word in _KEYWORD_STOPWORDS or word in seen:
            continue
        seen.add(word)
        keywords.append(word)

    metadata = {
        "title": summary.split('.')[0],
        "summary": summary,
        "keywords": keywords[:10],
        "author": "Unknown",
        "date": "Unknown"
    }
    return metadata


In [None]:
uploaded = files.upload()

if not uploaded:
    # A cancelled upload returns no entries. Drop any name left over from an
    # earlier run so the processing cell cannot silently reuse a stale file.
    globals().pop("file_name", None)
    print("❌ No file was uploaded. Please run this cell again and choose a file.")
else:
    # Only the first uploaded file is processed.
    file_name = next(iter(uploaded))
    print(f"📁 Uploaded: {file_name}")
    if len(uploaded) > 1:
        print(f"ℹ️ {len(uploaded) - 1} additional file(s) were uploaded and will be ignored.")





In [None]:
import os

# Extractions shorter than this (after trimming whitespace) are treated as
# failed, since there is too little text to summarise.
MIN_TEXT_CHARS = 20

if 'file_name' not in globals():
    print("❌ File was not uploaded correctly. Please re-upload.")
else:
    # splitext always returns a (root, ext) pair; ext is "" when there is no
    # extension (e.g. ".bashrc") and "." for a name that ends in a dot.
    _, ext = os.path.splitext(file_name)

    if ext in ("", "."):
        print("❌ File does not seem to have an extension.")
    else:
        text = extract_text(file_name, ext.lower())

        if text is None:
            # extract_text returns None only for extensions it has no reader for.
            print("❌ File name does not have a valid extension.")
        elif len(text.strip()) < MIN_TEXT_CHARS:
            print("❌ Could not extract enough text. Try another file.")
        else:
            metadata = generate_metadata(text)
            print("✅ Metadata Generated Successfully:\n")
            print(json.dumps(metadata, indent=4))

            # Save and download metadata
            with open("metadata.json", "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=4)

            files.download("metadata.json")

