In [1]:
# Install required modules
!pip install easyocr
!pip install PyMuPDF
!pip install fpdf
!pip install openai




In [2]:
!pip install google-generativeai



In [3]:
!pip install spellchecker



In [4]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
!pip install pyspellchecker



In [6]:
!pip install textstat



In [7]:
!pip install nltk



In [8]:
!pip install --upgrade pip setuptools wheel



In [9]:
!pip install indexer

Collecting indexer
  Downloading indexer-0.6.2.tar.gz (14 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [11]:
import easyocr
from fpdf import FPDF
import fitz  # PyMuPDF
from google.colab import files
import google.generativeai as genai
import re
import nltk
import spacy
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from textstat import textstat

# Download necessary NLTK resources with explicit download path
print("📦 Downloading NLTK resources...")
nltk.download('punkt')  # Just punkt, not punkt_tab
nltk.download('stopwords')
nltk.download('wordnet')
print("✅ NLTK resources downloaded")

# Make sure we're using the right tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize

# Load spaCy model
try:
    nlp = spacy.load('en_core_web_sm')
    print("✅ spaCy model loaded")
except Exception as e:
    print(f"⚠️ spaCy model error: {e}")
    print("Installing spaCy model...")
    import os
    os.system('python -m spacy download en_core_web_sm')
    try:
        nlp = spacy.load('en_core_web_sm')
        print("✅ spaCy model loaded after installation")
    except:
        print("❌ Could not load spaCy model")
        # Fallback option
        nlp = None

# Initialize spellchecker
spell = SpellChecker()

# --- TEXT EXTRACTION FUNCTIONS ---

def extract_text_from_image(image_path):
    """Extract text from an image using EasyOCR"""
    reader = easyocr.Reader(['en'], gpu=False)
    result = reader.readtext(image_path, detail=0, paragraph=True)
    return "\n".join(result)

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF using PyMuPDF"""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

# --- NLP PROCESSING FUNCTIONS ---

import re

def clean_ocr_text(text):
    """Clean OCR-extracted text by fixing common character errors and formatting issues."""

    # --- STEP 1: Basic OCR format fixes ---
    text = re.sub(r'([A-Za-z])_([A-Za-z])', r'\1 \2', text)  # Underscores between letters → space
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)          # Lowercase followed by uppercase → space

    # --- STEP 2: Common static replacements ---
    replacements = {
        'Fo': 'to',
        'Fhe': 'the',
        'Fhis': 'this',
        'Fhat': 'that',
        'Falking': 'talking',
        'Fearning': 'learning',
        'Fhings': 'things',
        'Talso': 'I also',
        'Fhink': 'think',
        'lfe': 'life',
        '&th': '8th',
        'Zgo': 'I go',
        '1 am': 'I am',

        # Additional likely OCR-based typos
        'Teh': 'The',
        'Ths': 'This',
        'Fere': 'There',
        'Fom': 'From',
        'Fime': 'Time',
        'Foday': 'Today',
        'Fey': 'They',
        'Tey': 'They',
        'Ferefore': 'Therefore',
        'Frue': 'True',
        'Fust': 'Just',
        'l': 'I',  # lowercase L often used for uppercase I
        'i m': 'I am',
        'ive': 'I have',
        'i ve': 'I have',
        'thw': 'the',
        'thid': 'this',
        'thar': 'that',
        'woukd': 'would',
        'coud': 'could',
        'shoukd': 'should',
        'Sth': '8th',
        'gth': '9th',
        'Oth': '10th'
    }

    for wrong, right in replacements.items():
        text = text.replace(wrong, right)

    # --- STEP 3: Regex-based dynamic corrections ---
    # Replace 'F' + vowel-starting word → 'Th'
    text = re.sub(r'\bF(?=[aeiouAEIOU])', 'Th', text)

    # Replace '1' at word beginning with 'I' (common OCR confusion)
    text = re.sub(r'\b1(?=\w+)', 'I', text)

    # Replace standalone lowercase l with uppercase I
    text = re.sub(r'\bl\b', 'I', text)

    # Fix weird ordinal numbers (e.g., &th)
    text = re.sub(r'&th', '8th', text)

    # Fix "Z" at start of word, usually meant to be "I"
    text = re.sub(r'\bZ(?=\w+)', 'I', text)

    # --- STEP 4: Whitespace cleanup ---
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)

    return text.strip()


def correct_spelling(text):
    """Correct spelling errors in the text"""
    # Simple tokenization using Python's split to avoid NLTK issues
    words = text.split()
    corrected_words = []

    for word in words:
        # Only check words with letters (not numbers or punctuation alone)
        if re.search('[a-zA-Z]', word):
            # Remove punctuation attached to the word for spell checking
            clean_word = re.sub(r'[^\w\s]', '', word)

            if clean_word and clean_word.lower() not in ['i', 'a', 'an', 'the', 'and', 'or', 'but', 'to', 'for', 'in', 'on', 'at', 'by']:
                misspelled = spell.unknown([clean_word])
                if misspelled:
                    # Get the most likely correction
                    correction = spell.correction(clean_word)
                    if correction:
                        # Replace just the word part, maintaining original punctuation
                        corrected_word = word.replace(clean_word, correction)
                        corrected_words.append(corrected_word)
                    else:
                        corrected_words.append(word)
                else:
                    corrected_words.append(word)
            else:
                corrected_words.append(word)
        else:
            corrected_words.append(word)

    return ' '.join(corrected_words)

def fix_grammar_with_spacy(text):
    """Use spaCy for basic grammar correction"""
    if nlp is None:
        # Fallback basic correction if spaCy is not available
        sentences = re.split(r'(?<=[.!?])\s+', text)
        corrected_sentences = []

        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:
                # Capitalize first letter
                if sentence[0].isalpha() and not sentence[0].isupper():
                    sentence = sentence[0].upper() + sentence[1:]

                # Add period if missing ending punctuation
                if sentence[-1] not in ['.', '!', '?']:
                    sentence += '.'

                corrected_sentences.append(sentence)

        return ' '.join(corrected_sentences)

    try:
        doc = nlp(text)
        sentences = []

        for sent in doc.sents:
            # Convert to string and capitalize first letter
            sentence = sent.text.strip()
            if sentence:
                if sentence[0].isalpha() and not sentence[0].isupper():
                    sentence = sentence[0].upper() + sentence[1:]

                # Make sure sentence ends with punctuation
                if sentence[-1] not in ['.', '!', '?']:
                    sentence += '.'

                sentences.append(sentence)

        return ' '.join(sentences)
    except Exception as e:
        print(f"⚠️ Grammar correction error: {e}")
        return text  # Return original text if there's an error

def analyze_text_complexity(text):
    """Analyze readability of the text"""
    if len(text.strip()) == 0:
        return "No text to analyze"

    try:
        results = {
            "flesch_reading_ease": textstat.flesch_reading_ease(text),
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "automated_readability_index": textstat.automated_readability_index(text)
        }

        analysis = f"Readability Analysis:\n"
        analysis += f"- Flesch Reading Ease: {results['flesch_reading_ease']:.1f}/100 "

        if results['flesch_reading_ease'] > 90:
            analysis += "(Very Easy to Read)\n"
        elif results['flesch_reading_ease'] > 80:
            analysis += "(Easy to Read)\n"
        elif results['flesch_reading_ease'] > 70:
            analysis += "(Fairly Easy to Read)\n"
        elif results['flesch_reading_ease'] > 60:
            analysis += "(Standard/Plain English)\n"
        elif results['flesch_reading_ease'] > 50:
            analysis += "(Fairly Difficult to Read)\n"
        elif results['flesch_reading_ease'] > 30:
            analysis += "(Difficult to Read)\n"
        else:
            analysis += "(Very Difficult to Read)\n"

        analysis += f"- Grade Level: {results['flesch_kincaid_grade']:.1f}\n"
        analysis += f"- Automated Readability Index: {results['automated_readability_index']:.1f}"

        return analysis
    except Exception as e:
        print(f"⚠️ Readability analysis error: {e}")
        return "Unable to analyze text complexity"

def extract_key_entities(text):
    """Extract named entities from the text using spaCy"""
    if nlp is None:
        return "Entity extraction not available (spaCy model not loaded)"

    try:
        doc = nlp(text)
        entities = {}

        for ent in doc.ents:
            entity_type = ent.label_
            if entity_type not in entities:
                entities[entity_type] = []
            if ent.text not in entities[entity_type]:
                entities[entity_type].append(ent.text)

        if not entities:
            return "No named entities found in the text"

        result = "Named Entities:\n"
        for entity_type, items in entities.items():
            result += f"- {entity_type}: {', '.join(items)}\n"

        return result
    except Exception as e:
        print(f"⚠️ Entity extraction error: {e}")
        return "Unable to extract entities"

def process_text_with_nlp(text):
    """Process text using various NLP techniques"""
    print("1. Cleaning OCR artifacts...")
    # Initial cleaning of OCR artifacts
    text = clean_ocr_text(text)

    print("2. Correcting spelling...")
    # Correct spelling
    text = correct_spelling(text)

    print("3. Fixing grammar...")
    # Fix basic grammar issues
    text = fix_grammar_with_spacy(text)

    print("4. Analyzing text complexity...")
    # Additional analysis
    complexity_analysis = analyze_text_complexity(text)

    print("5. Extracting entities...")
    entity_analysis = extract_key_entities(text)

    # Return processed text and analysis
    return {
        "processed_text": text,
        "complexity_analysis": complexity_analysis,
        "entity_analysis": entity_analysis
    }

# --- PDF GENERATION FUNCTION ---

def create_pdf_from_text(text, analysis, output_pdf):
    """Create a PDF from extracted text and analysis"""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)

    # Add original processed text
    pdf.set_font("Arial", 'B', size=14)
    pdf.cell(0, 10, "Processed Text", ln=True)
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, text)

    # Add analysis section
    pdf.add_page()
    pdf.set_font("Arial", 'B', size=14)
    pdf.cell(0, 10, "Text Analysis", ln=True)

    pdf.set_font("Arial", 'B', size=12)
    pdf.cell(0, 10, "Complexity Analysis", ln=True)
    pdf.set_font("Arial", size=11)
    pdf.multi_cell(0, 8, analysis["complexity_analysis"])

    pdf.set_font("Arial", 'B', size=12)
    pdf.cell(0, 10, "Entity Analysis", ln=True)
    pdf.set_font("Arial", size=11)
    pdf.multi_cell(0, 8, analysis["entity_analysis"])

    # Add AI suggestions if available
    if "ai_suggestions" in analysis:
        pdf.add_page()
        pdf.set_font("Arial", 'B', size=14)
        pdf.cell(0, 10, "AI Improvement Suggestions", ln=True)
        pdf.set_font("Arial", size=12)
        pdf.multi_cell(0, 10, analysis["ai_suggestions"])

    pdf.output(output_pdf)

# --- GEMINI IMPROVEMENT SUGGESTIONS ---
def suggest_improvements_with_gemini(extracted_text, api_key):
    """
    Prompt the user to enter a custom instruction, then use Gemini API
    to generate suggestions or improvements on the extracted text.
    """
    # Ask user to enter their prompt
    print("\n📥 Please enter your instruction or prompt for Gemini (e.g., 'Fix grammar and summarize'):")
    user_prompt = input("📝 Your Prompt: ").strip()

    genai.configure(api_key=api_key)

    try:
        model = genai.GenerativeModel('gemini-2.0-flash')

        # Limit extracted text to avoid overload
        trimmed_text = extracted_text[:1500]

        # Create the full prompt
        full_prompt = f"""{user_prompt}

OCR-extracted text:
{trimmed_text}
"""

        # Generate content
        response = model.generate_content(full_prompt)
        return response.text

    except Exception as e:
        print(f"⚠️ Gemini API Error: {e}")
        return f"Unable to generate suggestions using Gemini. Error: {str(e)}"


# --- MAIN PROCESSING FUNCTION ---

def process_document(file_path, file_type, output_pdf, api_key=None):
    """Extract text, process with NLP, and generate a PDF from it"""
    # Extract raw text from document
    if file_type == 'image':
        extracted_text = extract_text_from_image(file_path)
    elif file_type == 'pdf':
        extracted_text = extract_text_from_pdf(file_path)
    else:
        extracted_text = ""
        print("❌ Unsupported file type")
        return None

    print("\n=== 📝 Raw Extracted Text ===")
    print(extracted_text[:500] + "..." if len(extracted_text) > 500 else extracted_text)

    # Process text with NLP
    print("\n=== 🧠 Processing with NLP... ===")
    nlp_results = process_text_with_nlp(extracted_text)

    processed_text = nlp_results["processed_text"]
    print("\n=== 📝 NLP Processed Text ===")
    print(processed_text[:500] + "..." if len(processed_text) > 500 else processed_text)

    # Add complexity and entity analysis
    print("\n=== 📊 Text Analysis ===")
    print(nlp_results["complexity_analysis"])
    print("\n" + nlp_results["entity_analysis"])

    analysis_results = nlp_results

    # Get AI suggestions if API key is provided
    if api_key:
        print("\n=== 🤖 Requesting AI suggestions... ===")
        suggestions = suggest_improvements_with_gemini(processed_text, api_key)
        analysis_results["ai_suggestions"] = suggestions
        print("\n=== 💡 AI Suggestions ===")
        print(suggestions)

    # Create and save PDF
    create_pdf_from_text(processed_text, analysis_results, output_pdf)

    return {
        "raw_text": extracted_text,
        "processed_text": processed_text,
        "analysis": analysis_results
    }

# --- INTERACTIVE EXECUTION ---

# Install required packages if they're not already installed
try:
    import pkg_resources
    required_packages = ['spacy', 'textstat', 'pyspellchecker']
    installed = {pkg.key for pkg in pkg_resources.working_set}
    missing = [pkg for pkg in required_packages if pkg.lower() not in installed]

    if missing:
        print(f"📦 Installing missing packages: {', '.join(missing)}")
        import os
        os.system(f"pip install {' '.join(missing)}")
        print("✅ Packages installed successfully")
except Exception as e:
    print(f"⚠️ Package check error: {e}")

# Input Gemini API Key (optional)
api_key = input("🔑 Enter your Gemini API key (press Enter to skip AI suggestions): ").strip()

if api_key:
    print("✅ API key received.")
else:
    print("⏩ Skipping AI suggestions.")

print("\n📤 Upload a file (image or PDF) to process:")
uploaded = files.upload()

if uploaded:
    filename = list(uploaded.keys())[0]
    file_extension = filename.split('.')[-1].lower()

    if file_extension in ['jpg', 'jpeg', 'png', 'bmp']:
        file_type = 'image'
    elif file_extension in ['pdf']:
        file_type = 'pdf'
    else:
        print("❌ Unsupported file format.")
        file_type = None

    if file_type:
        print(f"\n📄 Processing {filename}...")
        output_pdf = "processed_document_with_nlp.pdf"

        # Process the document with all NLP features
        results = process_document(filename, file_type, output_pdf, api_key if api_key else None)

        if results:
            print("\n⬇️ Downloading PDF...")
            files.download(output_pdf)
            print("\n✅ Processing complete!")
        else:
            print("\n❌ Processing failed.")

📦 Downloading NLTK resources...
✅ NLTK resources downloaded


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ spaCy model loaded
🔑 Enter your Gemini API key (press Enter to skip AI suggestions): AIzaSyDtws4ifcnac0tnj7Z6-rpdxYeviy6aOPA
✅ API key received.

📤 Upload a file (image or PDF) to process:




Saving p3.jpg to p3 (1).jpg

📄 Processing p3 (1).jpg...

=== 📝 Raw Extracted Text ===
  DEA# GB 05455616  LIC # 976269    MEDICAL CENTRE  New York; NY 91743, USA 824 14u Street NAME Jola Smitl AGE 34 ADDRESS 162 Example St, NT DATE 09-11-12   Betaloc I0O~3 -144L Bid  Dorzolamizvm I0 ~J +45 Bid Cinetizine 50 ~J 2 +4L, TID Oxprelol 50~a t45 QD e     1  Dc. Steve_JoLason signature OLABEL  1 REFILL 0(12 3 4 5 PRN WTXSNY PRES7OO 1

=== 🧠 Processing with NLP... ===
1. Cleaning OCR artifacts...
2. Correcting spelling...
3. Fixing grammar...
4. Analyzing text complexity...
5. Extracting entities...

=== 📝 NLP Processed Text ===
Dead# go 05455616 lie # 976269 MEDICAL center New York; my 91743, us 824 you Street NAME join smite AGE 34 ADDRESS. I example St, it DATE 09-I1-I2 betaine I0O~3. -i'll. Bid DorzoIamizvm i ~J +45 Bid cimetidine 50 ~J 2 +al, did OxpreIoI 50~a the ad e 1 Dc. Steve Jo jason signature label 1 REFILL 0(I2 3 4 5 pin WTXSNY preston 1.

=== 📊 Text Analysis ===
Readability Analys

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Processing complete!


In [13]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m93.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
Installing collected packages: watchdog, pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.44.1 watchdog-6.0.0
