In [13]:
import pytesseract
import fitz  # PyMuPDF
import re
import os
from PIL import Image
from io import BytesIO
import filetype
from pygments.lexers import guess_lexer
from pygments.util import ClassNotFound

def extract_text_from_image(image_path):
    """Extract text from an image using Tesseract OCR"""
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img)
    return text

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF using PyMuPDF"""
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text("text") + "\n"
    return text

def is_code(text):
    """Determine if extracted text is source code"""
    # Simple heuristic: check for code-like patterns
    patterns = [
        r"def\s+\w+\(",  # Python function definition
        r"#include\s+<\w+>",  # C/C++ header includes
        r"public\s+class\s+\w+",  # Java class definition
        r"<\s*\w+\s*>",  # HTML tags
        r"\bint\s+\w+",  # Common variable declaration in C-like languages
    ]
    for pattern in patterns:
        if re.search(pattern, text):
            return True

    # Try identifying the programming language using Pygments
    try:
        lexer = guess_lexer(text)
        return True if lexer else False
    except ClassNotFound:
        return False

def detect_code_in_file(file_path):
    """Detect if a given file contains code"""
    kind = filetype.guess(file_path)

    if kind is None:
        return "Unknown file type"

    extracted_text = ""

    if kind.mime.startswith("image"):
        extracted_text = extract_text_from_image(file_path)
    elif kind.mime == "application/pdf":
        extracted_text = extract_text_from_pdf(file_path)
    else:
        return f"Unsupported file type: {kind.mime}"

    contains_code = is_code(extracted_text)

    return {
        "file": file_path,
        "contains_code": contains_code,
        "extracted_text": extracted_text[:500]  # Show first 500 chars as preview
    }

# Example Usage
if __name__ == "__main__":
    file_path = "/content/godaddy.png"  # Change this to your file path
    result = detect_code_in_file(file_path)
    print(result)


{'file': '/content/godaddy.png', 'contains_code': True, 'extracted_text': '© GoDaddy\n\nSign in\n\nNew to GoDaddy? Create an Account\n\n \n\nUsername or Customer # *\n\n \n\n \n\nPassword *\n\n \n\n \n\n \n\nLl Keep me signed in on this device\n\nor sign in with\n\n@||9|/|G\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\nNeed to find your username or you!\n\n \n\x0c'}


In [11]:
!pip install filetype


Collecting filetype
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: filetype
Successfully installed filetype-1.2.0


In [9]:
!pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3


In [3]:
!sudo apt install tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 20 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (3,985 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [5]:
!pip install pytesseract


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
!python -m pip show pytesseract pymupdf pillow filetype pygments


[0mName: pytesseract
Version: 0.3.13
Summary: Python-tesseract is a python wrapper for Google's Tesseract-OCR
Home-page: https://github.com/madmaze/pytesseract
Author: Samuel Hoffstaetter
Author-email: samuel@hoffstaetter.com
License: Apache License 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: packaging, Pillow
Required-by: 
---
Name: pillow
Version: 11.1.0
Summary: Python Imaging Library (Fork)
Home-page: https://python-pillow.github.io
Author: 
Author-email: "Jeffrey A. Clark" <aclark@aclark.net>
License: MIT-CMU
Location: /usr/local/lib/python3.11/dist-packages
Requires: 
Required-by: bokeh, diffusers, dopamine_rl, fastai, imageio, imgaug, matplotlib, pytesseract, scikit-image, sentence-transformers, torchvision, wordcloud
---
Name: Pygments
Version: 2.18.0
Summary: Pygments is a syntax highlighting package written in Python.
Home-page: https://pygments.org
Author: 
Author-email: Georg Brandl <georg@python.org>
License: BSD-2-Clause
Location: /usr/local/lib/pytho