Import Libraries

In [1]:
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
import nltk
import re
import os

Define Extraction and Parsing Functions

In [3]:
def extract_png_text(file_path):
    """Extract text from a PNG file with optimized OCR.

    The image is contrast-boosted (factor 2) and sharpened before being
    handed to Tesseract, and every run of whitespace in the OCR output is
    collapsed to a single space.

    Args:
        file_path: Path of the image file to read.

    Returns:
        str: The cleaned OCR text. On any failure the function does not
        raise; it returns a string of the form ``"Error: <message>"``, so
        callers should check for that prefix before treating the result
        as CV text.
    """
    try:
        # The context manager releases the underlying file handle even when
        # a later preprocessing or OCR step fails.
        with Image.open(file_path) as source:
            # Pillow's built-in filters raise ValueError on palette-mode
            # ("P") images, which is a common PNG encoding; preprocess an
            # RGB copy in that case instead of failing.
            image = source.convert("RGB") if source.mode == "P" else source
            image = ImageEnhance.Contrast(image).enhance(2)
            image = image.filter(ImageFilter.SHARPEN)
            text = pytesseract.image_to_string(image)
        return re.sub(r'\s+', ' ', text.strip())
    except Exception as e:
        # Deliberate best-effort contract: report the failure as text.
        return f"Error: {str(e)}"

def parse_cv(text, tokenizer=None):
    """Parse CV text for qualifications, skills, and experience.

    The text is lower-cased, tokenized, and each token is checked against
    three fixed keyword vocabularies.

    Args:
        text: Raw CV text (for example OCR output).
        tokenizer: Optional callable taking a string and returning an
            iterable of tokens. Defaults to ``nltk.word_tokenize``, which
            needs NLTK's Punkt tokenizer data to be installed.

    Returns:
        dict: ``'qualifications'``, ``'skills'`` and ``'experience'`` each
        map to a list of the distinct keywords found, in order of first
        appearance in the text; ``'raw_text'`` holds the unmodified input.
    """
    tokenize = nltk.word_tokenize if tokenizer is None else tokenizer
    tokens = list(tokenize(text.lower()))
    print("Tokens:", tokens)

    # Sets give constant-time membership checks for each token.
    qualifications = {'law', 'legal', 'jurisprudence', 'degree', 'master', 'bachelor', 'llb', 'jd'}
    skills = {'negotiation', 'contract', 'research', 'analysis', 'drafting', 'litigation', 'compliance'}
    experience = {'years', 'experience', 'worked', 'firm', 'firms', 'practice'}

    def _matches(vocabulary):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is reproducible across runs (plain set order is not, because
        # string hashing is randomised per process).
        return list(dict.fromkeys(token for token in tokens if token in vocabulary))

    return {
        'qualifications': _matches(qualifications),
        'skills': _matches(skills),
        'experience': _matches(experience),
        'raw_text': text
    }

Test with Juriste CV

In [5]:
# Run the whole pipeline on the sample CV: OCR first, then keyword parsing.
cv_path = "../uploads/juriste.png"

cv_text = extract_png_text(cv_path)
print(f"Extracted Text: {cv_text}")

parsed_cv = parse_cv(cv_text)
print(f"Parsed CV: {parsed_cv}")

Extracted Text: John Doe Juriste Qualifications: Law Degree, - Jurisprudence ‘Skills: Contract tiation, Leg; “Research “ ars in law firms i> — ee
Tokens: ['john', 'doe', 'juriste', 'qualifications', ':', 'law', 'degree', ',', '-', 'jurisprudence', '‘', 'skills', ':', 'contract', 'tiation', ',', 'leg', ';', '“', 'research', '“', 'ars', 'in', 'law', 'firms', 'i', '>', '—', 'ee']
Parsed CV: {'qualifications': ['law', 'degree', 'jurisprudence'], 'skills': ['research', 'contract'], 'experience': ['firms'], 'raw_text': 'John Doe Juriste Qualifications: Law Degree, - Jurisprudence ‘Skills: Contract tiation, Leg; “Research “ ars in law firms i> — ee'}


Experiment with OCR Variations

In [8]:
def test_ocr_variations(file_path):
    """Run OCR on one image under several preprocessing variants.

    Variants: the untouched image ("Normal"), a contrast boost with
    factor 3 ("High Contrast"), and a sharpen filter ("Sharpened").

    Args:
        file_path: Path of the image file to read.

    Returns:
        dict: Maps each variant name to its OCR text with every run of
        whitespace collapsed to a single space.

    Raises:
        Any exception from opening the image, preprocessing it, or
        running Tesseract propagates to the caller.
    """
    results = {}
    # Keep the file open only for the duration of the experiment; the
    # context manager closes it even if a variant fails.
    with Image.open(file_path) as image:
        # Pillow's built-in filters raise ValueError on palette-mode ("P")
        # images, so the processed variants are derived from an RGB copy
        # in that case. The "Normal" baseline stays untouched.
        base = image.convert("RGB") if image.mode == "P" else image
        variations = [
            ("Normal", image),
            ("High Contrast", ImageEnhance.Contrast(base).enhance(3)),
            ("Sharpened", base.filter(ImageFilter.SHARPEN))
        ]
        for name, img in variations:
            text = pytesseract.image_to_string(img)
            results[name] = re.sub(r'\s+', ' ', text.strip())
    return results

# OCR the sample CV under each preprocessing variant and show every result.
ocr_results = test_ocr_variations(cv_path)
for name, text in ocr_results.items():
    print(f"{name} OCR Text: {text}")

Normal OCR Text: John Doe Juriste Qualifications: Law Degree, Jurisprudence Me Sentra tiation, Leg esearch E i : ars in law firms
High Contrast OCR Text: John Doe Juriste } Qualifications: Law Degree, | Jurisprudence ills: Contract tiation, Legi Research ~ ars in law firms
Sharpened OCR Text: John Doe Juriste Qualifications: Law Degree, i Jurisprudence | Skills: Contract tiation, Leg; esearch ." Expe : ars in law firms
