In [None]:
import re
import pytesseract
from pdf2image import convert_from_path
import os

# Source PDF to read and the vCard file to produce
PDF_FILE = "contacts.pdf"  # Change this to your actual PDF filename
OUTPUT_VCF = "contacts.vcf"

# Render every PDF page to an image so it can be fed to the OCR engine
print("🔄 Converting PDF to images...")
images = convert_from_path(PDF_FILE)

# OCR each page image, then stitch the page texts together with newlines
print("🔍 Running OCR on extracted images...")
page_texts = [pytesseract.image_to_string(page_image) for page_image in images]
ocr_text = "\n".join(page_texts)

# Keep a copy of the raw OCR output on disk to help debug the parsing steps
with open("extracted_contacts.txt", "w", encoding="utf-8") as debug_dump:
    debug_dump.write(ocr_text)

print("🔍 Extracted Text from OCR:")
print(ocr_text)

# **Patterns for Filtering**
# Phone number: an optional leading "+" followed by 7 to 15 consecutive digits.
PHONE_PATTERN = re.compile(r"\+?\d{7,15}")
# Email-like token: non-space characters around an "@" and a ".".
EMAIL_PATTERN = re.compile(r"\S+@\S+\.\S+")
# A whole line consisting of nothing but four or more digits.
RANDOM_NUM_PATTERN = re.compile(r"^\d{4,}$")

# **Unwanted Words List**
# Field labels and other noise words; lines containing them are discarded by
# the pre-clean step. That step compares against lower-cased text, so the
# upper-case "LJ" entry is already covered by "lj".
EXCLUDED_KEYWORDS = {
    "note",
    "main",
    "phone",
    "mobile",
    "telegram",
    "home",
    "birthday",
    "partner",
    "email",
    "lj",
    "fax",
    "work",
    "tel",
    "add",
    "contact",
    "LJ",
}

# **Step 1: Pre-Clean the Text Before Processing**
# Builds cleaned_lines from ocr_text by stripping each line and discarding:
#   - blank lines,
#   - lines containing any EXCLUDED_KEYWORDS entry as a whole word,
#   - lines that start with an email-like token,
#   - lines made up solely of four or more digits.
#
# Keywords are compared against whole words (case-insensitively) rather than
# raw substrings; otherwise real names that merely contain a keyword, such as
# "Maddie" ("add"), "Estelle" ("tel") or "Germaine" ("main"), would be dropped.
# Inflected labels (e.g. "notes") therefore need their own keyword entry.
#
# NOTE(review): a line carrying both a label and a number (e.g. "mobile +1...")
# is still discarded as a whole — confirm the PDF puts labels on their own line.
# NOTE(review): RANDOM_NUM_PATTERN also removes bare-digit phone lines that
# have no "+" prefix — confirm that is intended.
excluded_words = {word.lower() for word in EXCLUDED_KEYWORDS}

cleaned_lines = []
for line in ocr_text.split("\n"):
    line = line.strip()

    if not line:
        continue

    # Tokenise on word characters so punctuation such as "Tel:" still matches.
    line_words = re.findall(r"\w+", line.lower())
    if not excluded_words.isdisjoint(line_words):
        continue

    if EMAIL_PATTERN.match(line) or RANDOM_NUM_PATTERN.match(line):
        continue

    cleaned_lines.append(line)

# **Step 2: Process Extracted Contacts**
# Walk the cleaned lines in order: a line without any phone number is taken as
# a name and opens a new contact; a line with phone numbers feeds the contact
# that is currently open.


def _is_complete(entry):
    """Return True when the contact has both a name and at least one phone."""
    return bool(entry["last_name"] and entry["phones"])


contacts = []
current_contact = {"first_name": "X", "last_name": "", "phones": []}

for line in cleaned_lines:
    numbers_on_line = PHONE_PATTERN.findall(line)

    if not numbers_on_line:
        # Name line: store the contact collected so far (if usable), then
        # start a fresh one under this name.
        if _is_complete(current_contact):
            contacts.append(current_contact)
        current_contact = {"first_name": "X", "last_name": line, "phones": []}
        continue

    # Phone line: attach every number found to the open contact.
    current_contact["phones"] += numbers_on_line

# **Step 3: Save Last Contact**
# The loop only stores a contact when the next name shows up, so the final
# one has to be stored explicitly.
if _is_complete(current_contact):
    contacts.append(current_contact)

# **Step 4: Final Cleanup of Contacts**
# Normalise each parsed contact: trim the name, drop repeated phone numbers
# while keeping first-seen order, and keep at most three numbers.
final_contacts = []
for entry in contacts:
    name = entry["last_name"].strip()

    # dict keys are unique and keep insertion order, which gives an
    # order-preserving de-duplication of the phone list.
    unique_phones = list(dict.fromkeys(entry["phones"]))
    kept_phones = unique_phones[:3]

    if not (name and kept_phones):
        continue

    final_contacts.append(
        {"first_name": "X", "last_name": name, "phones": kept_phones}
    )

# Debug: show the contacts that will be written out
print("\n✅ Final Cleaned Contacts:")
for record in final_contacts:
    print(record)

# **Step 5: Generate VCF File**
# Writes one vCard 3.0 record per entry of final_contacts to OUTPUT_VCF.
# Each entry is a dict with "first_name", "last_name" and "phones" (a list of
# phone number strings).


def _escape_vcard_text(value):
    """Escape characters that are structural in vCard 3.0 text values.

    Backslash, semicolon, comma and newline must be backslash-escaped;
    otherwise an OCR'd name containing e.g. ";" would shift the N fields.
    """
    return (
        value.replace("\\", "\\\\")
        .replace(";", "\\;")
        .replace(",", "\\,")
        .replace("\n", "\\n")
    )


print("📁 Generating VCF file...")
with open(OUTPUT_VCF, "w", encoding="utf-8") as vcf_file:
    for contact in final_contacts:
        given_name = _escape_vcard_text(contact["first_name"])
        family_name = _escape_vcard_text(contact["last_name"])

        vcf_file.write("BEGIN:VCARD\n")
        vcf_file.write("VERSION:3.0\n")
        vcf_file.write(f"FN:{given_name} {family_name}\n")
        # N is ordered Family;Given;Additional;Prefix;Suffix, so the family
        # name goes first (this keeps N consistent with the FN line above).
        vcf_file.write(f"N:{family_name};{given_name};;;\n")

        # "CELL" is the vCard 3.0 type token for mobile numbers; free-form
        # labels such as "2nd mobile" are not valid TYPE values. The numbers
        # are written in their stored order.
        for phone in contact["phones"]:
            vcf_file.write(f"TEL;TYPE=CELL:{phone}\n")

        vcf_file.write("END:VCARD\n\n")

print(f"✅ Successfully generated {OUTPUT_VCF}. You can now import it into iCloud.")
