In [None]:
import os
import re
from pathlib import Path
from langdetect import detect
from docx import Document
from PIL import Image
import pytesseract
import fitz  
import json
from google import genai
from google.genai import types
import dotenv
from datetime import datetime

# ---------------------- CONFIG ----------------------
# Load variables from a local .env file (if any) into the process environment.
dotenv.load_dotenv()
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

RAW_DIR    = "cvs_data/raw_cvs"        # input folder scanned for .pdf / .docx CVs
FINAL_JSON = "cvs_data/all_cvs.json"   # aggregated structured output
LOG_FILE   = "logs/logs.txt"           # one status line per input file

# Create necessary folders
for d in [RAW_DIR, "cvs_data/final", "logs"]:
    Path(d).mkdir(parents=True, exist_ok=True)

# Tesseract path
# An explicit TESSERACT_CMD environment variable wins. Otherwise the usual
# Windows install location is used, but only when it actually exists; on other
# machines pytesseract keeps its own default (the "tesseract" found on PATH)
# instead of being pointed at a file that is not there.
_DEFAULT_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
_tesseract_cmd = os.getenv("TESSERACT_CMD")
if _tesseract_cmd is None and os.path.isfile(_DEFAULT_TESSERACT_CMD):
    _tesseract_cmd = _DEFAULT_TESSERACT_CMD
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

# ---------------------- HELPERS ----------------------
# The local part may contain "+" (plus-addressing, e.g. "john+jobs@example.com");
# without it only the part after the "+" would be redacted.
_EMAIL_RE = re.compile(r'\b[\w.+-]+@[\w.-]+\.\w+\b')

# Three deliberately strict phone shapes, so that date ranges ("2015-2019") and
# GPAs ("3.85/4.00") in a CV are never mistaken for phone numbers:
#   1. international: a leading "+" followed by at least 9 digits, with up to
#      two separator characters (space . - parentheses) between digits
#   2. grouped 3-3-4: "555-123-4567", "(555) 123-4567", "555.123.4567"
#   3. a bare run of 10 or more digits
_PHONE_RE = re.compile(
    r'(?<!\w)(?:'
    r'\+\d(?:[ .()\-]{0,2}\d){8,}'
    r'|\(?\d{3}\)?[ .\-]\d{3}[ .\-]\d{4}'
    r'|\d{10,}'
    r')(?!\w)'
)


def redact_pii(text):
    """Replace e-mail addresses and phone numbers in *text* with placeholders.

    Args:
        text: Raw text extracted from a CV.

    Returns:
        The text with every e-mail address replaced by ``[REDACTED_EMAIL]``
        and every recognised phone number replaced by ``[REDACTED_PHONE]``.
        Other kinds of personal data (names, postal addresses, ...) are not
        touched.
    """
    text = _EMAIL_RE.sub('[REDACTED_EMAIL]', text)
    text = _PHONE_RE.sub('[REDACTED_PHONE]', text)
    return text

def extract_text_pdf(path):
    """Extract the text of a PDF, using OCR for pages without a text layer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The concatenated text of all pages in page order. If anything fails
        (file cannot be opened, rendering or OCR raises) a string of the form
        ``"ERROR: <message>"`` is returned instead of raising; callers detect
        failure by testing for the ``"ERROR"`` prefix.
    """
    try:
        chunks = []
        # The context manager closes the document even when a page fails to
        # render or OCR raises, so no file handle is leaked.
        with fitz.open(path) as doc:
            for page in doc:
                page_text = page.get_text()
                if page_text.strip():
                    chunks.append(page_text)
                    continue
                # Nothing extractable on this page: rasterise it at 300 dpi
                # and run English OCR on the rendered image instead.
                pix = page.get_pixmap(dpi=300)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                chunks.append(pytesseract.image_to_string(img, lang="eng"))
        return "".join(chunks)
    except Exception as e:
        return f"ERROR: {e}"

def extract_text_docx(path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Args:
        path: Filesystem path of the Word document.

    Returns:
        The text of every entry in the document's ``paragraphs`` joined with
        newlines, or ``"ERROR: <message>"`` if the document cannot be read.
        NOTE(review): only ``paragraphs`` is read here; text held elsewhere
        in the document (e.g. table cells) is presumably not included —
        confirm against python-docx.
    """
    try:
        paragraphs = Document(path).paragraphs
        lines = (para.text for para in paragraphs)
        return "\n".join(lines)
    except Exception as err:
        return f"ERROR: {err}"

def detect_language_safe(text):
    """Detect the language of *text* without ever raising.

    Args:
        text: Text to classify.

    Returns:
        Whatever ``detect`` returns for the text, or ``"unknown"`` when
        detection fails for any reason.
    """
    try:
        return detect(text)
    except Exception:
        # Not a bare ``except:`` — KeyboardInterrupt / SystemExit must still
        # be able to stop a long batch run.
        return "unknown"

# ---------------------- GEMINI PARSING ----------------------
# Module-level Gemini client, created once and reused for every CV.
# NOTE(review): GOOGLE_API_KEY comes from os.getenv and is None when the
# variable is not set — confirm how genai.Client behaves in that case.
client = genai.Client(api_key=GOOGLE_API_KEY)

def _empty_cv():
    """Return a fresh, empty CV structure (used when the reply is unusable)."""
    return {"education": [], "experience": [], "publications": [], "awards": []}


def _load_cv_json(raw):
    """Decode *raw* into a dict, tolerating wrappers and trailing commas.

    Returns the decoded dict, or None when no JSON object can be recovered.
    """
    candidates = [raw]
    start, end = raw.find("{"), raw.rfind("}")
    if start != -1 and end > start:
        # Minor repair: keep only the outermost {...} (drops markdown fences or
        # prose around the object) and remove trailing commas before } or ].
        snippet = raw[start:end + 1]
        candidates.append(re.sub(r',\s*([}\]])', r'\1', snippet))
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # The caller spreads the result with **, so only a JSON object is usable.
        if isinstance(parsed, dict):
            return parsed
    return None


def parse_cv_with_gemini(text):
    """Ask Gemini to convert raw CV text into a structured dict.

    Args:
        text: CV text (already PII-redacted by the caller).

    Returns:
        A dict decoded from the model's JSON reply; the prompt asks for the
        keys ``education``, ``experience``, ``publications`` and ``awards``.
        If the reply is empty or cannot be decoded into a JSON object, a dict
        with those four keys mapped to empty lists is returned.

    Raises:
        Any exception raised by ``client.models.generate_content`` (network,
        authentication, quota, ...) propagates to the caller.
    """
    # The model has no clock, so "calculate duration up to the current date"
    # is only answerable if today's date is part of the prompt.
    today = datetime.now().strftime("%Y-%m-%d")
    prompt = f"""
    Extract structured CV information in this JSON format:

    {{
      "education": [{{"degree":"","field":"","university":"","country":"","start":null,"end":null,"gpa":null,"scale":null}}],
      "experience": [{{"title":"","org":"","start":null,"end":null,"duration_months":null,"domain":""}}],
      "publications": [{{"title":"","venue":"","year":null,"type":"","authors":[],"author_position":null,"journal_if":null,"domain":""}}],
      "awards": [{{"title":"","issuer":"","year":null,"type":""}}]
    }}

    RULES:
    1. For experience, if end date missing, set "end": "currently working" and calculate "duration_months" from start to current date (today is {today}).
    2. For education, include only Bachelor's or university level degree or higher.
    3. Return ONLY valid JSON.

    CV TEXT:
    """
    resp = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt + text,
        config=types.GenerateContentConfig(temperature=0)
    )
    # resp.text is None when the response carries no text part (for example a
    # blocked or empty candidate); treat that like an unparseable reply.
    raw = (resp.text or "").strip()
    parsed = _load_cv_json(raw)
    return parsed if parsed is not None else _empty_cv()

# ---------------------- MAIN PIPELINE ----------------------
log = []         # one human-readable status line per file in RAW_DIR
final_data = []  # structured records for every successfully parsed CV
rank = 1         # running 1-based number assigned to accepted CVs

# Sorted so that the "number" given to each CV is reproducible between runs;
# os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(RAW_DIR)):
    path = os.path.join(RAW_DIR, filename)
    # Compare extensions case-insensitively so "CV.PDF" / "cv.Docx" are accepted.
    lowered = filename.lower()

    if lowered.endswith(".pdf"):
        text = extract_text_pdf(path)
    elif lowered.endswith(".docx"):
        text = extract_text_docx(path)
    else:
        log.append(f"{filename}: unsupported format")
        continue

    # The extractors report failure by returning a string that starts with "ERROR".
    if not text.strip() or text.startswith("ERROR"):
        log.append(f"{filename}: extraction failed")
        continue

    if detect_language_safe(text) != "en":
        log.append(f"{filename}: non-English")
        continue

    # Remove PII
    text = redact_pii(text)

    # Parse structured CV
    # A failure for one CV (network error, quota, ...) is logged and skipped so
    # the CVs already processed are still written out at the end.
    try:
        structured = parse_cv_with_gemini(text)
    except Exception as e:
        log.append(f"{filename}: parsing failed ({e})")
        continue

    # NOTE(review): the raw filename is stored as "name"; if filenames contain
    # candidate names this is not covered by redact_pii — confirm it is intended.
    final_data.append({
        "name": filename,
        "number": rank,
        **structured
    })
    rank += 1
    log.append(f"{filename}: processed successfully")

# Save final JSON
with open(FINAL_JSON, "w", encoding="utf-8") as f:
    json.dump(final_data, f, indent=2)

# Save logs
with open(LOG_FILE, "w", encoding="utf-8") as f:
    f.write("\n".join(log))

print("\n🎉 Pipeline completed!")
print(f"Final JSON: {FINAL_JSON}")
print(f"Logs: {LOG_FILE}")



🎉 Pipeline completed!
Final JSON: cvs_data/all_cvs.json
Logs: logs/logs.txt
