In [5]:
# ✅ Imports
import json
import os
import re
import time

import google.generativeai as genai

# ✅ Configure API key
# The key is read from the environment rather than embedded in the notebook:
# a key stored in source is readable by anyone who can see the file.
# NOTE: any key that was previously committed here should be treated as
# compromised and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# ✅ Gemini model used for parsing
model = genai.GenerativeModel(model_name="gemini-2.0-flash-lite")  # use "gemini-1.5-flash" if needed

# ✅ Folders
# txt_folder holds the plain-text resumes to parse; json_folder receives one
# JSON file per resume and is created on first run.
txt_folder = "resumes-txt"
json_folder = "parsed-json"
os.makedirs(json_folder, exist_ok=True)

# ✅ STEP 4: Build prompt with section normalization + formatting instructions
def build_prompt(resume_text):
    """Build the resume-parsing prompt sent to the model.

    The prompt consists of a JSON schema the reply must follow, rules for
    mapping varied section headings onto the schema's fields, and finally the
    resume text itself.

    Args:
        resume_text: Plain-text contents of one resume; inserted verbatim at
            the end of the prompt.

    Returns:
        The complete prompt as a single string.
    """
    # Doubled braces below are f-string escapes; they render as single braces.
    return f"""
You are an expert resume parser. Extract the following structured fields from a resume.
Return ONLY a valid JSON object with the following schema:

{{
  "header": {{
    "name": "",
    "email": "",
    "phone": "",
    "linkedin": "",
    "address": "",
    "website": ""
  }},
  "summary": "Extract the candidate's entire professional summary exactly as it appears in the resume. Do not truncate or rewrite. Preserve the full meaning and formatting. If it is written as one paragraph, return it as-is. If it is written as multiple paragraphs, preserve all of them using `\\n\\n` as the paragraph break. Return the full content exactly, not a shortened version.",
  "experience": [
    {{
      "company": "Company Name",
      "job_title": "Job Title",
      "start_date": "Month Year",
      "end_date": "Month Year or 'Present'",
      "responsibilities": [
        "Each bullet may contain multiple sentences if they belong together. Do NOT split one bullet into many."
      ]
    }}
  ],
  "education": [
  {{
    "institution": "Name of the school or university (e.g., University of Texas)",
    "degree": "Formal degree (e.g., Bachelor of Computer Science)",
    "field_of_study": "Field (e.g., AI, Marketing) if available",
    "start_date": "Month Year",
    "end_date": "Month Year or 'Present'",
    "location": "City, State (if available)"
  }}
],
  "skills": [
    "Preserve skill categories if available (e.g., 'Tech Stack: Java/J2EE, PL/SQL'). Do not break category lines into individual flat skills."
  ],
  "projects": [],
  "certifications": [],
  "achievements": [],
  "organizations": [],
  "hobbies": [],
  "miscellaneous": {{
    "Extracurricular Activities": "...",
    "Volunteering Highlights": "..."
  }}
}}

Section Normalization Instructions:

- Normalize varying section headings into standard JSON fields:
  - 'website', 'portfolio link', 'portfolio' → website
  - 'Professional Summary', 'Profile', 'Career Overview' → summary
  - 'Work History', 'Professional Journey', 'Experience' → experience
  - 'Education', 'Academic Background' → education
  - 'Certifications', 'Training', 'Licenses' → certifications or education
  - 'Skills', 'Technical Skills', 'Tech Stack' → skills
  - 'Projects', 'Key Projects', 'Freelance Work' → projects
  - 'Achievements', 'Awards' → achievements
  - 'Organizations', 'Volunteer', 'Extracurriculars' → organizations
  - 'Hobbies', 'Interests', 'Personal Activities' → hobbies

If a section doesn’t fit any of the above, store it in a 'miscellaneous' object with section title as key.
If any field is missing (e.g. no LinkedIn or website), return an empty string `""` — not null, undefined, or "N/A". Always include the keys, but leave the value blank if missing.
Respond with STRICTLY valid JSON only. Escape all internal quotes inside strings. Do not omit braces, commas, or close tags.

------------------
Resume Text:
{resume_text}
"""

# ✅ STEP 5: Gemini API Call with JSON cleaning and retry logic
def send_to_gemini(prompt, max_retries=5):
    """Send a prompt to the Gemini model and return its reply decoded as JSON.

    A Markdown code fence around the reply (```json ... ``` or ``` ... ```)
    is removed before decoding. Failed attempts are retried after a pause.

    Args:
        prompt: Prompt text passed to ``model.generate_content``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The value produced by ``json.loads`` on the cleaned reply.

    Raises:
        RuntimeError: If no attempt produced decodable JSON. The last
            underlying error, if any, is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_retries):
        # Request and decoding are handled separately so that a decode error
        # whose message happens to contain "429" (e.g. "char 429") is not
        # mistaken for a rate-limit response.
        try:
            response = model.generate_content(prompt)
            text = response.text.strip()
        except Exception as e:
            last_error = e
            error_message = str(e)
            if "429" in error_message:
                # Use the wait time reported in the error text when present.
                delay_match = re.search(r"retry_delay\s*\{\s*seconds:\s*(\d+)", error_message)
                delay_seconds = int(delay_match.group(1)) if delay_match else 60
                print(f"❌ 429 Rate limit hit. Waiting {delay_seconds}s before retry...")
                time.sleep(delay_seconds)
            else:
                print(f"❌ Error (attempt {attempt+1}): {e}")
                time.sleep(5)
            continue

        try:
            if text.startswith("```json"):
                text = text.split("```json")[1].split("```")[0].strip()
            elif text.startswith("```"):
                text = text.split("```")[1].split("```")[0].strip()
            return json.loads(text)
        except Exception as e:
            last_error = e
            print(f"❌ Error (attempt {attempt+1}): {e}")
            time.sleep(5)

    raise RuntimeError("❌ Failed after multiple retries.") from last_error

# ✅ STEP 6: Loop through resumes
# Non-hidden .txt files in name order; a file's position in this list
# (starting at 1) determines the number used in its output filename.
txt_files = sorted(
    entry
    for entry in os.listdir(txt_folder)
    if entry.endswith(".txt") and not entry.startswith(".")
)

for idx, txt_filename in enumerate(txt_files, start=1):
    resume_tag = f"resume_{idx:04}"
    txt_path = os.path.join(txt_folder, txt_filename)
    json_path = os.path.join(json_folder, resume_tag + ".json")

    # An existing output file means this resume was handled in an earlier run.
    if os.path.exists(json_path):
        print(f"✔️ Skipping {resume_tag} — already processed.")
        continue

    with open(txt_path, "r", encoding="utf-8") as f:
        resume_text = f.read()
    prompt = build_prompt(resume_text)

    try:
        print(f"🔄 Processing {resume_tag}...")
        parsed_output = send_to_gemini(prompt)

        # ✅ Optional: report any sections the model filed under "miscellaneous"
        extras = parsed_output.get("miscellaneous", {})
        if extras:
            print(f"👀 Found extra sections in {resume_tag}: {list(extras.keys())}")

        # ✅ Save JSON
        with open(json_path, "w", encoding="utf-8") as out:
            json.dump(parsed_output, out, indent=2)
        print(f"✅ Saved: {resume_tag}.json")

        time.sleep(3)  # global rate control
    except Exception as e:
        # The first failure stops the whole run.
        print(f"❌ Failed on {resume_tag}: {e}")
        break  # or `continue` if you want to skip and go on

✔️ Skipping resume_0001 — already processed.
✔️ Skipping resume_0002 — already processed.
✔️ Skipping resume_0003 — already processed.
✔️ Skipping resume_0004 — already processed.
✔️ Skipping resume_0005 — already processed.
✔️ Skipping resume_0006 — already processed.
✔️ Skipping resume_0007 — already processed.
✔️ Skipping resume_0008 — already processed.
✔️ Skipping resume_0009 — already processed.
✔️ Skipping resume_0010 — already processed.
✔️ Skipping resume_0011 — already processed.
✔️ Skipping resume_0012 — already processed.
✔️ Skipping resume_0013 — already processed.
✔️ Skipping resume_0014 — already processed.
✔️ Skipping resume_0015 — already processed.
✔️ Skipping resume_0016 — already processed.
✔️ Skipping resume_0017 — already processed.
✔️ Skipping resume_0018 — already processed.
✔️ Skipping resume_0019 — already processed.
✔️ Skipping resume_0020 — already processed.
✔️ Skipping resume_0021 — already processed.
✔️ Skipping resume_0022 — already processed.
✔️ Skippin

In [3]:
# ✅ STEP 1: Imports
import os
import time
import json
import re
import json5
import google.generativeai as genai

# ✅ STEP 2: Configure Gemini API
# The key is read from the environment rather than embedded in the notebook:
# a key stored in source is readable by anyone who can see the file.
# NOTE: any key that was previously committed here should be treated as
# compromised and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
model = genai.GenerativeModel(model_name="gemini-2.0-flash-lite")  # Or "gemini-1.5-flash"

# ✅ STEP 3: Folder setup
# txt_folder holds the plain-text resumes to parse; json_folder receives one
# JSON file per resume and is created on first run.
txt_folder = "resumes-txt"
json_folder = "parsed-json"
os.makedirs(json_folder, exist_ok=True)

# ✅ STEP 3.5: Utility Functions
def _count_unclosed_braces(text):
    """Return how many '{' outside double-quoted strings lack a matching '}'."""
    depth = 0
    in_string = False
    escaped = False
    for ch in text:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
    return depth


def clean_llm_output(text):
    """Strip a Markdown code fence from a model reply and close open braces.

    Args:
        text: Reply text. A fence is only recognised when the text starts
            with it, so callers should strip leading whitespace first.

    Returns:
        The text without its surrounding fence, with one '}' appended for
        every object that was opened but never closed (as happens when a
        reply is cut off). Braces inside double-quoted string values are
        ignored, so valid JSON containing a literal brace is left untouched.
    """
    # Remove code block markers
    if text.startswith("```json"):
        text = text.split("```json")[1].split("```")[0].strip()
    elif text.startswith("```"):
        text = text.split("```")[1].split("```")[0].strip()

    # Balance braces
    missing = _count_unclosed_braces(text)
    if missing > 0:
        text += "}" * missing

    return text

def try_safe_json_parse(raw_text):
    """Decode ``raw_text`` with ``json``, falling back to ``json5`` on failure.

    Args:
        raw_text: Text to decode.

    Returns:
        The decoded value from whichever parser succeeded.

    Raises:
        Whatever ``json5.loads`` raises when the fallback fails as well.
    """
    try:
        result = json.loads(raw_text)
    except Exception:
        # Any failure of the standard parser triggers the fallback parser.
        print("⚠️ json.loads failed. Trying json5...")
        result = json5.loads(raw_text)
    return result

def clean_nulls(obj):
    """Return a copy of ``obj`` with ``None``-valued dict entries removed.

    Dicts and lists are walked recursively. Only dict entries are dropped;
    a ``None`` that is an element of a list is kept in place. Any other
    value is returned as-is.
    """
    if isinstance(obj, list):
        return [clean_nulls(item) for item in obj]
    if not isinstance(obj, dict):
        return obj

    cleaned = {}
    for key, value in obj.items():
        if value is None:
            continue
        cleaned[key] = clean_nulls(value)
    return cleaned

# ✅ STEP 4: Prompt Builder
def build_prompt(resume_text):
    """Assemble the resume-parsing prompt sent to the model.

    The prompt is a fixed template (target JSON schema followed by rules for
    mapping section headings onto schema fields) with the resume appended.

    Args:
        resume_text: Plain-text contents of one resume; substituted verbatim
            into the final line of the template.

    Returns:
        The complete prompt as a single string.
    """
    # Doubled braces are str.format escapes and render as single braces;
    # {resume_text} is the only real placeholder.
    template = """
You are an expert resume parser. Extract the following structured fields from a resume.
Return ONLY a valid JSON object with the following schema:

{{
  "header": {{
    "name": "",
    "email": "",
    "phone": "",
    "linkedin": "",
    "address": "",
    "website": ""
  }},
  "summary": "Extract the candidate's entire professional summary exactly as it appears in the resume. Do not truncate or rewrite. Preserve the full meaning and formatting. If it is written as one paragraph, return it as-is. If it is written as multiple paragraphs, preserve all of them using \\n\\n as the paragraph break. Return the full content exactly, not a shortened version.",
  "experience": [
    {{
      "company": "Company Name",
      "job_title": "Job Title",
      "start_date": "Month Year",
      "end_date": "Month Year or 'Present'",
      "responsibilities": [
        "Each bullet may contain multiple sentences if they belong together. Do NOT split one bullet into many."
      ]
    }}
  ],
  "education": [
    {{
      "institution": "Name of the school or university (e.g., University of Texas)",
      "degree": "Formal degree (e.g., Bachelor of Computer Science)",
      "field_of_study": "Field (e.g., AI, Marketing) if available",
      "start_date": "Month Year",
      "end_date": "Month Year or 'Present'",
      "location": "City, State (if available)"
    }}
  ],
  "skills": [
    "Preserve skill categories if available (e.g., 'Tech Stack: Java/J2EE, PL/SQL'). Do not break category lines into individual flat skills."
  ],
  "projects": [],
  "certifications": [],
  "achievements": [],
  "organizations": [],
  "hobbies": [],
  "miscellaneous": {{
    "Extracurricular Activities": "...",
    "Volunteering Highlights": "..."
  }}
}}

Section Normalization Instructions:

- Normalize varying section headings into standard JSON fields:
  - 'website', 'portfolio link', 'portfolio' → website
  - 'Professional Summary', 'Profile', 'Career Overview' → summary
  - 'Work History', 'Professional Journey', 'Experience' → experience
  - 'Education', 'Academic Background' → education
  - 'Certifications', 'Training', 'Licenses' → certifications or education
  - 'Skills', 'Technical Skills', 'Tech Stack' → skills
  - 'Projects', 'Key Projects', 'Freelance Work' → projects
  - 'Achievements', 'Awards' → achievements
  - 'Organizations', 'Volunteer', 'Extracurriculars' → organizations
  - 'Hobbies', 'Interests', 'Personal Activities' → hobbies

If a section doesn’t fit any of the above, store it in a 'miscellaneous' object with section title as key.
If any field is missing (e.g. no LinkedIn or website), return an empty string "" — not null, undefined, or "N/A". Always include the keys, but leave the value blank if missing.
Respond with STRICTLY valid JSON only. Escape all internal quotes inside strings. Do not omit braces, commas, or close tags.

------------------
Resume Text:
{resume_text}
"""
    return template.format(resume_text=resume_text)

# ✅ STEP 5: Gemini API Wrapper
def send_to_gemini(prompt, idx=None, max_retries=5):
    """Send a prompt to the Gemini model and return its reply as cleaned JSON.

    Each reply is passed through ``clean_llm_output``, ``try_safe_json_parse``
    and ``clean_nulls``. Failed attempts are retried after a pause.

    Args:
        prompt: Prompt text passed to ``model.generate_content``.
        idx: Optional resume number. When given and all attempts fail, the
            most recent reply text (if any was received) is written to
            ``failed_raw_resume_<idx>.txt`` for inspection.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed reply, or ``None`` if every attempt failed.
    """
    raw = None  # most recent reply text; stays None if no request succeeded
    for attempt in range(max_retries):
        # Request and parsing are handled separately so that a parse error
        # whose message happens to contain "429" is not mistaken for a
        # rate-limit response.
        try:
            response = model.generate_content(prompt)
            raw = response.text.strip()
        except Exception as e:
            error_message = str(e)
            if "429" in error_message:
                # Use the wait time reported in the error text when present.
                delay_match = re.search(r"retry_delay\s*\{\s*seconds:\s*(\d+)", error_message)
                delay_seconds = int(delay_match.group(1)) if delay_match else 60
                print(f"❌ 429 Rate limit hit. Waiting {delay_seconds}s before retry...")
                time.sleep(delay_seconds)
            else:
                print(f"❌ Error (attempt {attempt+1}): {e}")
                time.sleep(5)
            continue

        try:
            cleaned = clean_llm_output(raw)
            parsed = try_safe_json_parse(cleaned)
            return clean_nulls(parsed)
        except Exception as e:
            print(f"❌ Error (attempt {attempt+1}): {e}")
            time.sleep(5)

    # Keep the unparseable reply for debugging; nothing to save if the model
    # never answered.
    if idx is not None and raw is not None:
        try:
            with open(f"failed_raw_resume_{idx:04}.txt", "w", encoding="utf-8") as raw_log:
                raw_log.write(raw)
        except Exception as log_err:
            print(f"⚠️ Could not write raw log for resume {idx:04}: {log_err}")

    return None  # callers treat None as "skip this resume"


# ✅ STEP 6: Resume Loop (filename-driven index + failure recovery)
import re  # Make sure this is imported at the top of your notebook

# Non-hidden .txt files in name order.
txt_files = sorted(
    entry
    for entry in os.listdir(txt_folder)
    if entry.endswith(".txt") and not entry.startswith(".")
)
failed_resumes = []

for txt_filename in txt_files:
    # The resume number comes from the filename itself (e.g. resume_0073.txt),
    # not from the file's position in the listing.
    match = re.search(r"resume_(\d+)\.txt", txt_filename)
    if match is None:
        print(f"⚠️ Skipping invalid filename: {txt_filename}")
        continue
    idx = int(match.group(1))
    resume_tag = f"resume_{idx:04}"

    txt_path = os.path.join(txt_folder, txt_filename)
    json_path = os.path.join(json_folder, resume_tag + ".json")

    # An existing output file means this resume was handled in an earlier run.
    if os.path.exists(json_path):
        print(f"✔️ Skipping {resume_tag} — already processed.")
        continue

    with open(txt_path, "r", encoding="utf-8") as f:
        resume_text = f.read()
    prompt = build_prompt(resume_text)

    print(f"🔄 Processing {resume_tag}...")
    parsed_output = send_to_gemini(prompt, idx=idx)

    # None signals that every attempt failed; record it and move on.
    if parsed_output is None:
        print(f"⚠️ Skipping {resume_tag} due to persistent Gemini failure.")
        failed_resumes.append(resume_tag)
        continue

    try:
        extras = parsed_output.get("miscellaneous", {})
        if extras:
            print(f"👀 Found extra sections in {resume_tag}: {list(extras.keys())}")

        with open(json_path, "w", encoding="utf-8") as out:
            json.dump(parsed_output, out, indent=2)
        print(f"✅ Saved: {resume_tag}.json")

        time.sleep(3)
    except Exception as e:
        print(f"❌ Failed to write JSON for {resume_tag}: {e}")
        failed_resumes.append(resume_tag)
        with open("error_log.txt", "a", encoding="utf-8") as log:
            log.write(f"{resume_tag} - JSON write error: {e}\n")
        continue

# ✅ Log failures after processing
if not failed_resumes:
    print("✅ All resumes processed successfully.")
else:
    print("\n🚫 The following resumes failed and were skipped:")
    for failed in failed_resumes:
        print(f"  - {failed}")
    # Overwritten on each run, so it lists only this run's failures.
    with open("final_failed_resumes.txt", "w", encoding="utf-8") as f:
        for failed in failed_resumes:
            f.write(f"{failed}\n")


✔️ Skipping resume_0002 — already processed.
✔️ Skipping resume_0003 — already processed.
✔️ Skipping resume_0004 — already processed.
✔️ Skipping resume_0005 — already processed.
✔️ Skipping resume_0006 — already processed.
✔️ Skipping resume_0007 — already processed.
✔️ Skipping resume_0008 — already processed.
✔️ Skipping resume_0009 — already processed.
✔️ Skipping resume_0010 — already processed.
✔️ Skipping resume_0011 — already processed.
✔️ Skipping resume_0012 — already processed.
✔️ Skipping resume_0013 — already processed.
✔️ Skipping resume_0014 — already processed.
✔️ Skipping resume_0015 — already processed.
✔️ Skipping resume_0016 — already processed.
✔️ Skipping resume_0017 — already processed.
✔️ Skipping resume_0018 — already processed.
✔️ Skipping resume_0019 — already processed.
✔️ Skipping resume_0020 — already processed.
✔️ Skipping resume_0021 — already processed.
✔️ Skipping resume_0022 — already processed.
✔️ Skipping resume_0023 — already processed.
✔️ Skippin