# Streamlit Resume Parser & Matcher
Upload a resume (PDF/DOCX/TXT), extract text, parse structured fields, and optionally enhance parsing via LLMs. Match against a job description.


# Installation (commented)

In [None]:
# !pip install streamlit pypdf docx2txt sentence-transformers openai anthropic google-generativeai groq huggingface_hub


# Imports

In [None]:
import os
import io
import re
import json
import streamlit as st


# File extraction helpers

In [None]:
def extract_text(file) -> str:
    """Return the plain text contained in an uploaded resume file.

    The format is chosen from the lower-cased extension of ``file.name``:
    ``.pdf`` is read with pypdf, ``.docx`` with docx2txt, and anything else
    is decoded as UTF-8 text.

    Args:
        file: Object exposing a ``name`` attribute and a ``read()`` method
            that returns bytes (presumably a Streamlit upload -- confirm
            against the caller).

    Returns:
        The extracted text. PDF pages are joined with newlines; a page or
        document that yields no text contributes an empty string.
    """
    name = file.name.lower()
    data = file.read()
    if name.endswith('.pdf'):
        from pypdf import PdfReader
        reader = PdfReader(io.BytesIO(data))
        return "\n".join((page.extract_text() or '') for page in reader.pages)
    if name.endswith('.docx'):
        import os
        import tempfile

        import docx2txt
        # docx2txt is handed a filesystem path. The bytes go into a closed
        # file inside a temporary directory because a NamedTemporaryFile
        # that is still open cannot be reopened by name on Windows.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = os.path.join(tmp_dir, 'resume.docx')
            with open(tmp_path, 'wb') as tmp:
                tmp.write(data)
            return docx2txt.process(tmp_path) or ''
    # Fallback for plain text. 'utf-8-sig' drops a leading byte-order mark so
    # it does not end up in the first line; undecodable bytes are skipped.
    return data.decode('utf-8-sig', errors='ignore')


# Simple regex-based parsing (baseline)

In [None]:
def parse_baseline(text: str) -> dict:
    """Build a baseline resume record from *text* using regexes only.

    The first match of each contact pattern is kept (empty string when there
    is none), and the first line of the stripped text is taken as the name.
    ``summary``, ``skills``, ``experience`` and ``education`` are returned
    empty; ``raw_text`` carries the input unchanged.
    """
    contact_patterns = {
        'email': r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
        # Ten consecutive digits, optionally preceded by a +country code.
        'phone': r"(\+\d{1,3}[- ]?)?\d{10}",
        'linkedin': r"https?://(www\.)?linkedin\.com/[A-Za-z0-9_\-/]+",
        'github': r"https?://(www\.)?github\.com/[A-Za-z0-9_\-/]+",
    }

    lines = text.strip().splitlines()
    first_line = lines[0] if lines else ''

    record = {'name': first_line.strip()}
    for field, pattern in contact_patterns.items():
        hit = re.search(pattern, text)
        record[field] = hit.group(0) if hit else ''

    record['summary'] = ''
    record['skills'] = []
    record['experience'] = []
    record['education'] = []
    record['raw_text'] = text
    return record


# Optional: LLM-enhanced parsing

In [None]:
def _get_api_key(name: str):
    """Return the secret *name* from Streamlit secrets, else the environment.

    Returns ``None`` when neither source has a value, which lets each SDK
    fall back to its own default credential lookup.
    """
    try:
        value = st.secrets.get(name)
    except Exception:
        # Accessing st.secrets fails when no secrets file is configured;
        # that must not prevent the environment-variable fallback.
        value = None
    return value or os.environ.get(name)


def _parse_json_reply(raw, label: str) -> dict:
    """Decode a model reply into a dict, or describe the failure.

    A surrounding Markdown code fence is removed before decoding. Replies
    that are not valid JSON, or whose top level is not an object, yield
    ``{'error': ..., 'raw': raw}`` instead of raising.
    """
    failure = {'error': f'Failed to parse JSON from {label}', 'raw': raw}
    candidate = raw.strip() if isinstance(raw, str) else ''
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", candidate, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidate = fenced.group(1)
    try:
        data = json.loads(candidate)
    except ValueError:
        return failure
    return data if isinstance(data, dict) else failure


def llm_parse(text: str, provider: str, model: str, temperature: float = 0.0, max_tokens: int = 512) -> dict:
    """Ask an LLM provider to extract structured resume fields from *text*.

    Args:
        text: Raw resume text sent to the model.
        provider: One of 'OpenAI', 'Gemini', 'Anthropic', 'Groq', 'HuggingFace'.
        model: Provider-specific model identifier.
        temperature: Sampling temperature forwarded to the provider.
        max_tokens: Upper bound on generated tokens forwarded to the provider.

    Returns:
        The decoded JSON object on success. When the reply cannot be decoded
        as a JSON object, a dict with an 'error' message (and usually the
        'raw' reply) is returned instead.

    Raises:
        ValueError: If *provider* is not one of the supported names.
        Exception: Provider SDK errors propagate, except for HuggingFace,
            whose errors are reported in the returned 'error' dict.
    """
    schema = {
        'name': 'string', 'email': 'string', 'phone': 'string', 'location': 'string',
        'summary': 'string', 'skills': ['string'], 'experience': [{'company':'string','title':'string','start':'string','end':'string','desc':'string'}],
        'education': [{'school':'string','degree':'string','year':'string'}], 'links': ['string']
    }
    instruction = (
        "Extract structured resume information as strict JSON matching this schema keys: "
        f"{list(schema.keys())}. Use empty strings/lists when unknown."
    )
    prompt = instruction + "\n\n" + text

    if provider == 'OpenAI':
        from openai import OpenAI
        # The key is passed explicitly rather than written into os.environ.
        client = OpenAI(api_key=_get_api_key('OPENAI_API_KEY'))
        resp = client.chat.completions.create(
            model=model,
            response_format={'type':'json_object'},
            temperature=temperature,
            max_tokens=max_tokens,
            messages=[{"role":"system","content":instruction},{"role":"user","content":text}],
        )
        return _parse_json_reply(resp.choices[0].message.content, 'OpenAI')

    if provider == 'Gemini':
        import google.generativeai as genai
        genai.configure(api_key=_get_api_key('GEMINI_API_KEY'))
        g = genai.GenerativeModel(model)
        out = g.generate_content(
            prompt,
            generation_config={'temperature': temperature, 'max_output_tokens': max_tokens},
        )
        try:
            raw = out.text
        except (AttributeError, ValueError):
            # The text accessor itself can fail (e.g. a reply without a text
            # part); keep a printable form of the response for diagnosis.
            raw = str(out)
        return _parse_json_reply(raw, 'Gemini')

    if provider == 'Anthropic':
        import anthropic
        a = anthropic.Anthropic(api_key=_get_api_key('ANTHROPIC_API_KEY'))
        out = a.messages.create(model=model, max_tokens=max_tokens, temperature=temperature,
                                messages=[{"role":"user","content":prompt}])
        raw = out.content[0].text if out.content else ''
        return _parse_json_reply(raw, 'Claude')

    if provider == 'Groq':
        from groq import Groq
        gq = Groq(api_key=_get_api_key('GROQ_API_KEY'))
        out = gq.chat.completions.create(model=model, temperature=temperature, max_tokens=max_tokens,
            messages=[{"role":"system","content":"Return ONLY valid JSON."},{"role":"user","content":prompt}],
        )
        return _parse_json_reply(out.choices[0].message.content, 'Groq')

    if provider == 'HuggingFace':
        from huggingface_hub import InferenceClient
        hf = InferenceClient(token=_get_api_key('HUGGINGFACEHUB_API_TOKEN'))
        try:
            resp = hf.chat_completion(
                model=model,
                messages=[{"role":"user","content":prompt}],
                max_tokens=max_tokens,
                # NOTE(review): some inference backends appear to reject a
                # temperature of exactly 0, so it is only forwarded when
                # positive -- confirm against the target endpoint.
                temperature=temperature if temperature > 0 else None,
            )
            choice = resp.choices[0]
            raw = choice.message['content'] if hasattr(choice, 'message') else choice['message']['content']
        except Exception as e:
            return {'error': f'HF error {e}'}
        return _parse_json_reply(raw, 'HuggingFace')

    raise ValueError('Unsupported provider')


# Matching score (JD ↔ resume) using sentence-transformers (optional)

In [None]:
def similarity_score(resume_text: str, jd_text: str) -> float:
    """Score how closely a resume matches a job description.

    The preferred path embeds both texts with the
    'sentence-transformers/all-MiniLM-L6-v2' model and returns their cosine
    similarity. If anything on that path fails (the package cannot be
    imported, the model cannot be loaded, encoding raises), the score is the
    ratio of shared to total distinct lower-cased word tokens instead.
    """
    documents = (resume_text, jd_text)
    try:
        from sentence_transformers import SentenceTransformer, util
        encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        resume_vec, jd_vec = [
            encoder.encode([doc], normalize_embeddings=True) for doc in documents
        ]
        return float(util.cos_sim(resume_vec, jd_vec)[0][0])
    except Exception:
        # Best-effort fallback: token-set overlap (intersection over union).
        resume_tokens, jd_tokens = [
            set(re.findall(r"\w+", doc.lower())) for doc in documents
        ]
        shared = len(resume_tokens & jd_tokens)
        # "or 1" avoids dividing by zero when neither text has any token.
        total = len(resume_tokens | jd_tokens) or 1
        return shared / total


# UI

In [None]:
# Page chrome. The icon and title use the "page facing up" emoji; the earlier
# text here was that emoji's UTF-8 bytes mis-decoded into Latin characters.
st.set_page_config(page_title="Resume Parser", page_icon="📄")
st.title("📄 Resume Parser & Matcher")

# Model id pre-filled in the sidebar for each provider; the user can edit it.
_DEFAULT_MODELS = {
    'OpenAI': 'gpt-4o-mini',
    'Gemini': 'gemini-1.5-flash',
    'Anthropic': 'claude-3-5-sonnet-20241022',
    'Groq': 'llama-3.1-8b-instant',
    'HuggingFace': 'Qwen/Qwen2.5-1.5B-Instruct',
}

with st.sidebar:
    st.header('LLM (optional)')
    # 'None' (the default) means regex-only parsing with no LLM call.
    prov = st.selectbox('Provider', ['None', *_DEFAULT_MODELS], index=0)
    if prov != 'None':
        model = st.text_input('Model', _DEFAULT_MODELS[prov])
    else:
        model = ''

# Resume upload on the left, optional job description on the right.
left, right = st.columns(2)
with left:
    resume_file = st.file_uploader('Upload resume (PDF/DOCX/TXT)', type=['pdf','docx','txt'])
with right:
    jd_text = st.text_area('Paste job description (optional)', height=200)


# Run parsing

In [None]:
# Parse on demand: baseline regex parse first, then an optional LLM pass whose
# non-empty fields refine the baseline, then an optional match score.
parse_clicked = st.button('Parse resume', type='primary')
if parse_clicked and resume_file is None:
    # Previously a click without an upload did nothing visible.
    st.warning('Please upload a resume before parsing.')
elif parse_clicked:
    text = extract_text(resume_file)
    parsed = parse_baseline(text)
    if prov != 'None':
        try:
            parsed_llm = llm_parse(text, prov, model)
            # Only non-empty LLM values replace baseline fields: the prompt
            # asks for empty strings/lists when unknown, and those must not
            # erase an email or phone number the regexes already found.
            for key, value in parsed_llm.items():
                if key in parsed and value:
                    parsed[key] = value
            parsed['llm'] = parsed_llm
            if 'error' in parsed_llm:
                # llm_parse reports undecodable replies as an error dict.
                st.warning(f"LLM parse failed: {parsed_llm['error']}")
        except Exception as e:
            # Top-level boundary: any provider/SDK failure degrades to the
            # baseline result instead of breaking the page.
            st.warning(f"LLM parse failed: {e}")
    st.subheader('Parsed JSON')
    st.json(parsed)
    if jd_text.strip():
        score = similarity_score(text, jd_text)
        st.metric('Match score (0-1 ~ cosine)', f"{score:.3f}")
    st.download_button('Download JSON', data=json.dumps(parsed, ensure_ascii=False, indent=2), file_name='resume.json', mime='application/json')


# Notes
# - LLM parsing may incur costs; ensure keys are set in secrets or env.
# - Improve baseline parsing by adding section detection and skill dictionaries.