1.	Skill overlap scoring
2.	Auto JD skill extraction
3.	Must-have vs nice-to-have filtering
4.	Candidate profile cards
5.	Reverse JD matching (two-sided mode)

In [None]:
import os
import tempfile
import streamlit as st
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

# --- CONFIG & SETUP ---
# SECURITY: the Pinecone API key is read from the environment and must never
# be committed to source control. A key that was ever hard-coded in this file
# should be treated as leaked and rotated in the Pinecone console.
# The key stays in the PINECONE_API_KEY environment variable because the
# vector stores created further down are built without an explicit key.
_pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not _pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment before "
        "launching the app (e.g. `export PINECONE_API_KEY=...`)."
    )
pc = Pinecone(api_key=_pinecone_api_key)

# Names of the two Pinecone indexes: one stores resume chunks, the other
# stores job-description (JD) chunks.
RESUME_INDEX = "resume-index"
JD_INDEX = "jd-index"


@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Load the sentence-embedding model once per server process.

    Streamlit re-executes this script on every widget interaction; without
    caching the model would be re-instantiated on each rerun.
    ``show_spinner=False`` keeps this call from emitting any UI element
    before ``st.set_page_config`` runs later in the script.
    """
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


# Embedding model shared by both vector stores and by the scoring code.
embedding_model = _load_embedding_model()

# VectorStores: thin handles onto the existing Pinecone indexes.
resume_store = PineconeVectorStore(index_name=RESUME_INDEX, embedding=embedding_model)
jd_store = PineconeVectorStore(index_name=JD_INDEX, embedding=embedding_model)

# LLM for QA / query responses
@st.cache_resource(show_spinner=False)
def _load_llm_stack():
    """Load the flan-t5 tokenizer, model, generation pipeline and LLM wrapper.

    Cached so the weights are loaded once per server process instead of on
    every Streamlit rerun. ``show_spinner=False`` avoids emitting a UI
    element before ``st.set_page_config`` is called.

    Returns:
        A ``(tokenizer, model, pipe, llm)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    generator = pipeline(
        "text2text-generation", model=seq2seq, tokenizer=tok, max_length=256
    )
    return tok, seq2seq, generator, HuggingFacePipeline(pipeline=generator)


@st.cache_resource(show_spinner=False)
def _load_ner_model():
    """Load the NER pipeline used for skill extraction (once per process)."""
    return pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")


# Module-level names are kept so any other code in the file still finds them.
tokenizer, model, pipe, llm = _load_llm_stack()

# Retrieval chain for recruiter mode: retrieved resume chunks are "stuffed"
# into a single prompt and the source documents are returned with the answer.
resume_retriever = resume_store.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=resume_retriever,
    return_source_documents=True,
    chain_type="stuff",
)

# NER model for skill extraction
ner_model = _load_ner_model()

def extract_skills_hf(text):
    """Extract candidate skill names from free text with the NER pipeline.

    Entities that ``ner_model`` tags as ``ORG`` or ``MISC`` are treated as
    skills (a heuristic: the model is a general-purpose NER model, not a
    skill tagger).

    Args:
        text: Raw text of a resume or job description.

    Returns:
        A list of unique entity strings in order of first appearance. The
        order is deterministic so that callers slicing the list (e.g. to
        pre-select defaults) get stable results. Empty or whitespace-only
        input yields an empty list without invoking the model.
    """
    if not text or not text.strip():
        return []
    entities = ner_model(text)
    skill_words = (
        ent["word"] for ent in entities if ent["entity_group"] in ("ORG", "MISC")
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(skill_words))

def extract_skills_from_text(text):
    """Extract skills from job-description text.

    Thin alias kept for readability at JD call sites: it delegates to
    ``extract_skills_hf`` and returns that function's result unchanged.
    """
    skills = extract_skills_hf(text)
    return skills

# --- STREAMLIT UI ---
# Page chrome first (tab title, icon, wide layout), then the on-page heading.
st.set_page_config(layout="wide", page_icon="🧠", page_title="AI Resume Matcher")
st.title("🧠 AI Resume Matcher with HuggingFace + Pinecone")

# Mode switcher: the selected label decides which half of the app renders.
mode = st.radio(
    label="🧭 Select Mode:",
    options=["Recruiter: Match JD → Resumes", "Candidate: Match Resume → JDs"],
    horizontal=True,
)

# Sidebar: Cleanup Tools and Skill Filter
with st.sidebar:
    st.markdown("## 🧹 Cleanup Tools")
    with st.expander("⚠️ Reset Indexes"):
        st.warning("This will permanently delete and recreate both Pinecone indexes.")
        confirm = st.checkbox("I understand and want to proceed")
        pwd = st.text_input("Admin password:", type="password")
        # SECURITY: the admin password comes from the environment rather than
        # being hard-coded in source. If it is not configured, the destructive
        # reset stays disabled.
        admin_password = os.getenv("RESET_ADMIN_PASSWORD")
        if confirm and pwd:
            if not admin_password:
                st.error("🚫 Index reset is disabled: RESET_ADMIN_PASSWORD is not configured.")
            elif pwd == admin_password:
                if st.button("❌ Confirm Full Reset"):
                    with st.spinner("Resetting indexes..."):
                        # Only delete indexes that actually exist; deleting a
                        # missing index raises and would abort the reset.
                        existing = set(pc.list_indexes().names())
                        for idx in [RESUME_INDEX, JD_INDEX]:
                            if idx in existing:
                                pc.delete_index(idx)
                            # NOTE(review): deletion may still be in progress
                            # when create_index runs — confirm whether a short
                            # wait/retry is needed for your Pinecone project.
                            pc.create_index(
                                name=idx,
                                dimension=384,  # must match the embedding model's vector size
                                metric="cosine",
                                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
                            )
                        st.success("✅ Both indexes cleared and recreated.")
            else:
                st.error("🚫 Incorrect password.")

    st.markdown("## 🎯 Global Skill Filter")
    # Define a superset or derive from JD store later
    ALL_SKILLS = ["Python", "Docker", "Streamlit", "LangChain", "Pinecone", "Transformers", "Kubernetes"]
    # "Show All" disables the filter; any other value restricts the full match list.
    selected_skill = st.selectbox("Filter candidates by skill:", ["Show All"] + ALL_SKILLS)

# --- RECRUITER MODE: JD → Resumes ---
import hashlib


def _stable_chunk_ids(file_name, count):
    """Return deterministic vector IDs for the ``count`` chunks of one file.

    Streamlit re-runs the script on every interaction while the uploader
    keeps returning the same files. With stable IDs a repeated upsert
    overwrites the previous vectors instead of adding duplicates.
    IDs are hex digests so they are plain ASCII whatever the file name is.
    """
    return [
        hashlib.sha256(f"{file_name}:{i}".encode("utf-8")).hexdigest()
        for i in range(count)
    ]


def _load_uploaded_documents(uploaded_file):
    """Load an uploaded .txt/.pdf file into LangChain documents.

    The loaders need a filesystem path, so the upload is written to a
    temporary file which is always removed afterwards.
    """
    suffix = os.path.splitext(uploaded_file.name)[1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
        path = tmp.name
    try:
        loader = TextLoader(path) if suffix == ".txt" else PyPDFLoader(path)
        return loader.load()
    finally:
        os.remove(path)


if mode == "Recruiter: Match JD → Resumes":
    from langchain.schema import Document

    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

    st.header("📥 Upload & Index Resumes")
    uploaded = st.file_uploader("Upload .txt or .pdf resumes", type=["txt", "pdf"], accept_multiple_files=True)
    for f in uploaded or []:
        docs = _load_uploaded_documents(f)
        candidate = f.name.rsplit(".", 1)[0]
        for doc in docs:
            doc.metadata["candidate_name"] = candidate
            # Skills are extracted from the first 1000 characters of each
            # loaded document and inherited by every chunk split from it.
            doc.metadata["skills"] = extract_skills_hf(doc.page_content[:1000])
        chunks = splitter.split_documents(docs)
        if not chunks:
            st.warning(f"No text found in {f.name}; skipped.")
            continue
        PineconeVectorStore.from_documents(
            documents=chunks,
            embedding=embedding_model,
            index_name=RESUME_INDEX,
            ids=_stable_chunk_ids(f.name, len(chunks)),
        )
        st.success(f"Indexed {f.name}")

    st.header("📥 Upload & Index Job Descriptions")
    jd_files = st.file_uploader("Upload .txt JDs", type=["txt"], accept_multiple_files=True, key="jd_up")
    for f in jd_files or []:
        # getvalue() is independent of the read position, so it still returns
        # the full content on reruns; undecodable bytes are replaced instead
        # of crashing the app.
        text = f.getvalue().decode("utf-8", errors="replace")
        doc = Document(page_content=text, metadata={"jd_name": f.name})
        chunks = splitter.split_documents([doc])
        if not chunks:
            st.warning(f"No text found in {f.name}; skipped.")
            continue
        PineconeVectorStore.from_documents(
            documents=chunks,
            embedding=embedding_model,
            index_name=JD_INDEX,
            ids=_stable_chunk_ids(f.name, len(chunks)),
        )
        st.success(f"Indexed JD {f.name}")

    st.header("📄 Match Candidates by Job Description")
    jd_text = st.text_area("Paste a Job Description")
    if jd_text.strip():
        # Phase 2: extract JD skills.
        # st.button is True only for the run triggered by the click, so the
        # extracted skills are kept in session state (tagged with the JD text
        # they came from) to survive later reruns.
        skills_state = st.session_state.get("extracted_jd_skills")
        if st.button("🧠 Extract Skills from JD"):
            skills_state = {"jd_text": jd_text, "skills": extract_skills_from_text(jd_text)}
            st.session_state["extracted_jd_skills"] = skills_state
            st.success("Extracted: " + ", ".join(skills_state["skills"]))
        if skills_state and skills_state["jd_text"] == jd_text:
            jd_skills = list(skills_state["skills"])
        else:
            jd_skills = []

        custom = st.text_input("✏️ Add/Edit JD skills (comma-separated):")
        if custom:
            # A non-empty custom list replaces the extracted one; blank
            # entries (e.g. from a trailing comma) and duplicates are dropped.
            jd_skills = list(dict.fromkeys(s.strip() for s in custom.split(",") if s.strip()))

        st.subheader("🧪 Skill Filtering")
        must_have = st.multiselect("✅ Must-Have", jd_skills, default=jd_skills[:2])
        nice = st.multiselect("👍 Nice-to-Have", [s for s in jd_skills if s not in must_have])

        # embed JD
        jd_vec = embedding_model.embed_query(jd_text)
        # search top chunks
        results = resume_store.similarity_search_with_score(jd_text, k=10)

        # Skill names are compared case-insensitively ("python" == "Python").
        must_set = {s.casefold() for s in must_have}
        nice_set = {s.casefold() for s in nice}
        # Best achievable weighted overlap, so a chunk matching every listed
        # skill scores exactly 1.0 (nice-to-haves count half).
        max_overlap = len(must_set) + 0.5 * len(nice_set)

        # NOTE: results are resume *chunks*, so one candidate may appear more
        # than once in the lists below.
        scored = []
        for doc, _ in results:
            doc_sk = {str(s).casefold() for s in (doc.metadata.get("skills") or [])}
            # filter must-have (checked first so rejected chunks are never embedded)
            if not must_set <= doc_sk:
                continue
            # semantic
            cos = cosine_similarity([jd_vec], [embedding_model.embed_query(doc.page_content)])[0][0]
            # overlap scoring
            mh = len(must_set & doc_sk)
            nh = len(nice_set & doc_sk)
            ov = (mh + 0.5 * nh) / max_overlap if max_overlap else 0
            final = 0.7 * cos + 0.3 * ov
            scored.append((doc, final))

        scored.sort(key=lambda pair: pair[1], reverse=True)

        # Top 3 dashboard
        st.subheader("🏆 Top 3 Matches")
        for i, (doc, score) in enumerate(scored[:3]):
            name = doc.metadata.get("candidate_name", "Unknown")
            st.markdown(f"**{i+1}. {name}** — Score: {score:.3f}")
            st.markdown(f"🔧 Skills: {', '.join(doc.metadata.get('skills') or [])}")
            st.markdown("---")

        # Full list + CSV export
        st.subheader("📂 All Matches")
        export = []
        for i, (doc, score) in enumerate(scored):
            sks = doc.metadata.get("skills") or []
            # The sidebar filter hides rows but keeps the overall rank number.
            if selected_skill != "Show All" and selected_skill not in sks:
                continue
            name = doc.metadata.get("candidate_name", "Unknown")
            st.markdown(f"**{i+1}. {name}** — Score: {score:.3f}")
            st.markdown(f"🔧 Skills: {', '.join(sks)}")
            with st.expander("📄 Snippet"):
                st.write(doc.page_content.strip())
            export.append({
                "Rank": i + 1,
                "Candidate": name,
                "Skills": ", ".join(sks),
                "Score": f"{score:.3f}",
                "Snippet": doc.page_content[:200],
            })
        if export:
            df = pd.DataFrame(export)
            csv = df.to_csv(index=False).encode("utf-8")
            st.download_button("📥 Download Matches CSV", csv, "matches.csv", "text/csv")

# --- CANDIDATE MODE: Resume → JDs ---
else:
    st.header("🔁 Upload a Resume to Find Matching Jobs")
    rev = st.file_uploader("Upload .txt or .pdf resume", type=["txt","pdf"], key="rev")
    if rev:
        with tempfile.NamedTemporaryFile(delete=False, suffix="."+rev.name.split(".")[-1]) as tmp:
            tmp.write(rev.getvalue())
            path = tmp.name
        loader = TextLoader(path) if rev.name.endswith(".txt") else PyPDFLoader(path)
        text = loader.load()[0].page_content
        vec  = embedding_model.embed_query(text)
        jd_results = jd_store.similarity_search_with_score(text, k=10)

        st.subheader("🏅 Top Matching JDs")
        for i, (jd_doc, score) in enumerate(jd_results[:5]):
            st.markdown(f"**{i+1}. {jd_doc.metadata.get('jd_name')}** — Score: {score:.3f}")
            with st.expander("📄 Preview JD"):
                st.write(jd_doc.page_content.strip())