1. Resume upload + HuggingFace-based skill extraction
2. JD/query match with score and skill display
3. Skill-based filtering
4. CSV download of matched candidates
5. Dashboard-style top 3 ranking

In [None]:
import os
import tempfile
import streamlit as st
import pandas as pd
from pinecone import Pinecone
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

# --- INIT ---
# SECURITY: the Pinecone API key must come from the environment, never from
# source control. A real key used to be hard-coded on this line; treat that
# key as leaked and rotate it in the Pinecone console.
if not os.environ.get("PINECONE_API_KEY"):
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment before starting the app."
    )
pc = Pinecone()
index_name = "resume-index"


@st.cache_resource(show_spinner=False)
def _load_models():
    """Load the embedding, text-generation and NER models a single time.

    Streamlit re-executes this script on every widget interaction; without
    caching, all three HuggingFace models would be re-instantiated each time.
    The spinner is disabled so that no UI element is emitted before
    ``st.set_page_config`` runs further down the script.

    Returns:
        Tuple ``(embeddings, seq2seq_tokenizer, seq2seq_model, generator, ner)``.
    """
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    seq2seq_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
    seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    generator = pipeline(
        "text2text-generation",
        model=seq2seq_model,
        tokenizer=seq2seq_tokenizer,
        max_length=256,
    )
    ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
    return embeddings, seq2seq_tokenizer, seq2seq_model, generator, ner


embedding_model, tokenizer, model, pipe, ner_model = _load_models()
llm = HuggingFacePipeline(pipeline=pipe)

# The Pinecone-backed objects are deliberately rebuilt on every run (not
# cached) so they never hold a handle to an index that was deleted and
# recreated by the sidebar reset tool.
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embedding_model)
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True, chain_type="stuff")

def extract_skills_hf(text):
    """Return the entity words that the NER model labels ORG or MISC.

    Those two entity groups are used as a stand-in for "skills" (tools,
    frameworks, products) mentioned in a resume.

    Args:
        text: Resume text to scan. ``None``, empty and whitespace-only input
            is accepted and short-circuits without running the model.

    Returns:
        A list of entity strings in model output order. Duplicates are kept;
        callers de-duplicate if they need to.
    """
    # Pages with no extractable text (e.g. scanned PDFs) yield empty strings;
    # there is nothing to tag, so skip the model call entirely.
    if not text or not text.strip():
        return []

    skill_groups = {"ORG", "MISC"}
    return [ent["word"] for ent in ner_model(text) if ent["entity_group"] in skill_groups]

# --- STREAMLIT CONFIG ---
st.set_page_config(page_title="AI Resume Matcher", page_icon="🧠")
st.title("🧠 AI Resume Matcher with HuggingFace + Pinecone")

# --- SIDEBAR: RESET + FILTER ---
# Destructive admin tool: drops the whole Pinecone index and recreates it
# empty. Guarded by an explicit checkbox, a password and a confirm button.
st.sidebar.markdown("## 🧹 Cleanup Tools")
with st.sidebar.expander("⚠️ Reset Pinecone Resume Index"):
    st.warning("This will permanently delete all stored resumes from Pinecone and reset the index.")
    confirm = st.checkbox("I understand and want to proceed")
    password = st.text_input("Enter admin password to confirm:", type="password")

    # SECURITY: the admin password is supplied via the environment instead of
    # being a literal in the source. With no password configured the reset is
    # refused rather than falling back to a guessable default.
    admin_password = os.environ.get("RESUME_ADMIN_PASSWORD")

    if confirm and password:
        if not admin_password:
            st.error("🚫 Reset is disabled: RESUME_ADMIN_PASSWORD is not configured.")
        elif password == admin_password:
            if st.button("❌ Confirm Reset"):
                with st.spinner("Resetting Pinecone index..."):
                    pc.delete_index(index_name)
                    pc.create_index(
                        name=index_name,
                        # 384 is the output size of all-MiniLM-L6-v2, the
                        # sentence-embedding model this app indexes with.
                        dimension=384,
                        metric="cosine",
                        # A dict spec must be keyed by deployment type
                        # ("serverless" or "pod"); a flat cloud/region dict is
                        # rejected, which would leave the index deleted.
                        spec={"serverless": {"cloud": "aws", "region": "us-east-1"}},
                    )
                    st.success("✅ Index cleared.")
        else:
            st.error("🚫 Incorrect password.")

# --- Resume Upload ---
# Each uploaded resume is written to a temporary file (the LangChain loaders
# read from a path), tagged with candidate name and extracted skills, split
# into overlapping chunks and upserted into the Pinecone index.
st.markdown("### 📤 Upload Resumes")
uploaded_files = st.file_uploader("Upload one or more .txt or .pdf resumes", type=["txt", "pdf"], accept_multiple_files=True)

if uploaded_files:
    # Streamlit re-runs the whole script on every interaction while the
    # uploader still holds its files. Remember what this session has already
    # indexed so the same resume is not embedded and upserted again.
    if "indexed_uploads" not in st.session_state:
        st.session_state["indexed_uploads"] = set()
    indexed_uploads = st.session_state["indexed_uploads"]

    for uploaded_file in uploaded_files:
        # file_id is unique per upload event where Streamlit provides it;
        # fall back to (name, size) on versions that do not expose it.
        upload_key = getattr(uploaded_file, "file_id", None) or (uploaded_file.name, uploaded_file.size)

        if upload_key not in indexed_uploads:
            candidate, extension = os.path.splitext(uploaded_file.name)

            with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                file_path = tmp_file.name

            try:
                # Compare the extension case-insensitively so "CV.TXT" is not
                # handed to the PDF loader.
                loader = TextLoader(file_path) if extension.lower() == ".txt" else PyPDFLoader(file_path)
                docs = loader.load()
            finally:
                # delete=False means the file outlives the context manager;
                # remove it once it has been read (or loading has failed).
                os.unlink(file_path)

            for doc in docs:
                doc.metadata["candidate_name"] = candidate
                # Only the first 1000 characters of each loaded document are
                # scanned for skills.
                skill_text = doc.page_content[:1000]
                extracted_skills = extract_skills_hf(skill_text)
                # sorted() gives the de-duplicated skills a stable order.
                doc.metadata["skills"] = sorted(set(extracted_skills))

            chunks = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50).split_documents(docs)

            PineconeVectorStore.from_documents(
                documents=chunks,
                embedding=embedding_model,
                index_name=index_name
            )
            indexed_uploads.add(upload_key)

        st.success(f"✅ `{uploaded_file.name}` uploaded and indexed.")

# --- Skill Filter ---
# Fixed list of skills offered in the sidebar; "Show All" turns filtering off.
all_skills = [
    "Python",
    "Docker",
    "Streamlit",
    "LangChain",
    "Pinecone",
    "Transformers",
    "Kubernetes",
]
skill_options = ["Show All", *all_skills]
selected_skill = st.sidebar.selectbox("🎯 Filter by Skill", skill_options)

# --- JD Match (with ranking) ---
# Retrieves the 5 stored documents nearest to the pasted job description,
# re-scores each one by cosine similarity against the job description
# embedding, then shows a top-3 summary, a skill-filtered list and a CSV export.
st.markdown("### 📄 Match Candidates by Job Description")
jd_text = st.text_area("Paste a job description here:")

if jd_text:
    jd_vector = embedding_model.embed_query(jd_text)
    search_results = vectorstore.similarity_search_with_score(jd_text, k=5)

    # The score returned by the vector store is ignored; similarity is
    # recomputed locally from fresh embeddings of each returned document.
    scored_candidates = []
    for doc, _ in search_results:
        doc_vector = embedding_model.embed_query(doc.page_content)
        score = cosine_similarity([jd_vector], [doc_vector])[0][0]
        scored_candidates.append((doc, score))

    scored_candidates.sort(key=lambda pair: pair[1], reverse=True)

    if not scored_candidates:
        # Empty index (or nothing retrievable): say so instead of rendering
        # empty section headers.
        st.info("No matching resumes found. Upload resumes first, then try again.")

    # NOTE(review): the index stores resume chunks, so the same candidate can
    # occupy more than one rank below.

    # 🎯 Top 3 Dashboard
    st.markdown("### 🏆 Top Matches")
    for i, (doc, score) in enumerate(scored_candidates[:3]):
        top_skills = doc.metadata.get("skills", [])
        st.markdown(f"**{i+1}. Candidate:** `{doc.metadata.get('candidate_name')}`")
        # Fall back to "—" like the full list does; an empty code span would
        # otherwise render as stray backticks.
        st.markdown(f"🔧 Skills: `{', '.join(top_skills) if top_skills else '—'}`")
        # Fixed: a stray trailing backtick used to break the inline-code markup.
        st.markdown(f"📊 Score: `{score:.3f}`")
        st.markdown("---")

    # 📂 Full List (Filtered)
    st.markdown("### 📂 All Matches (Filtered)")
    export_data = []
    for i, (doc, score) in enumerate(scored_candidates):
        doc_skills = doc.metadata.get("skills", [])
        if selected_skill != "Show All" and selected_skill not in doc_skills:
            continue

        # Rank is the position in the unfiltered ranking, so numbers may skip
        # when a skill filter is active.
        st.markdown(f"**{i+1}. Candidate:** `{doc.metadata.get('candidate_name')}`")
        st.markdown(f"🔧 Skills: {', '.join(doc_skills) if doc_skills else '—'}")
        with st.expander("📄 Snippet"):
            st.write(doc.page_content.strip())

        export_data.append({
            "Rank": i+1,
            "Candidate": doc.metadata.get("candidate_name"),
            "Skills": ", ".join(doc_skills),
            "Score": f"{score:.3f}",
            "Snippet": doc.page_content[:250]
        })

    # Offer the download only when at least one row survived the filter.
    if export_data:
        df = pd.DataFrame(export_data)
        csv = df.to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download Matches (CSV)", csv, "matched_candidates.csv", "text/csv")

# --- Direct Search by Query (optional)
# Free-text question answered by the retrieval QA chain; the documents the
# chain retrieved are listed underneath, subject to the sidebar skill filter.
st.markdown("### 🔍 Search by Tool or Experience")
query = st.text_input("Enter a skill or tool:")

if query:
    response = qa_chain.invoke({"query": query})
    st.markdown("### 🤖 Answer:")
    st.write(response["result"])

    st.markdown("### 📂 Matches:")
    filter_disabled = selected_skill == "Show All"
    for position, source_doc in enumerate(response["source_documents"], start=1):
        source_skills = source_doc.metadata.get("skills", [])
        # Numbering follows the retrieval order, so it can skip when filtered.
        if filter_disabled or selected_skill in source_skills:
            candidate_label = source_doc.metadata.get('candidate_name')
            skills_label = ', '.join(source_skills) if source_skills else '—'
            st.markdown(f"**{position}. Candidate:** `{candidate_label}`")
            st.markdown(f"🔧 Skills: {skills_label}")
            with st.expander("📄 Snippet"):
                st.write(source_doc.page_content.strip())

You now have:

    • 🔥 Skill-aware filtering
    • 📊 Top 3 dashboard
    • 📥 CSV downloads
    • 📂 Query- and JD-based semantic search