1. Removed the summarization mode and related code.
2. Simplified skill extraction with a lightweight regex approach.
3. Removed the must-have/nice-to-have filters; matching is now purely semantic, and the extracted skills are shown with each result.
4. Refactored the “Clear Data” button to delete vectors only (indexes remain).


In [None]:
import os
import tempfile
import streamlit as st
import pandas as pd
from pinecone import Pinecone
import pinecone as pcone
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationSummaryMemory
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

# ─── CONFIG & SETUP ────────────────────────────────────────────────────────────
# The Pinecone API key is a secret and must never live in source control.
# Provide it through the PINECONE_API_KEY environment variable (the same
# variable the Pinecone-backed vector stores below rely on).
# NOTE(review): an earlier revision of this file embedded a real key here;
# treat that key as leaked and revoke/rotate it in the Pinecone console.
_pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not _pinecone_api_key:
    # Fail fast with an actionable message instead of an opaque client error later.
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Export it in the environment before starting the app."
    )
pc = Pinecone(api_key=_pinecone_api_key)


# Names of the two Pinecone indexes: one holds resume chunks, the other JD chunks.
RESUME_INDEX = "resume-index"
JD_INDEX     = "jd-index"

# Vector store connections (both indexes share one sentence-embedding model)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
resume_store   = PineconeVectorStore(index_name=RESUME_INDEX, embedding=embedding_model)
jd_store       = PineconeVectorStore(index_name=JD_INDEX,     embedding=embedding_model)

# ─── LLM & MEMORY-ENABLED CHAIN ────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def _load_llm_stack():
    """Build the FLAN-T5 generation stack once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the tokenizer and model would be reloaded each time.
    ``show_spinner=False`` keeps this call from emitting any UI element, since
    it runs before ``st.set_page_config``.

    Returns:
        A ``(tokenizer, model, pipeline, llm)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    generator = pipeline("text2text-generation", model=seq2seq, tokenizer=tok, max_length=256)
    return tok, seq2seq, generator, HuggingFacePipeline(pipeline=generator)


tokenizer, model, pipe, llm = _load_llm_stack()

# Keep the memory in session state so the conversation survives script reruns;
# a module-level instance would be recreated (and therefore emptied) on every
# interaction.
if "summary_memory" not in st.session_state:
    st.session_state["summary_memory"] = ConversationSummaryMemory(
        llm=llm,
        memory_key="chat_history",
        return_messages=True,
        # The chain below returns both "answer" and "source_documents"; the
        # memory must be told which one to record or saving context fails.
        output_key="answer",
    )
summary_memory = st.session_state["summary_memory"]

# The chain itself is cheap to build, so it is recreated per run around the
# persisted memory. Retrieval is restricted to resume chunks.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=resume_store.as_retriever(search_kwargs={"filter": {"doc_type": "resume"}}),
    memory=summary_memory,
    return_source_documents=True
)

# ─── SKILL EXTRACTION (simplified) ─────────────────────────────────────────────
def extract_skills_simple(text):
    """Return skill-like tokens from *text*, de-duplicated in first-seen order.

    A token is a run of letters, digits, ``-`` or ``+`` bounded by word
    boundaries. It is kept when it is at least three characters long and
    either contains a digit or contains at least one uppercase letter
    (e.g. ``"AWS"``, ``"Python"``, ``"python3"``). This is a cheap heuristic,
    not a real skill taxonomy.

    Args:
        text: Free text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of unique tokens (case-sensitive), in order of first appearance.
    """
    import re

    if not text:
        return []
    tokens = re.findall(r"\b[A-Za-z0-9\-\+]+\b", text)
    candidates = (
        tok
        for tok in tokens
        if len(tok) >= 3 and (tok.lower() != tok or any(ch.isdigit() for ch in tok))
    )
    # dict.fromkeys drops repeats while preserving order, so a skill mentioned
    # several times is stored and displayed only once.
    return list(dict.fromkeys(candidates))

# ─── STREAMLIT UI ───────────────────────────────────────────────────────────────
# Page-level settings and the main title.
st.set_page_config(
    layout="wide",
    page_icon="🧠",
    page_title="AI Resume Matcher",
)
st.title("🧠 AI Resume Matcher & Chat")

# Top-level mode switch; the selected label is compared verbatim by the
# mode branches later in the script.
mode = st.radio(
    label="🏷 Select Mode:",
    options=["Recruiter: JD → Resumes", "Candidate: Resume → JDs", "Chat with Memory"],
    horizontal=True,
)

# Sidebar cleanup: wipe every vector from both indexes while leaving the
# indexes themselves (and their configuration) in place.
with st.sidebar:
    st.markdown("## 🧹 Clear Vectors")
    with st.expander("⚠️ Remove all stored vectors (indexes remain)"):
        st.warning("Deletes all vectors but keeps index config.")
        if st.button("🗑️ Clear Data"):
            failures = []
            for idx in (RESUME_INDEX, JD_INDEX):
                try:
                    # Go through the configured client instance: it carries the
                    # API key and resolves the index host, which a bare
                    # module-level Index(...) call does not.
                    pc.Index(idx).delete(delete_all=True)
                except Exception as exc:  # UI boundary: report and try the next index
                    failures.append(idx)
                    st.error(f"Could not clear '{idx}': {exc}")
            if not failures:
                st.success("✅ All vectors cleared.")

# ─── RECRUITER MODE ───────────────────────────────────────────────────────────
def _load_uploaded_docs(upload):
    """Parse one Streamlit upload (.txt or .pdf) into LangChain documents.

    The loaders are given a filesystem path, so the upload is first written to
    a temporary file. That file is removed once loading finishes, including
    when the loader raises.
    """
    extension = upload.name.rsplit(".", 1)[-1].lower()
    with tempfile.NamedTemporaryFile(delete=False, suffix="." + extension) as tmp:
        tmp.write(upload.getvalue())
        path = tmp.name
    try:
        # Decide on the lower-cased extension so "CV.TXT" is not sent to the PDF loader.
        loader = TextLoader(path) if extension == "txt" else PyPDFLoader(path)
        return loader.load()
    finally:
        os.unlink(path)


def _upload_key(kind, upload):
    """Return a hashable identity for an upload, used to skip re-indexing on reruns."""
    # Prefer Streamlit's per-upload id when the installed version exposes one;
    # otherwise fall back to name + size.
    return (kind, getattr(upload, "file_id", None) or (upload.name, upload.size))


# Streamlit re-runs the whole script on every interaction. Remember which
# uploads were already sent to Pinecone in this session so that typing in a
# text box does not index the same files again and pile up duplicate vectors.
if "indexed_uploads" not in st.session_state:
    st.session_state["indexed_uploads"] = set()
_indexed_uploads = st.session_state["indexed_uploads"]

_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

if mode == "Recruiter: JD → Resumes":
    st.header("📥 Upload & Index Resumes")
    uploaded = st.file_uploader("Upload .txt/.pdf resumes", type=["txt","pdf"], accept_multiple_files=True)
    for f in uploaded or []:
        key = _upload_key("resume", f)
        if key in _indexed_uploads:
            st.caption(f"Already indexed: {f.name}")
            continue
        docs = _load_uploaded_docs(f)
        candidate = f.name.rsplit(".", 1)[0]
        for doc in docs:
            doc.metadata.update({
                "candidate_name": candidate,
                "doc_type": "resume",
                "source_file": f.name,
                # Skills come from the first 1000 characters of each loaded document.
                "skills": extract_skills_simple(doc.page_content[:1000])
            })
        chunks = _splitter.split_documents(docs)
        if not chunks:
            st.warning(f"No text could be extracted from {f.name}; skipped.")
            continue
        PineconeVectorStore.from_documents(documents=chunks, embedding=embedding_model, index_name=RESUME_INDEX)
        # Mark as indexed only after the upsert succeeded, so a failed attempt is retried.
        _indexed_uploads.add(key)
        st.success(f"Indexed resume: {f.name}")

    st.header("📥 Upload & Index Job Descriptions")
    jds = st.file_uploader("Upload .txt JDs", type=["txt"], accept_multiple_files=True, key="jd_up")
    if jds:
        from langchain.schema import Document
        for f in jds:
            key = _upload_key("jd", f)
            if key in _indexed_uploads:
                st.caption(f"Already indexed: {f.name}")
                continue
            # getvalue() is independent of the stream position; undecodable
            # bytes are replaced rather than aborting the whole run.
            text = f.getvalue().decode("utf-8", errors="replace")
            doc = Document(page_content=text, metadata={
                "jd_name": f.name,
                "doc_type": "jd",
                "source_file": f.name
            })
            chunks = _splitter.split_documents([doc])
            if not chunks:
                st.warning(f"No text could be extracted from {f.name}; skipped.")
                continue
            PineconeVectorStore.from_documents(documents=chunks, embedding=embedding_model, index_name=JD_INDEX)
            _indexed_uploads.add(key)
            st.success(f"Indexed JD: {f.name}")

    st.header("📄 Match Candidates by Job Description")
    jd_text = st.text_area("Paste a Job Description here:")
    if jd_text.strip():
        # Ask for five hits explicitly; the list below shows up to five.
        retr = resume_store.as_retriever(search_kwargs={"k": 5, "filter": {"doc_type": "resume"}})
        docs = retr.invoke(jd_text)
        st.subheader("🏆 Top Matches")
        if not docs:
            st.info("No matching resumes found. Upload and index resumes first.")
        else:
            jd_vec = embedding_model.embed_query(jd_text)
            # Embed all hits in one batch, then score them against the JD together.
            doc_vecs = embedding_model.embed_documents([d.page_content for d in docs])
            scores = cosine_similarity([jd_vec], doc_vecs)[0]
            for i, (doc, score) in enumerate(zip(docs[:5], scores)):
                candidate = doc.metadata.get("candidate_name", "Unknown candidate")
                st.markdown(f"**{i+1}. {candidate}** — {score:.3f}")
                st.markdown(f"🔧 Skills: {', '.join(doc.metadata.get('skills', []))}")
                with st.expander("📄 Snippet"):
                    st.write(doc.page_content[:300] + "...")

# ─── CANDIDATE MODE ───────────────────────────────────────────────────────────
elif mode == "Candidate: Resume → JDs":
    st.header("🔁 Upload Your Resume to Find Matching Jobs")
    resume_u = st.file_uploader("Upload .txt/.pdf", type=["txt","pdf"], key="rev")
    if resume_u:
        pages = _load_uploaded_docs(resume_u)
        # Join every loaded page instead of indexing [0], which raised on an
        # empty document and ignored everything after the first page.
        text = "\n".join(page.page_content for page in pages).strip()
        if not text:
            st.warning("No text could be extracted from the uploaded resume.")
        else:
            retr = jd_store.as_retriever(search_kwargs={"k": 5, "filter": {"doc_type": "jd"}})
            jd_docs = retr.invoke(text)
            st.subheader("🏅 Top Matching JDs")
            if not jd_docs:
                st.info("No matching job descriptions found. Index some JDs first.")
            for i, doc in enumerate(jd_docs[:5]):
                st.markdown(f"**{i+1}. {doc.metadata.get('jd_name','')}**")
                with st.expander("📄 Preview JD"):
                    st.write(doc.page_content.strip())

# ─── CHAT MODE ───────────────────────────────────────────────────────────────
elif mode == "Chat with Memory":
    st.header("💬 Chat about your data")
    if st.button("🧹 Clear Chat History"):
        summary_memory.clear()
        st.success("Chat history cleared!")
    query = st.text_input("Ask a question:")
    if query.strip():
        res = qa_chain.invoke({"question": query})
        st.markdown("### 🤖 Answer")
        st.write(res["answer"])
        if "source_documents" in res:
            st.markdown("#### 📂 Sources")
            for i, doc in enumerate(res["source_documents"]):
                # Show the most specific label available for the source chunk.
                name = doc.metadata.get('candidate_name') or doc.metadata.get('jd_name') or doc.metadata.get('source_file','—')
                st.markdown(f"{i+1}. {name}")
                with st.expander("📝 Preview"):
                    st.write(doc.page_content[:200] + "...")
    st.markdown("### 📝 Conversation Summary")
    st.write(summary_memory.buffer)
    st.markdown("### 🗂️ Full Chat History")
    for msg in summary_memory.chat_memory.messages:
        role = "You" if msg.type == "human" else "Bot"
        st.markdown(f"**{role}:** {msg.content}")