# In [None]:
import os
import tempfile
import streamlit as st
import pandas as pd
from pinecone import Pinecone
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationSummaryMemory
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity

# ─── CONFIG & SETUP ────────────────────────────────────────────────────────────
# Only fall back to the placeholder when no key is configured: a plain
# assignment here would overwrite a real key already exported in the environment.
os.environ.setdefault("PINECONE_API_KEY", "your-pinecone-api-key")  # ← replace with your key
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

RESUME_INDEX = "resume-index"
JD_INDEX     = "jd-index"


# show_spinner=False keeps this early call from drawing a UI element before
# st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_embedding_model():
    """Build the sentence-embedding model once and reuse it across script reruns."""
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


# Vector store connections (both indexes share the same embedding model)
embedding_model = _load_embedding_model()
resume_store   = PineconeVectorStore(index_name=RESUME_INDEX, embedding=embedding_model)
jd_store       = PineconeVectorStore(index_name=JD_INDEX,     embedding=embedding_model)

# ─── LLM & MEMORY-ENABLED CHAIN ────────────────────────────────────────────────
# show_spinner=False keeps this early call from drawing a UI element before
# st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_llm_stack():
    """Load the FLAN-T5 tokenizer, model, generation pipeline and LangChain wrapper.

    Cached so the weights are loaded once instead of on every script rerun.

    Returns:
        A ``(tokenizer, model, pipe, llm)`` tuple.
    """
    flan_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
    flan_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    flan_pipe = pipeline("text2text-generation", model=flan_model,
                         tokenizer=flan_tokenizer, max_length=256)
    return flan_tokenizer, flan_model, flan_pipe, HuggingFacePipeline(pipeline=flan_pipe)


tokenizer, model, pipe, llm = _load_llm_stack()

# Streamlit re-executes this script on every interaction, so a memory object
# created at module level would start empty on each question. Keeping it in
# session_state lets the conversation survive reruns, one memory per browser session.
if "summary_memory" not in st.session_state:
    st.session_state["summary_memory"] = ConversationSummaryMemory(
        llm=llm,
        memory_key="chat_history",
        return_messages=True,
        # The chain below returns both "answer" and "source_documents"; the
        # memory has to be told which of the two to record.
        output_key="answer",
    )
summary_memory = st.session_state["summary_memory"]

# The chain itself is cheap to assemble, so it is rebuilt per run around the
# persistent memory and the current resume_store.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=resume_store.as_retriever(search_kwargs={"filter": {"doc_type": "resume"}}),
    memory=summary_memory,
    return_source_documents=True,
)

# ─── SKILL EXTRACTION ──────────────────────────────────────────────────────────
# show_spinner=False keeps this early call from drawing a UI element before
# st.set_page_config() runs further down the script.
@st.cache_resource(show_spinner=False)
def _load_ner_model():
    """Build the NER pipeline once and reuse it across script reruns."""
    return pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")


ner_model = _load_ner_model()


def extract_skills_hf(text):
    """Return the distinct ORG / MISC entity strings the NER model finds in *text*.

    Entities tagged ORG or MISC are used as a proxy for skills. Duplicates are
    dropped and the result keeps first-occurrence order, so callers that slice
    the list (e.g. to pick default selections) get a stable result.

    Args:
        text: Free text to scan; ``None`` or blank text yields an empty list.

    Returns:
        A list of unique entity strings in order of first appearance.
    """
    if not text or not text.strip():
        return []
    skill_groups = {"ORG", "MISC"}
    ents = ner_model(text)
    # dict.fromkeys de-duplicates like a set but preserves insertion order.
    return list(dict.fromkeys(e["word"] for e in ents if e["entity_group"] in skill_groups))

# ─── SUMMARIZER ─────────────────────────────────────────────────────────────────
@st.cache_resource
def load_summarizer():
    """Create the FLAN-T5 summarization pipeline; Streamlit caches the returned object."""
    summarization_pipeline = pipeline(task="summarization", model="google/flan-t5-base")
    return summarization_pipeline


summarizer = load_summarizer()

# ─── STREAMLIT UI ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="AI Resume Matcher & NLP Toolkit", page_icon="🧠", layout="wide")
st.title("🧠 AI Resume Matcher & NLP Toolkit")

# Mode selector: the chosen label drives the if/elif chain below.
mode = st.radio(
    "🏷 Select Mode:",
    [
        "Recruiter: JD → Resumes",
        "Candidate: Resume → JDs",
        "Chat with Memory",
        "Summarize Text"
    ],
    horizontal=True
)

# Sidebar: reset & filters
with st.sidebar:
    st.markdown("## 🧹 Cleanup Tools")
    with st.expander("⚠️ Reset Indexes"):
        st.warning("Deletes and recreates both Pinecone indexes.")
        confirm = st.checkbox("I understand")
        pwd     = st.text_input("Admin password:", type="password")
        # NOTE(security): the password can be supplied via ADMIN_PASSWORD; the
        # historical literal is only a fallback and should not be used in production.
        admin_password = os.getenv("ADMIN_PASSWORD", "123")
        if confirm and pwd:
            # Check the password on its own so that a correct password with the
            # button not yet clicked is not reported as wrong.
            if pwd != admin_password:
                st.error("❌ Wrong password")
            elif st.button("Confirm Reset"):
                from pinecone import ServerlessSpec

                with st.spinner("Resetting indexes..."):
                    existing = set(pc.list_indexes().names())
                    for idx in [RESUME_INDEX, JD_INDEX]:
                        # Deleting an index that does not exist raises, which
                        # would make a first-time reset fail.
                        if idx in existing:
                            pc.delete_index(idx)
                        # 384 is the vector size configured for these indexes —
                        # presumably the all-MiniLM-L6-v2 embedding width; confirm
                        # if the embedding model changes.
                        pc.create_index(name=idx, dimension=384, metric="cosine",
                                        spec=ServerlessSpec(cloud="aws", region="us-east-1"))
                    st.success("✅ Indexes reset.")

    st.markdown("## 🎯 Global Skill Filter")
    ALL_SKILLS     = ["Python","Docker","Streamlit","LangChain","Pinecone","Transformers","Kubernetes"]
    selected_skill = st.selectbox("Filter candidates by skill:", ["Show All"] + ALL_SKILLS)

# ─── RECRUITER MODE: JD → Resumes ───────────────────────────────────────────────
if mode == "Recruiter: JD → Resumes":
    # The script reruns on every widget interaction while the uploaders keep
    # their files, so remember which uploads were already indexed in this
    # session instead of upserting duplicates each time.
    if "indexed_uploads" not in st.session_state:
        st.session_state["indexed_uploads"] = set()
    indexed_uploads = st.session_state["indexed_uploads"]
    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

    st.header("📥 Upload & Index Resumes")
    resumes = st.file_uploader("Upload resumes (.txt/.pdf)", type=["txt","pdf"], accept_multiple_files=True)
    if resumes:
        for f in resumes:
            # Prefer the per-upload id when the object exposes one, so that
            # re-adding a file counts as a new upload; fall back to name + size.
            upload_key = ("resume", getattr(f, "file_id", None) or (f.name, f.size))
            if upload_key in indexed_uploads:
                continue
            # The loaders read from a path, so spill the upload to a temp file...
            with tempfile.NamedTemporaryFile(delete=False, suffix="."+f.name.split(".")[-1]) as tmp:
                tmp.write(f.getvalue())
                path = tmp.name
            try:
                loader = TextLoader(path) if f.name.endswith(".txt") else PyPDFLoader(path)
                docs = loader.load()
            finally:
                # ...and remove it again: delete=False would otherwise leak one file per upload.
                os.unlink(path)
            candidate = f.name.rsplit(".",1)[0]
            for doc in docs:
                doc.metadata.update({
                    "candidate_name": candidate,
                    "doc_type": "resume",
                    "source_file": f.name,
                    # Skills are extracted from the first 1000 characters only.
                    "skills": extract_skills_hf(doc.page_content[:1000])
                })
            chunks = splitter.split_documents(docs)
            PineconeVectorStore.from_documents(documents=chunks, embedding=embedding_model,
                                               index_name=RESUME_INDEX)
            indexed_uploads.add(upload_key)
            st.success(f"Indexed resume: {f.name}")

    st.header("📥 Upload & Index Job Descriptions")
    jds = st.file_uploader("Upload JDs (.txt)", type=["txt"], accept_multiple_files=True, key="jd_up")
    if jds:
        from langchain.schema import Document
        for f in jds:
            upload_key = ("jd", getattr(f, "file_id", None) or (f.name, f.size))
            if upload_key in indexed_uploads:
                continue
            # getvalue() returns the full content regardless of the read position.
            text = f.getvalue().decode("utf-8")
            doc  = Document(page_content=text, metadata={
                "jd_name": f.name,
                "doc_type": "jd",
                "source_file": f.name
            })
            chunks = splitter.split_documents([doc])
            PineconeVectorStore.from_documents(documents=chunks, embedding=embedding_model,
                                               index_name=JD_INDEX)
            indexed_uploads.add(upload_key)
            st.success(f"Indexed JD: {f.name}")

    st.header("📄 Match Candidates by Job Description")
    jd_text = st.text_area("Paste a Job Description here:")
    if jd_text:
        if st.button("🧠 Extract Skills"):
            # Persist the result: the button is only True for the single rerun
            # triggered by the click, so a plain variable would be lost as soon
            # as any other widget is touched.
            st.session_state["jd_skills"] = extract_skills_hf(jd_text)
            st.success("Extracted: " + ", ".join(st.session_state["jd_skills"]))
        jd_skills = st.session_state.get("jd_skills", [])
        edit = st.text_input("✏️ Edit JD skills (comma-separated):")
        if edit:
            # A manual list overrides the extracted one; blank entries are dropped.
            jd_skills = [s.strip() for s in edit.split(",") if s.strip()]

        st.subheader("🧪 Skill Filtering")
        must_have    = st.multiselect("✅ Must-Have", jd_skills, default=jd_skills[:2])
        nice_to_have = st.multiselect("👍 Nice-to-Have", [s for s in jd_skills if s not in must_have])

        search_filter = {"doc_type": "resume"}
        if jd_skills:
            # An empty $in list cannot match any resume, so only constrain by
            # skill when there is at least one skill to match.
            search_filter["skills"] = {"$in": jd_skills}
        retr = resume_store.as_retriever(search_kwargs={"filter": search_filter})
        jd_vec = embedding_model.embed_query(jd_text)
        docs   = retr.invoke(jd_text)

        must_set = set(must_have)
        nice_set = set(nice_to_have)
        requested = len(must_have) + len(nice_to_have)

        scored = []
        for doc in docs:
            # Skills are stored as a list in metadata; convert to a set because
            # `set & list` raises TypeError.
            ds = set(doc.metadata.get("skills", []))
            if must_set and not must_set.issubset(ds):
                continue
            cos = cosine_similarity([jd_vec], [embedding_model.embed_query(doc.page_content)])[0][0]
            # Overlap: a must-have hit counts 1, a nice-to-have hit 0.5,
            # normalised by the number of requested skills.
            ov = (len(must_set & ds) + 0.5*len(nice_set & ds)) / requested if requested else 0
            final = 0.7*cos + 0.3*ov
            scored.append((doc, final))

        scored.sort(key=lambda x: x[1], reverse=True)

        st.subheader("🏆 Top 3 Matches")
        for i,(doc,score) in enumerate(scored[:3]):
            st.markdown(f"**{i+1}. [{doc.metadata['doc_type']}] {doc.metadata['candidate_name']}** — {score:.3f}")
            st.markdown(f"🔧 Skills: {', '.join(doc.metadata.get('skills', []))}")
            st.markdown("---")

        st.subheader("📂 All Matches")
        export = []
        for i,(doc,score) in enumerate(scored):
            ds = doc.metadata.get("skills", [])
            # Sidebar filter: hide matches that lack the globally selected skill.
            if selected_skill != "Show All" and selected_skill not in ds:
                continue
            st.markdown(f"**{i+1}. [{doc.metadata['doc_type']}] {doc.metadata['candidate_name']}** — {score:.3f}")
            st.markdown(f"🔧 Skills: {', '.join(ds)}")
            with st.expander("📝 Preview"):
                st.write(doc.page_content[:300] + "...")
            export.append({
                "Rank": i+1,
                "Candidate": doc.metadata["candidate_name"],
                "Skills": ",".join(ds),
                "Score": f"{score:.3f}",
                "Source": doc.metadata["source_file"]
            })
        if export:
            df  = pd.DataFrame(export)
            csv = df.to_csv(index=False).encode("utf-8")
            st.download_button("📥 Download CSV", csv, "matches.csv", "text/csv")

# ─── CANDIDATE MODE: Resume → JDs ───────────────────────────────────────────────
elif mode == "Candidate: Resume → JDs":
    st.header("🔁 Upload Your Resume to Find Matching Jobs")
    resume_u = st.file_uploader("Upload your resume (.txt/.pdf)", type=["txt","pdf"], key="rev")
    if resume_u:
        with tempfile.NamedTemporaryFile(delete=False, suffix="."+resume_u.name.split(".")[-1]) as tmp:
            tmp.write(resume_u.getvalue()); path = tmp.name
        loader = TextLoader(path) if resume_u.name.endswith(".txt") else PyPDFLoader(path)
        text    = loader.load()[0].page_content
        retr    = jd_store.as_retriever(search_kwargs={"filter":{"doc_type":"jd"}})
        jd_docs = retr.get_relevant_documents(text)
        st.subheader("🏅 Top Matching JDs")
        for i,doc in enumerate(jd_docs[:5]):
            st.markdown(f"**{i+1}. [{doc.metadata['doc_type']}] {doc.metadata.get('jd_name','')}**")
            with st.expander("📄 Preview JD"):
                st.write(doc.page_content.strip())

# ─── CHAT MODE: Conversational QA w/ Memory ────────────────────────────────────
elif mode == "Chat with Memory":
    st.header("💬 Conversational QA with Summary Memory")
    if st.button("🧹 Clear Chat History"):
        summary_memory.clear()
        st.success("Chat history cleared!")
    query = st.text_input("Ask a question:")
    if query:
        res = qa_chain.invoke({"question": query})
        st.markdown("### 🤖 Answer")
        st.write(res["answer"])
        if "source_documents" in res:
            st.markdown("#### 📂 Sources")
            for i,doc in enumerate(res["source_documents"]):
                meta = doc.metadata
                name = meta.get("candidate_name") or meta.get("jd_name") or meta.get("source_file","—")
                st.markdown(f"{i+1}. [{meta.get('doc_type','N/A').upper()}] {name}")
                with st.expander("📝 Preview"):
                    st.write(doc.page_content[:200] + "...")
    st.markdown("### 📝 Conversation Summary")
    st.write(summary_memory.buffer)
    st.markdown("### 🗂️ Full Chat History")
    for msg in summary_memory.chat_memory.messages:
        role = "🧑‍💻 You" if msg.type=="human" else "🤖 Bot"
        st.markdown(f"**{role}:** {msg.content}")

# ─── SUMMARIZE MODE: Text Summarization ────────────────────────────────────────
else:  # mode == "Summarize Text"
    st.header("📝 Smart Summarizer (FLAN-T5)")
    input_text = st.text_area("Paste text to summarize:", height=200)
    min_len    = st.slider("Min summary length", 10, 150, 30)
    max_len    = st.slider("Max summary length", 50, 300, 100)
    if st.button("🧠 Generate Summary"):
        if input_text.strip():
            with st.spinner("Summarizing..."):
                out = summarizer("summarize: " + input_text,
                                 max_length=max_len, min_length=min_len, do_sample=False)
                st.markdown("### ✨ Summary")
                st.success(out[0]["summary_text"])
        else:
            st.warning("Please enter text to summarize.")