<a href="https://colab.research.google.com/github/sravs-2211/sravs-final-project/blob/main/final3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Part 1: Setup & Imports

import re
import io
import tempfile
from typing import List, Tuple, Dict, Any

import pandas as pd
import numpy as np
import streamlit as st

# Text extraction
import fitz  # PyMuPDF
import docx2txt

# NLP
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Cache models
@st.cache_resource(show_spinner=False)
def load_spacy_model():
    """Load and return the `en_core_web_sm` spaCy pipeline.

    Wrapped in `st.cache_resource` with the spinner disabled.
    """
    model_name = "en_core_web_sm"
    pipeline = spacy.load(model_name)
    return pipeline

@st.cache_resource(show_spinner=False)
def load_sentence_model():
    """Build and return the `all-MiniLM-L6-v2` SentenceTransformer.

    Wrapped in `st.cache_resource` with the spinner disabled.
    """
    model_name = "all-MiniLM-L6-v2"
    sentence_model = SentenceTransformer(model_name)
    return sentence_model

# Module-level model handles, created once through the cached loaders above.
# `nlp` is used by extract_keywords_from_jd for part-of-speech tagging;
# `embedder` is used by compute_bert_score to embed the JD and resume text.
nlp = load_spacy_model()
embedder = load_sentence_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Part 2: Resume & JD Text Extraction

# --- PDF Extraction ---
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_bytes: Raw content of the PDF document.

    Returns:
        The per-page text (PyMuPDF "text" mode) concatenated with "\n".
    """
    with fitz.open(stream=file_bytes, filetype="pdf") as pdf:
        page_texts = [page.get_text("text") for page in pdf]
    return "\n".join(page_texts)

# --- DOCX Extraction ---
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract plain text from the raw bytes of a .docx document.

    The bytes are handed to docx2txt through an in-memory buffer rather than a
    named temporary file: a NamedTemporaryFile that is still open cannot be
    reopened by name on Windows, and the disk round-trip is not needed because
    docx2txt opens its argument as a zip archive, which accepts file-like
    objects.

    Args:
        file_bytes: Raw content of the uploaded .docx file.

    Returns:
        The extracted text, or "" when docx2txt returns nothing.
    """
    return docx2txt.process(io.BytesIO(file_bytes)) or ""

# --- TXT Extraction ---
def extract_text_from_txt(file_bytes: bytes) -> str:
    """Decode raw bytes as UTF-8, silently dropping undecodable bytes.

    If decoding itself raises (for example the argument is not a bytes-like
    object with a `decode` method), the value is converted with `str()`.

    Args:
        file_bytes: Raw file content.

    Returns:
        The decoded text.
    """
    try:
        decoded = file_bytes.decode("utf-8", errors="ignore")
    except Exception:
        decoded = str(file_bytes)
    return decoded

# --- Generic File Extraction ---
def extract_text_from_file(uploaded_file) -> Tuple[str, str]:
    """Read an uploaded file and return `(filename, extracted_text)`.

    The extractor is chosen from the (case-insensitive) file extension:
    `.pdf` and `.docx` have dedicated extractors, everything else is decoded
    as plain text. Any exception raised during extraction is reported with
    `st.warning` and results in an empty text.

    Args:
        uploaded_file: Object exposing `.name` and `.read()`.

    Returns:
        Tuple of the original file name and the extracted text ("" on failure).
    """
    name = uploaded_file.name
    data = uploaded_file.read()
    lower = name.lower()

    try:
        if lower.endswith(".pdf"):
            extractor = extract_text_from_pdf
        elif lower.endswith(".docx"):
            extractor = extract_text_from_docx
        else:
            # ".txt" and every unknown extension share the plain-text path.
            extractor = extract_text_from_txt
        text = extractor(data)
    except Exception as e:
        st.warning(f"❌ Failed to extract text from {name}: {e}")
        text = ""

    return name, text

In [12]:
# Part 3: Text Cleaning & Contact Info Extraction

# --- Text Cleaning ---
def clean_text(text: str) -> str:
    """Normalise whitespace and strip junk characters from extracted text.

    Steps, in order:
      1. remove NUL bytes,
      2. normalise "\r\n" and lone "\r" line endings to "\n",
      3. collapse runs of three or more newlines into a blank line,
      4. collapse runs of spaces/tabs into a single space,
      5. strip leading and trailing whitespace.

    NUL bytes are removed *first* so that whitespace on either side of a NUL
    is seen as one run and collapsed; removing them last would leave doubled
    spaces or newlines behind.

    Args:
        text: Raw text; falsy values (None, "") are accepted.

    Returns:
        The cleaned text, or "" for falsy input.
    """
    if not text:
        return ""
    txt = text.replace("\x00", "")                        # drop null bytes before collapsing
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")   # unify line endings
    txt = re.sub(r"\n{3,}", "\n\n", txt)                  # collapse too many newlines
    txt = re.sub(r"[ \t]{2,}", " ", txt)                  # collapse multiple spaces/tabs
    return txt.strip()

# --- Contact Info Extraction ---
# Pre-compiled patterns used by extract_contact_info.
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Digits optionally separated by up to two of "-", ".", space or parentheses,
# e.g. "(555) 123-4567" or "+91 98765 43210". Newlines are deliberately not
# separators so a number cannot merge with digits on the following line.
_PHONE_GROUPED_RE = re.compile(r"(?<![\w.])\+?\(?\d(?:[-. ()]{0,2}\d)+(?![\w@])")
# Original pattern: optional prefixes followed by 6-12 contiguous digits.
_PHONE_PLAIN_RE = re.compile(r"(\+?\d{1,3}[-.\s]?)?(\(?\d{2,4}\)?[-.\s]?)?\d{6,12}")
_NAME_RE = re.compile(r"^[A-Z][a-z]+(\s[A-Z][a-z]+)+")


def _find_phone(text: str) -> str:
    """Return the first phone-like substring of `text`, or "" if none is found."""
    for candidate in _PHONE_GROUPED_RE.finditer(text):
        digit_count = sum(ch.isdigit() for ch in candidate.group(0))
        # 10-15 digits keeps real numbers and rejects things like "2019-2021".
        if 10 <= digit_count <= 15:
            return candidate.group(0)
    # Fall back to the contiguous-digit pattern for shorter, unformatted numbers.
    plain = _PHONE_PLAIN_RE.search(text)
    return plain.group(0) if plain else ""


def extract_contact_info(text: str) -> Dict[str, str]:
    """Heuristically pull an email address, phone number and name out of text.

    * email: first regex match of a `local@domain.tld` shape.
    * phone: first formatted number with 10-15 digits (separators such as
      spaces, dashes, dots and parentheses are allowed); if there is none,
      the first run matched by the contiguous-digit pattern is used.
    * name: the first of the first ten non-empty lines that has 2-4 words and
      starts with at least two capitalised words.

    Args:
        text: Resume text; falsy values yield all-empty fields.

    Returns:
        Dict with the keys "email", "phone" and "name"; each value is "" when
        nothing was detected.
    """
    info = {"email": "", "phone": "", "name": ""}
    if not text:
        return info

    email_match = _EMAIL_RE.search(text)
    if email_match:
        info["email"] = email_match.group(0)

    info["phone"] = _find_phone(text)

    # Heuristic: candidate name -> first line with 2-4 title-cased words
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    for ln in lines[:10]:  # only the top of the document is considered
        if 2 <= len(ln.split()) <= 4 and _NAME_RE.match(ln):
            info["name"] = ln
            break

    return info

In [13]:
# Part 4: Job Description Keyword Extraction

# --- Predefined Skill Dictionary (can expand as needed) ---
# Lowercase skill names and phrases. extract_keywords_from_jd checks each entry
# against the lowercased job-description text, so new entries must be lowercase too.
SKILL_KEYWORDS = {
    "python", "java", "c++", "sql", "javascript", "html", "css",
    "machine learning", "deep learning", "nlp", "tensorflow", "pytorch",
    "data analysis", "data science", "pandas", "numpy", "scikit-learn",
    "excel", "powerbi", "tableau", "docker", "kubernetes", "git",
    "cloud", "aws", "azure", "gcp", "linux"
}

def extract_keywords_from_jd(jd_text: str) -> Dict[str, Any]:
    """Extract candidate keywords and known skills from a job description.

    Keywords are the distinct non-stop-word tokens that spaCy tags as NOUN or
    PROPN in the lowercased text. Skills are the entries of SKILL_KEYWORDS that
    occur in the text as whole terms: a skill only matches when it is not
    directly preceded or followed by a word character, so "excel" is not found
    in "excellent" and "java" is not found in "javascript". Lookarounds are
    used instead of `\\b` so that skills ending in punctuation ("c++") still
    match.

    Args:
        jd_text: Job-description text; None is treated as "".

    Returns:
        Dict with
          "all_keywords":   sorted list of distinct noun/proper-noun tokens,
          "skills_matched": sorted list of matched SKILL_KEYWORDS entries.
        Both lists are sorted so the result is deterministic across runs.
    """
    jd_lower = (jd_text or "").lower()  # lowercased once, reused for every check
    doc = nlp(jd_lower)

    # Candidate keywords (nouns, proper nouns)
    tokens = {
        token.text for token in doc
        if token.pos_ in {"NOUN", "PROPN"} and not token.is_stop
    }

    # Match predefined skills on term boundaries rather than raw substrings
    skills_found = [
        skill for skill in sorted(SKILL_KEYWORDS)
        if re.search(rf"(?<!\w){re.escape(skill)}(?!\w)", jd_lower)
    ]

    return {
        "all_keywords": sorted(tokens),
        "skills_matched": skills_found
    }

In [14]:
# Part 5: Candidate Scoring Logic

# --- TF-IDF Similarity ---
def compute_tfidf_score(resume_text: str, jd_text: str) -> float:
    """Cosine similarity of the TF-IDF vectors of a resume and a JD, as 0-100.

    Both texts are vectorised together with English stop words removed.

    Args:
        resume_text: Resume text.
        jd_text: Job-description text.

    Returns:
        Similarity percentage rounded to two decimals, as a plain float.
        0.0 is returned when either text is empty/whitespace-only or when the
        two texts contain no usable terms (e.g. only stop words), instead of
        letting the vectoriser raise.
    """
    if not (resume_text and resume_text.strip() and jd_text and jd_text.strip()):
        return 0.0

    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf_matrix = vectorizer.fit_transform([jd_text, resume_text])
    except ValueError:
        # Raised by the vectoriser when the vocabulary ends up empty.
        return 0.0

    sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return round(float(sim) * 100, 2)  # scale to 0–100

# --- BERT Embedding Similarity ---
def compute_bert_score(resume_text: str, jd_text: str) -> float:
    """Cosine similarity of sentence embeddings of a resume and a JD, scaled by 100.

    Args:
        resume_text: Resume text.
        jd_text: Job-description text.

    Returns:
        Similarity percentage rounded to two decimals, as a plain float.
        0.0 is returned when either text is empty/whitespace-only, so a resume
        whose text could not be extracted is not given a spurious similarity.
    """
    if not (resume_text and resume_text.strip() and jd_text and jd_text.strip()):
        return 0.0

    # Ask for NumPy output directly; scikit-learn's cosine_similarity needs
    # arrays, so a tensor round-trip through .cpu().numpy() is unnecessary.
    embeddings = embedder.encode([jd_text, resume_text], convert_to_numpy=True)
    sim = cosine_similarity(embeddings[0:1], embeddings[1:2])[0][0]
    # float() converts the NumPy scalar so the declared return type holds.
    return round(float(sim) * 100, 2)

# --- Combined Candidate Scoring ---
def score_candidate(resume_text: str, jd_text: str, jd_skills: List[str]) -> Dict[str, Any]:
    """Score one resume against a job description.

    The final score is a weighted blend: 40% TF-IDF similarity, 40% sentence
    embedding similarity and 20% share of the JD skills found in the resume.

    Skills are matched as whole terms (not directly preceded or followed by a
    word character), so "java" is not counted for a resume that only mentions
    "javascript". Lookarounds are used instead of `\\b` so that skills ending
    in punctuation ("c++") still match.

    Args:
        resume_text: Raw resume text; it is cleaned here before scoring.
        jd_text: Job-description text (used as given).
        jd_skills: Skills required by the JD.

    Returns:
        Dict with "name" ("Unknown" when no name was detected), "email",
        "phone", "tfidf_score", "bert_score", "skill_score", "final_score"
        and "skills_matched" (the subset of `jd_skills` found in the resume).
    """
    cleaned_resume = clean_text(resume_text)
    contact_info = extract_contact_info(cleaned_resume)

    # Similarity scores
    tfidf_score = compute_tfidf_score(cleaned_resume, jd_text)
    bert_score = compute_bert_score(cleaned_resume, jd_text)

    # Skill match percentage
    resume_lower = cleaned_resume.lower()
    matched = [
        skill for skill in jd_skills
        if re.search(rf"(?<!\w){re.escape(skill.lower())}(?!\w)", resume_lower)
    ]
    skill_score = round((len(matched) / len(jd_skills)) * 100, 2) if jd_skills else 0.0

    # Weighted final score (40% TF-IDF, 40% BERT, 20% Skills)
    final_score = round((0.4 * tfidf_score) + (0.4 * bert_score) + (0.2 * skill_score), 2)

    return {
        # The "name" key is always present (possibly ""), so a .get() default
        # would never apply; `or` supplies the placeholder for empty names.
        "name": contact_info.get("name") or "Unknown",
        "email": contact_info.get("email", ""),
        "phone": contact_info.get("phone", ""),
        "tfidf_score": tfidf_score,
        "bert_score": bert_score,
        "skill_score": skill_score,
        "final_score": final_score,
        "skills_matched": matched
    }

In [15]:
# Part 6: Streamlit Dashboard

def main():
    """Render the Streamlit dashboard: collect a JD and resumes, then rank them.

    Flow:
      1. Read the job description from an uploaded file, falling back to the
         pasted text when no file is given or the file yields no text.
      2. Show the skills detected in the JD in the sidebar.
      3. On "Process Resumes", extract and score every uploaded resume,
         skipping (with a warning) files that contain no readable text.
      4. Show the ranking, highlight the top candidate and offer a CSV download.
    """
    st.set_page_config(page_title="Smart Resume Screening System", layout="wide")
    st.title("📄 Smart Resume Screening System")
    st.markdown("Upload resumes and a job description to rank candidates.")

    # --- Job Description Input ---
    st.sidebar.header("Job Description")
    jd_file = st.sidebar.file_uploader("Upload JD file (PDF/DOCX/TXT)", type=["pdf", "docx", "txt"])
    jd_text_area = st.sidebar.text_area("Or paste JD here")

    jd_text = ""
    if jd_file:
        _, jd_text = extract_text_from_file(jd_file)
    # Use the pasted text when there is no file or the file produced no text.
    if not jd_text.strip() and jd_text_area:
        jd_text = jd_text_area

    # Clean before the emptiness check so a whitespace-only JD is rejected too.
    jd_text = clean_text(jd_text)
    if not jd_text:
        st.warning("⚠️ Please upload or paste a job description to continue.")
        return

    jd_keywords = extract_keywords_from_jd(jd_text)

    st.sidebar.subheader("Extracted Skills from JD:")
    st.sidebar.write(", ".join(jd_keywords["skills_matched"]) or "None")

    # --- Resume Upload ---
    st.header("Upload Resumes")
    resume_files = st.file_uploader(
        "Upload multiple resumes", type=["pdf", "docx", "txt"], accept_multiple_files=True
    )

    if st.button("Process Resumes") and resume_files:
        results = []

        for file in resume_files:
            fname, text = extract_text_from_file(file)
            if not text.strip():
                # e.g. scanned PDFs or files whose extraction failed
                st.warning(f"⚠️ No readable text found in {fname}; skipping.")
                continue
            candidate = score_candidate(text, jd_text, jd_keywords["skills_matched"])
            candidate["filename"] = fname
            results.append(candidate)

        if not results:
            st.error("❌ None of the uploaded resumes contained readable text.")
            return

        # reset_index so the displayed row index follows the ranking order
        df = (
            pd.DataFrame(results)
            .sort_values(by="final_score", ascending=False)
            .reset_index(drop=True)
        )

        # --- Show Results ---
        st.subheader("📊 Candidate Ranking")
        st.dataframe(df[["filename", "name", "email", "phone", "final_score", "skills_matched"]])

        # Highlight Top Candidate (fall back to the file name when no name was detected)
        top = df.iloc[0]
        top_label = top["name"] if top["name"] not in ("", "Unknown") else top["filename"]
        st.success(f"🏆 Top Candidate: **{top_label}** with score {top['final_score']}")

        # Download CSV
        csv = df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "📥 Download Results as CSV",
            data=csv,
            file_name="resume_screening_results.csv",
            mime="text/csv",
        )

    elif not resume_files:
        st.info("ℹ️ Upload resumes to process.")

# Run app
# Entry point for script execution. The app is meant to be started with
# `streamlit run <script>`; the captured output of this cell shows Streamlit
# printing that command and a session-state warning when run from the notebook kernel.
if __name__ == "__main__":
    main()

2025-09-26 04:46:05.931 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-09-26 04:46:05.949 Session state does not function when running a script without `streamlit run`


In [8]:
# DOCX text-extraction dependency (imported as `docx2txt` in the setup cell); run before the imports cell.
%pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9


In [4]:
# PDF text-extraction dependency (imported as `fitz` in the setup cell); run before the imports cell.
%pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m81.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.4


In [6]:
# Dashboard dependency (imported as `st` in the setup cell); run before the imports cell.
%pip install streamlit

