In [None]:
pip install pymupdf pillow pytesseract scikit-learn

In [None]:
import fitz              # PyMuPDF
import pytesseract
from PIL import Image
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import sys


# -------------------------------------------------
# Clean text
# -------------------------------------------------
def clean_text(text):
    """Normalize raw text into lowercase, space-separated alphanumeric tokens.

    Every character other than an ASCII letter, digit or space is treated as
    a separator, so punctuation, newlines and non-ASCII symbols never reach
    the output.

    Args:
        text: Raw text to normalize.

    Returns:
        The lowercased text with separators collapsed to single spaces and
        no leading or trailing whitespace.
    """
    spaced = re.sub(r"[^a-zA-Z0-9 ]", " ", text.lower())
    # The only whitespace left at this point is the plain space, so
    # split()/join() collapses runs and trims both ends in one step.
    return " ".join(spaced.split())


# -------------------------------------------------
# Extract text from PDF
# -------------------------------------------------
def extract_from_pdf(path):
    """Extract and normalize the text of every page in a PDF file.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages, joined by newlines and passed through
        ``clean_text``.
    """
    # Open the document as a context manager so it is closed even when text
    # extraction fails part-way through.
    with fitz.open(path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    return clean_text(text)


# -------------------------------------------------
# Extract text from image (OCR)
# -------------------------------------------------
def extract_from_image(path):
    """Run OCR on an image file and return the normalized text.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        The text recognized by Tesseract, passed through ``clean_text``.
    """
    # The context manager closes the underlying image file once OCR is done
    # instead of leaving the handle open.
    with Image.open(path) as img:
        text = pytesseract.image_to_string(img)
    return clean_text(text)


# -------------------------------------------------
# Extract text from TXT file
# -------------------------------------------------
def extract_from_text(path):
    """Read a plain-text file and return its normalized contents.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    rather than raising.

    Args:
        path: Filesystem path of the text file to read.

    Returns:
        The file contents passed through ``clean_text``.
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as handle:
        raw = handle.read()
    return clean_text(raw)


# -------------------------------------------------
# Detect file type + route to correct extractor
# -------------------------------------------------
def extract_resume(path):
    """Extract resume text, choosing the extractor from the file extension.

    The extension check is case-insensitive, so ``CV.PDF`` and ``cv.pdf``
    are handled the same way.

    Args:
        path: Filesystem path of the resume (PDF, PNG/JPG/JPEG image or TXT).

    Returns:
        The cleaned resume text.

    Raises:
        SystemExit: With exit code 1 when the extension is not supported.
    """
    # Match on a lowercased copy; the original path is what gets opened.
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        return extract_from_pdf(path)

    if lowered.endswith((".png", ".jpg", ".jpeg")):
        return extract_from_image(path)

    if lowered.endswith(".txt"):
        return extract_from_text(path)

    print("❌ Unsupported resume file type.")
    sys.exit(1)


def extract_job_description(path):
    """Extract job-description text from a PDF or plain-text file.

    Args:
        path: Filesystem path of the job description.

    Returns:
        The cleaned text. Paths ending in ``.pdf`` (in any letter case) are
        parsed as PDF; every other path is read as plain text.
    """
    # Case-insensitive check so "JD.PDF" is not misread as a text file.
    if path.lower().endswith(".pdf"):
        return extract_from_pdf(path)
    return extract_from_text(path)


# -------------------------------------------------
# Calculate ATS Score
# -------------------------------------------------
def calculate_ats_score(resume_text, job_text):
    """Score how closely a resume matches a job description.

    The score is the cosine similarity between the word-count vectors of the
    two texts (English stop words excluded), expressed as a percentage.

    Args:
        resume_text: Cleaned resume text.
        job_text: Cleaned job-description text.

    Returns:
        A ``(score, missing)`` tuple. ``score`` is the similarity percentage
        rounded to two decimals. ``missing`` is the alphabetically sorted
        list of job-description words that do not occur in the resume, with
        English stop words left out.
    """
    cv = CountVectorizer(stop_words="english")

    try:
        vectors = cv.fit_transform([resume_text, job_text])
    except ValueError:
        # Raised when neither text yields a usable token (both empty or made
        # up of stop words only); nothing can overlap, so the score is 0.
        similarity = 0.0
    else:
        similarity = cosine_similarity(vectors[0], vectors[1])[0][0]

    score = round(float(similarity) * 100, 2)

    # Stop words are ignored by the score, so listing them as "missing"
    # would only add noise. Sorting keeps the output stable between runs.
    ignored = cv.get_stop_words() or frozenset()
    missing = sorted(set(job_text.split()) - set(resume_text.split()) - ignored)

    return score, missing


# -------------------------------------------------
# Main program
# -------------------------------------------------
if __name__ == "__main__":
    # Interactive command-line flow: ask for both files, extract their text,
    # then report the match score and the job keywords the resume lacks.
    # Surrounding whitespace is stripped because pasted paths often carry it.
    resume_path = input("Enter path to resume (pdf/image/txt): ").strip()
    job_path = input("Enter path to job description (pdf/txt): ").strip()

    print("\nExtracting resume text...")
    resume_text = extract_resume(resume_path)

    print("Extracting job description text...")
    job_text = extract_job_description(job_path)

    print("\nCalculating ATS score...")
    score, missing_keywords = calculate_ats_score(resume_text, job_text)

    print("\n-----------------------------------------")
    print(f"✅ ATS Score: {score}%")
    print("-----------------------------------------")

    print("\n❗ Missing Keywords:")
    if missing_keywords:
        print(", ".join(missing_keywords))
    else:
        print("No missing keywords! Great match 🎉")

    print("\nDone.")


## Streamlit code

Below is a **complete Streamlit project** you can copy-paste and run.
It will:

✔ Upload a **resume** (PDF, image, or text)
✔ Upload a **job description**
✔ Extract text from resume using **PyMuPDF** (PDF) + **Tesseract OCR** (images) + plain text
✔ Compare both texts
✔ Give an **ATS score** based on keyword match
✔ Show missing keywords

---

# ✅ **Streamlit ATS Resume Matcher — Full Project**

## 📁 **Project Structure**

```
ats_app/
│
├── app.py
├── requirements.txt
└── README.md  (optional)
```

---

# 📌 **requirements.txt**

```
streamlit
pymupdf
pytesseract
pillow
numpy
scikit-learn
```

> **Important:** You must install Tesseract OCR externally:

* Windows: [https://github.com/UB-Mannheim/tesseract/wiki](https://github.com/UB-Mannheim/tesseract/wiki)
* Mac: `brew install tesseract`
* Linux: `sudo apt install tesseract-ocr`

---

# 📌 **app.py (FULL WORKING STREAMLIT APP)**

```python
import streamlit as st
import fitz  # PyMuPDF
from PIL import Image
import pytesseract
import io
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Set the page title and select the wide layout for the app.
st.set_page_config(page_title="ATS Resume Matcher", layout="wide")


# ------------------------------------
# TEXT CLEANING
# ------------------------------------
def clean_text(text):
    """Normalize raw text into lowercase, space-separated alphanumeric tokens.

    Args:
        text: Raw text to normalize.

    Returns:
        The lowercased text in which every character other than an ASCII
        letter, digit or space has been turned into a separator, with
        separators collapsed to single spaces and both ends trimmed.
    """
    # Newlines and tabs fall outside the allowed character class, so this
    # single substitution turns them into spaces along with punctuation.
    separated = re.sub(r"[^a-zA-Z0-9 ]", " ", text.lower())
    tokens = separated.split()
    return " ".join(tokens)


# ------------------------------------
# EXTRACT TEXT FROM PDF
# ------------------------------------
def extract_from_pdf(uploaded_file):
    """Extract and normalize the text of every page in an uploaded PDF.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the PDF
            bytes.

    Returns:
        The text of all pages, joined by newlines and passed through
        ``clean_text``.
    """
    # Open the in-memory document as a context manager so it is closed even
    # when text extraction fails part-way through.
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        full_text = "\n".join(page.get_text("text") for page in doc)

    return clean_text(full_text)


# ------------------------------------
# EXTRACT TEXT FROM IMAGE (OCR)
# ------------------------------------
def extract_from_image(uploaded_file):
    """Run OCR on an uploaded image and return the normalized text.

    Args:
        uploaded_file: File-like object accepted by ``PIL.Image.open``.

    Returns:
        The text recognized by Tesseract, passed through ``clean_text``.
    """
    recognized = pytesseract.image_to_string(Image.open(uploaded_file))
    return clean_text(recognized)


# ------------------------------------
# EXTRACT TEXT FROM TXT FILE
# ------------------------------------
def extract_from_text(uploaded_file):
    """Decode an uploaded plain-text file and return its normalized contents.

    The bytes are decoded as UTF-8; undecodable bytes are dropped rather
    than raising.

    Args:
        uploaded_file: File-like object whose ``read()`` returns bytes.

    Returns:
        The decoded contents passed through ``clean_text``.
    """
    raw_bytes = uploaded_file.read()
    return clean_text(raw_bytes.decode("utf-8", errors="ignore"))


# ------------------------------------
# CALCULATE ATS SCORE
# ------------------------------------
def calculate_ats_score(resume_text, job_text):
    """Score how closely a resume matches a job description.

    The score is the cosine similarity between the word-count vectors of the
    two texts (English stop words excluded), expressed as a percentage.

    Args:
        resume_text: Cleaned resume text.
        job_text: Cleaned job-description text.

    Returns:
        A ``(score, missing_keywords)`` tuple. ``score`` is the similarity
        percentage rounded to two decimals. ``missing_keywords`` is the
        alphabetically sorted list of job-description words that do not
        occur in the resume, with English stop words left out.
    """
    cv = CountVectorizer(stop_words='english')

    try:
        word_count_vector = cv.fit_transform([resume_text, job_text])
    except ValueError:
        # Raised when neither text yields a usable token (both empty or made
        # up of stop words only); nothing can overlap, so the score is 0.
        similarity = 0.0
    else:
        similarity = cosine_similarity(word_count_vector[0], word_count_vector[1])[0][0]

    score = round(float(similarity) * 100, 2)

    # Stop words are ignored by the score, so listing them as "missing"
    # would only add noise. Sorting keeps the output stable between reruns.
    stop_words = cv.get_stop_words() or frozenset()
    resume_words = set(resume_text.split())
    job_words = set(job_text.split())
    missing_keywords = sorted(job_words - resume_words - stop_words)

    return score, missing_keywords


# ------------------------------------
# STREAMLIT UI
# ------------------------------------
st.title("📄 ATS Resume Matcher")
st.write("Upload a resume + job description to compute your ATS match score.")

col1, col2 = st.columns(2)

with col1:
    resume_file = st.file_uploader("Upload Resume (PDF/Image/TXT)", type=["pdf", "png", "jpg", "jpeg", "txt"])

with col2:
    job_file = st.file_uploader("Upload Job Description (TXT/PDF)", type=["txt", "pdf"])

# Nothing below runs until both uploads are present.
if resume_file and job_file:
    st.subheader("📥 Extracting Text...")

    # Route each upload to an extractor based on its MIME type; anything
    # that is neither a PDF nor an image is read as plain text.
    if resume_file.type == "application/pdf":
        resume_text = extract_from_pdf(resume_file)
    elif resume_file.type.startswith("image"):
        resume_text = extract_from_image(resume_file)
    else:
        resume_text = extract_from_text(resume_file)

    if job_file.type == "application/pdf":
        job_text = extract_from_pdf(job_file)
    else:
        job_text = extract_from_text(job_file)

    if not resume_text or not job_text:
        # Extraction can come back empty (for example when OCR recognizes
        # nothing); scoring empty text would be meaningless, so report the
        # problem instead of claiming success.
        st.error(
            "No readable text could be extracted from the resume or the job "
            "description. Please upload a different file."
        )
    else:
        st.success("Text extracted successfully!")

        # Show a preview of the extracted text (first 3000 characters).
        with st.expander("📄 Extracted Resume Text"):
            st.write(resume_text[:3000])

        with st.expander("📝 Extracted Job Description"):
            st.write(job_text[:3000])

        # Calculate ATS Score
        st.subheader("📊 ATS Match Score")
        score, missing_keywords = calculate_ats_score(resume_text, job_text)

        st.metric("Your ATS Score", f"{score}%")

        st.subheader("❗ Missing Keywords")
        if missing_keywords:
            st.write(", ".join(missing_keywords))
        else:
            st.write("Great! No missing keywords 🎉")


# Footer, rendered whether or not files have been uploaded.
st.markdown("---")
st.write("Built with ❤️ using Streamlit + PyMuPDF + OCR")


```

---

# 🚀 **How to Run the App**

### 1️⃣ Install dependencies

```bash
pip install -r requirements.txt
```

### 2️⃣ Run Streamlit app

```bash
streamlit run app.py
```

---

# 🎯 WANT MORE FEATURES?

I can add:

✅ Keyword weighting
✅ PDF → DOCX conversion
✅ Resume formatting score
✅ AI-based rewrite suggestions
✅ Section detection (skills, experience, summary)
✅ GDPR-safe anonymizer
✅ Multi-job comparison

Just tell me!
