In [1]:
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
import fitz  # PyMuPDF
import spacy

# 📦 Load model and tokenizer
model_dir = r"C:\Users\sagni\Downloads\Resume Selector\resume_model"
tokenizer = BertTokenizerFast.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 🔥 Define label mapping
id2label = {0: 'Data Science', 1: 'HR', 2: 'Advocate', 3: 'Arts', 4: 'Web Designing',
            5: 'Mechanical Engineer', 6: 'Sales', 7: 'Health and fitness', 8: 'Civil Engineer',
            9: 'Java Developer', 10: 'Business Analyst', 11: 'SAP Developer',
            12: 'Automation Testing', 13: 'Electrical Engineering', 14: 'Operations Manager',
            15: 'Python Developer', 16: 'DevOps Engineer', 17: 'Network Security Engineer',
            18: 'PMO', 19: 'Database', 20: 'Hadoop', 21: 'ETL Developer',
            22: 'DotNet Developer', 23: 'Blockchain', 24: 'Testing'}

# 🗝️ Role-specific keywords
role_keywords = {
    'Data Science': ['python', 'machine learning', 'data analysis', 'pandas', 'scikit-learn', 'tensorflow', 'statistics', 'deep learning'],
    'Python Developer': ['python', 'flask', 'django', 'api', 'sql', 'algorithms', 'oop', 'git'],
    'Java Developer': ['java', 'spring', 'hibernate', 'j2ee', 'maven', 'rest api', 'microservices'],
    'DevOps Engineer': ['docker', 'kubernetes', 'ci/cd', 'jenkins', 'aws', 'terraform', 'ansible'],
    'Web Designing': ['html', 'css', 'javascript', 'responsive design', 'bootstrap'],
    'Automation Testing': ['selenium', 'pytest', 'test automation', 'ci/cd', 'jmeter'],
    # Add more roles & keywords as needed
}

# 🧹 Preprocess function
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    doc = nlp(str(text).lower())
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

# 📄 Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text.strip()

# 🔮 Predict and analyze
def predict_resume_from_pdf(pdf_path, threshold=0.85):
    text = extract_text_from_pdf(pdf_path)
    if not text:
        print("❌ No text found in the PDF!")
        return
    
    print("📄 Extracted Resume Text (first 500 chars):\n", text[:500], "...\n")

    # Preprocess & tokenize
    clean_text = preprocess(text)
    encoding = tokenizer(clean_text, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
    encoding = {k: v.to(device) for k, v in encoding.items()}

    # Prediction
    with torch.no_grad():
        logits = model(**encoding).logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
        pred_idx = torch.argmax(torch.tensor(probs)).item()
        confidence = probs[pred_idx]
        predicted_category = id2label[pred_idx]

    print(f"🔮 Predicted Category: {predicted_category}")
    print(f"📈 Confidence Score: {confidence * 100:.2f}%")

    # 🎯 Keyword analysis
    if predicted_category in role_keywords:
        resume_words = set(clean_text.split())
        keywords = set(role_keywords[predicted_category])
        present_keywords = resume_words & keywords
        missing_keywords = keywords - resume_words

        print(f"\n✅ Found keywords ({len(present_keywords)}): {', '.join(present_keywords) if present_keywords else 'None'}")
        if missing_keywords:
            print(f"⚠️ Missing important keywords ({len(missing_keywords)}): {', '.join(missing_keywords)}")
        else:
            print("🎉 All key skills for this role are present!")
    else:
        print("ℹ️ No keyword analysis available for this role.")

    # 💡 Suggestion if confidence is low
    if confidence < threshold:
        print("\n⚠️ Suggestion: Confidence is below threshold.")
        print("🔧 Consider adding more role-specific keywords or projects to improve prediction.")
    else:
        print("✅ Resume looks strong for this role!")

# 📥 Example Usage
pdf_resume_path = r"C:\Users\sagni\Downloads\Resume NextWave\Resume.pdf"
predict_resume_from_pdf(pdf_resume_path)


📄 Extracted Resume Text (first 500 chars):
 Sagnik Patra
 +91-8972252624
# sagnik.patra2000@gmail.com
§ github.com/sagnik1-patra
Master of Technology
ï linkedin.com/in/sagnik2212
Indian Institute of Information Technology, Tiruchirappalli
Education
•Bachelor of Technology in Computer Science and Engineering
2018 – 2022
Jalpaiguri Government Engineering College
CGPA: 8.21
•Master of Technology in Computer Science and Engineering
2023 – 2025
Indian Institute of Information Technology, Tiruchirappalli
CGPA: 8.06
Personal Projects
•Locating  ...

🔮 Predicted Category: Python Developer
📈 Confidence Score: 34.67%

✅ Found keywords (5): git, flask, api, python, oop
⚠️ Missing important keywords (3): sql, django, algorithms

⚠️ Suggestion: Confidence is below threshold.
🔧 Consider adding more role-specific keywords or projects to improve prediction.
