In [2]:
import os

# Create the project folder that every generated file below is written into.
# NOTE(review): the path is hard-coded to one user's Windows Downloads
# folder - adjust it before running this cell on another machine.
project_dir = r"C:\Users\sagni\Downloads\Resume Selector"
os.makedirs(project_dir, exist_ok=True)  # exist_ok: re-running the cell is harmless

#  Training script
# Source of train_resume_model.py, kept as a (non-raw) string template.
# Backslashes meant for the generated file are doubled here, and the embedded
# code uses ''' / # for documentation so it cannot terminate this literal.
# The generated script fine-tunes bert-base-uncased on the resume CSV, prints
# a classification report and saves the model + tokenizer to disk.
train_code = """
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy

#  Paths
data_path = r"C:\\Users\\sagni\\Downloads\\Resume Selector\\UpdatedResumeDataSet.csv"
model_save_path = r"C:\\Users\\sagni\\Downloads\\Resume Selector\\resume_model"

#  Load dataset
df = pd.read_csv(data_path, encoding='utf-8')
# Build a new frame rather than calling dropna(inplace=True) on a column
# slice, so the column assignments below operate on an independent frame.
df = df[['Category', 'Resume']].dropna().reset_index(drop=True)
print(f"Dataset shape: {df.shape}")

#  Encode labels
# Plain Python strings (not numpy scalars) so the mapping can be stored in
# the model config and serialised to JSON.
labels = [str(category) for category in df['Category'].unique()]
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
df['label'] = df['Category'].map(label2id)

#  Text preprocessing
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    '''Lower-case the text and return its lemmas without stop words or punctuation.'''
    doc = nlp(str(text).lower())
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

df['clean_resume'] = df['Resume'].apply(preprocess)

#  Dataset class
class ResumeDataset(Dataset):
    '''Tokenises one resume per item and pairs it with its integer label.'''

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encodings = self.tokenizer(self.texts.iloc[idx], truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        # return_tensors='pt' adds a leading batch dimension; drop it so the
        # DataLoader can stack items itself.
        features = {key: val.squeeze(0) for key, val in encodings.items()}
        return features, torch.tensor(int(self.labels.iloc[idx]), dtype=torch.long)

#  Tokenizer & Model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# Storing id2label/label2id in the config keeps the category names with the
# saved checkpoint, so inference code does not have to guess the class order.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

#  Split data
X_train, X_test, y_train, y_test = train_test_split(df['clean_resume'], df['label'], test_size=0.2, random_state=42)
train_dataset = ResumeDataset(X_train, y_train, tokenizer)
test_dataset = ResumeDataset(X_test, y_test, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4)

#  Training
# torch.optim.AdamW replaces the AdamW class that transformers deprecated.
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 2

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs, labels_batch = batch
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels_batch = labels_batch.to(device)

        outputs = model(**inputs, labels=labels_batch)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_loader):.4f}")

#  Evaluation
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch in test_loader:
        inputs, labels_batch = batch
        inputs = {k: v.to(device) for k, v in inputs.items()}
        logits = model(**inputs).logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(labels_batch.numpy())

# Pass the full label list explicitly: the random split is not stratified, so
# a class may be absent from the test set and target_names would otherwise
# stop lining up with the classes that are present.
report = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(labels))),
    target_names=labels,
    zero_division=0,
)
print("\\n Classification Report:\\n", report)

#  Save model & tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f" Model and tokenizer saved to '{model_save_path}'")
"""

#  Prediction script
# Source of predict_resume.py, kept as a (non-raw) string template: every
# backslash meant for the generated file is doubled here.
# The generated script loads the fine-tuned model, classifies one PDF resume
# and prints which role keywords the resume mentions or lacks.
predict_code = """
import re

import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
import fitz  # PyMuPDF
import spacy

#  Load model & tokenizer
model_dir = r"C:\\Users\\sagni\\Downloads\\Resume Selector\\resume_model"
tokenizer = BertTokenizerFast.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

#  Class index -> category name
# Fallback class order; identical to the label2id table hard-coded in app.py.
DEFAULT_CATEGORIES = [
    'Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
    'Mechanical Engineer', 'Sales', 'Health and fitness', 'Civil Engineer',
    'Java Developer', 'Business Analyst', 'SAP Developer', 'Automation Testing',
    'Electrical Engineering', 'Operations Manager', 'Python Developer',
    'DevOps Engineer', 'Network Security Engineer', 'PMO', 'Database',
    'Hadoop', 'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing',
]

def load_id2label(config):
    '''Return {class index: category name} for the loaded model.

    Names stored in the checkpoint win; DEFAULT_CATEGORIES is used when the
    checkpoint only carries placeholder names.
    '''
    names = {int(idx): str(name) for idx, name in (config.id2label or {}).items()}
    # transformers fills in "LABEL_0", "LABEL_1", ... when a checkpoint was
    # saved without a label map; those placeholders carry no information.
    if names and not all(name.startswith('LABEL_') for name in names.values()):
        return names
    return dict(enumerate(DEFAULT_CATEGORIES))

id2label = load_id2label(model.config)

#  Role-specific keywords
role_keywords = {
    'Data Science': ['python', 'machine learning', 'pandas', 'tensorflow'],
    'Python Developer': ['python', 'flask', 'django', 'api'],
    'Java Developer': ['java', 'spring', 'hibernate'],
}

#  Preprocess function
nlp = spacy.load('en_core_web_sm')
def preprocess(text):
    '''Lower-case the text and return its lemmas without stop words or punctuation.'''
    doc = nlp(str(text).lower())
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

#  Extract text from PDF
def extract_text_from_pdf(pdf_path):
    '''Return the concatenated, stripped text of every page in the PDF.'''
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        return ''.join(page.get_text() for page in doc).strip()

#  Keyword lookup
def find_keywords(text, keywords):
    '''Return the subset of keywords that occur in text as whole words/phrases.'''
    # Collapse whitespace so a phrase split across a PDF line break still matches.
    searchable = ' '.join(text.lower().split())
    return {
        keyword for keyword in keywords
        if re.search(r'\\b' + re.escape(keyword) + r'\\b', searchable)
    }

#  Predict and analyze
def predict_resume_from_pdf(pdf_path, threshold=0.85):
    '''Classify the resume stored at pdf_path and print a keyword report.

    pdf_path:  path of the PDF file to analyse.
    threshold: confidence in [0, 1]; predictions below it are flagged as
               uncertain in the printed output.
    Prints its findings and returns None.
    '''
    text = extract_text_from_pdf(pdf_path)
    if not text:
        print(" No text found in the PDF!")
        return

    print(" Extracted Resume Text (first 500 chars):\\n", text[:500], "...\\n")

    clean_text = preprocess(text)
    encoding = tokenizer(clean_text, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        logits = model(**encoding).logits
    probs = torch.softmax(logits, dim=1)[0]
    pred_idx = int(torch.argmax(probs))
    confidence = float(probs[pred_idx])
    # Look the name up in the model's own label table; indexing into
    # role_keywords (3 entries) cannot represent every class the model predicts.
    predicted_category = id2label.get(pred_idx, f"LABEL_{pred_idx}")

    print(f" Predicted Category: {predicted_category}")
    print(f" Confidence Score: {confidence * 100:.2f}%")
    if confidence < threshold:
        print(f" Low confidence (below {threshold * 100:.0f}%) - treat this prediction with caution.")

    #  Keyword Analysis
    keywords = set(role_keywords.get(predicted_category, []))
    if not keywords:
        print(" No keyword list defined for this category.")
        return

    # Match against the raw text: lemmatisation can alter a keyword, and a
    # set of single tokens can never contain a phrase like 'machine learning'.
    present_keywords = find_keywords(text, keywords)
    missing_keywords = keywords - present_keywords

    print(f" Found keywords: {', '.join(sorted(present_keywords)) if present_keywords else 'None'}")
    if missing_keywords:
        print(f" Missing important keywords: {', '.join(sorted(missing_keywords))}")
    else:
        print(" All key skills present!")

#  Example Usage
# Guarded so importing this module does not trigger a prediction.
if __name__ == "__main__":
    pdf_resume_path = r"C:\\Users\\sagni\\Downloads\\Resume NextWave\\Resume.pdf"
    predict_resume_from_pdf(pdf_resume_path)
"""

#  Requirements
# Contents of requirements.txt, one pip package per line.
# flask is listed because the generated app.py imports it.
# NOTE: the spaCy model is not a pip requirement here; install it once with
#   python -m spacy download en_core_web_sm
requirements = """
torch
transformers
spacy
pandas
scikit-learn
PyMuPDF
flask
"""

# Save files
# Each template is stripped of the leading/trailing newline that the
# triple-quoted literal adds, then written under project_dir.
# The encoding is explicit (UTF-8, as for the web-app files) so the result
# does not depend on the platform's default codec.
generated_files = {
    "train_resume_model.py": train_code,
    "predict_resume.py": predict_code,
    "requirements.txt": requirements,
}
for filename, content in generated_files.items():
    with open(os.path.join(project_dir, filename), "w", encoding="utf-8") as f:
        f.write(content.strip())

print(" All files created in:", project_dir)


 All files created in: C:\Users\sagni\Downloads\Resume Selector


In [3]:
import os

# Project root and the two sub-folders created for the web app;
# index.html is written into templates_dir further down.
project_dir = r"C:\Users\sagni\Downloads\Resume Selector"
templates_dir, static_dir = (
    os.path.join(project_dir, subfolder) for subfolder in ("templates", "static")
)
for folder in (templates_dir, static_dir):
    # exist_ok keeps the cell re-runnable.
    os.makedirs(folder, exist_ok=True)

#  Flask app (app.py)
# Source of app.py, kept as a string template. The generated Flask app serves
# the upload page at "/" and classifies an uploaded PDF resume at "/predict".
# The embedded code uses ''' / # for documentation so it cannot terminate
# this literal.
app_code = """
import os

import fitz  # PyMuPDF
import spacy
import torch
from flask import Flask, render_template, request, jsonify
from transformers import BertTokenizerFast, BertForSequenceClassification

app = Flask(__name__)

# Load model and tokenizer
model_dir = os.path.join(os.path.dirname(__file__), "resume_model")
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizerFast.from_pretrained(model_dir)
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

label2id = {'Data Science': 0, 'HR': 1, 'Advocate': 2, 'Arts': 3, 'Web Designing': 4,
            'Mechanical Engineer': 5, 'Sales': 6, 'Health and fitness': 7, 'Civil Engineer': 8,
            'Java Developer': 9, 'Business Analyst': 10, 'SAP Developer': 11, 'Automation Testing': 12,
            'Electrical Engineering': 13, 'Operations Manager': 14, 'Python Developer': 15,
            'DevOps Engineer': 16, 'Network Security Engineer': 17, 'PMO': 18, 'Database': 19,
            'Hadoop': 20, 'ETL Developer': 21, 'DotNet Developer': 22, 'Blockchain': 23, 'Testing': 24}
id2label = {v: k for k, v in label2id.items()}

# Prefer the label names stored in the checkpoint when it has real ones;
# transformers only provides "LABEL_<n>" placeholders when none were saved.
checkpoint_labels = {int(idx): str(name) for idx, name in (model.config.id2label or {}).items()}
if checkpoint_labels and not all(name.startswith('LABEL_') for name in checkpoint_labels.values()):
    id2label = checkpoint_labels

# Same text normalisation the training and CLI prediction scripts apply, so
# the model sees input in the form it was trained on.
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    '''Lower-case the text and return its lemmas without stop words or punctuation.'''
    doc = nlp(str(text).lower())
    return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

def extract_text_from_pdf(pdf_path):
    '''Return the concatenated, stripped text of every page of the PDF at pdf_path.'''
    with fitz.open(pdf_path) as doc:
        return ''.join(page.get_text() for page in doc).strip()

def extract_text_from_pdf_bytes(data):
    '''Return the stripped text of a PDF held in memory as bytes.'''
    with fitz.open(stream=data, filetype="pdf") as doc:
        return ''.join(page.get_text() for page in doc).strip()

def predict_resume(text):
    '''Return (category name, confidence in [0, 1]) for the resume text.'''
    encoding = tokenizer(preprocess(text), truncation=True, padding='max_length', max_length=256, return_tensors='pt')
    encoding = {k: v.to(device) for k, v in encoding.items()}
    with torch.no_grad():
        logits = model(**encoding).logits
    probs = torch.softmax(logits, dim=1)[0]
    pred_idx = int(torch.argmax(probs))
    return id2label.get(pred_idx, f"LABEL_{pred_idx}"), float(probs[pred_idx])

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/predict", methods=["POST"])
def predict():
    file = request.files.get("resume")
    if file is None or not file.filename:
        return jsonify({"error": "No file uploaded"}), 400

    # Parse the upload from memory: the client-chosen filename never touches
    # the filesystem, and there is no temporary file to clean up on failure.
    try:
        text = extract_text_from_pdf_bytes(file.read())
    except (RuntimeError, ValueError):
        return jsonify({"error": "Could not read the uploaded PDF"}), 400
    if not text:
        return jsonify({"error": "No text found in the PDF"}), 400

    category, confidence = predict_resume(text)
    return jsonify({
        "category": category,
        "confidence": f"{confidence*100:.2f}%"
    })

if __name__ == "__main__":
    # The interactive debugger lets a browser run arbitrary code, so it is
    # opt-in: set FLASK_DEBUG=1 while developing locally.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")
"""

#  HTML frontend (index.html)
# Source of templates/index.html: a single upload form whose script posts the
# chosen PDF to /predict and shows the JSON answer.
# The script reports failed or non-JSON responses instead of leaving an
# unhandled promise rejection, and renders the result through text nodes so
# response values are never parsed as markup.
html_code = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Resume Classifier</title>
    <style>
        body { font-family: Arial, sans-serif; text-align: center; margin-top: 50px; }
        input[type=file], button { padding: 10px; margin: 10px; }
    </style>
</head>
<body>
    <h1>Resume Role Predictor</h1>
    <form id="uploadForm" enctype="multipart/form-data">
        <input type="file" name="resume" accept=".pdf" required><br>
        <button type="submit">Predict Role</button>
    </form>
    <div id="result"></div>

    <script>
        document.getElementById("uploadForm").onsubmit = async function(event) {
            event.preventDefault();
            const resultBox = document.getElementById("result");
            const formData = new FormData(this);
            resultBox.textContent = "";
            try {
                const response = await fetch("/predict", { method: "POST", body: formData });
                const result = await response.json();
                if (!response.ok || result.error) {
                    alert(result.error || "Prediction failed (HTTP " + response.status + ")");
                    return;
                }
                const role = document.createElement("h2");
                role.textContent = " Predicted Role: " + result.category;
                const confidence = document.createElement("h3");
                confidence.textContent = " Confidence: " + result.confidence;
                resultBox.append(role, confidence);
            } catch (err) {
                alert("Could not get a prediction: " + err);
            }
        };
    </script>
</body>
</html>
"""

# Write files with utf-8 encoding
# (destination path, template text) pairs, written in this order.
web_app_outputs = (
    (os.path.join(project_dir, "app.py"), app_code),
    (os.path.join(templates_dir, "index.html"), html_code),
)
for destination, template in web_app_outputs:
    # strip() drops the blank first/last line the triple-quoted literal adds.
    with open(destination, "w", encoding="utf-8") as f:
        f.write(template.strip())

print(" Web app created in:", project_dir)


 Web app created in: C:\Users\sagni\Downloads\Resume Selector
