In [4]:
import pandas as pd
import re

print("Loading data...")
df = pd.read_csv(r"C:\Users\admin\Desktop\Code\Projects\Project-2\resume-job-description-fit\train.csv")

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_resume'] = df['resume_text'].apply(clean_text)
df['clean_jd'] = df['job_description_text'].apply(clean_text)

print("Step 1 completed - Data loaded and cleaned.")


Loading data...
Step 1 completed - Data loaded and cleaned.


In [5]:
from sklearn.preprocessing import LabelEncoder

print("Encoding labels...")
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])

print(f" Step 2 complete: Labels encoded → {list(le.classes_)}")


Encoding labels...
 Step 2 complete: Labels encoded → ['Good Fit', 'No Fit', 'Potential Fit']


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

print("Loading transformer model...")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

torch.set_grad_enabled(False)

def get_cls_embeddings(texts, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=128)
        outputs = model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
        embeddings.append(cls_embeddings)
    return np.vstack(embeddings)

print("Extracting embeddings for resumes...")
resume_embeddings = get_cls_embeddings(df['clean_resume'].tolist())

print("Extracting embeddings for job descriptions...")
jd_embeddings = get_cls_embeddings(df['clean_jd'].tolist())

print("step 3 complete: Embeddings generated.")


In [15]:
print("Constructing feature vectors...")
X = np.hstack((resume_embeddings, jd_embeddings))
y = df['label_enc'].values

print(f"Step 4 complete: Final feature shape = {X.shape}")


Constructing feature vectors...
Step 4 complete: Final feature shape = (6241, 768)


In [16]:
from sklearn.model_selection import train_test_split

print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Step 5 complete: Train shape = {X_train.shape}, Test shape = {X_test.shape}")


Splitting data into train and test sets...
Step 5 complete: Train shape = (4992, 768), Test shape = (1249, 768)


In [17]:
import xgboost as xgb

print("Training XGBoost classifier...")
clf = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

clf.fit(X_train, y_train)

print("Step 6 completed - XGBoost model trained.")


Training XGBoost classifier...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Step 6 completed - XGBoost model trained.


In [18]:
from sklearn.metrics import classification_report

print("Evaluating model...")
y_pred = clf.predict(X_test)

print("Step 7 complete: Classification Report")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Evaluating model...
Step 7 complete: Classification Report
               precision    recall  f1-score   support

     Good Fit       0.65      0.76      0.70       303
       No Fit       0.73      0.77      0.75       658
Potential Fit       0.68      0.49      0.57       288

     accuracy                           0.70      1249
    macro avg       0.69      0.67      0.67      1249
 weighted avg       0.70      0.70      0.70      1249



In [19]:
import joblib
joblib.dump(clf, 'xgb_model.pkl')
joblib.dump(le, 'label_encoder.pkl')
print("project done")

project done
