In [9]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [10]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,ID,Name,Role,Transcript,Resume,decision,Reason_for_decision,Job_Description
0,jasojo159,Jason Jones,E-commerce Specialist,"Interviewer: Good morning, Jason. It's great t...",Here's a professional resume for Jason Jones:\...,reject,Lacked leadership skills for a senior position.,Be part of a passionate team at the forefront ...
1,annma759,Ann Marshall,Game Developer,Interview Scene\n\nA conference room with a ta...,Here's a professional resume for Ann Marshall:...,select,Strong technical skills in AI and ML.,Help us build the next-generation products as ...
2,patrmc729,Patrick Mcclain,Human Resources Specialist,Interview Setting: A conference room in a medi...,Here's a professional resume for Patrick Mccla...,reject,Insufficient system design expertise for senio...,We need a Human Resources Specialist to enhanc...
3,patrgr422,Patricia Gray,E-commerce Specialist,Here's a simulated professional interview for ...,Here's a professional resume for Patricia Gray...,select,Impressive leadership and communication abilit...,Be part of a passionate team at the forefront ...
4,amangr696,Amanda Gross,E-commerce Specialist,Here's the simulated interview:\n\nInterviewer...,Here's a professional resume for Amanda Gross:...,reject,Lacked leadership skills for a senior position.,We are looking for an experienced E-commerce S...


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10174 entries, 0 to 10173
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10174 non-null  object
 1   Name                 10174 non-null  object
 2   Role                 10174 non-null  object
 3   Transcript           10174 non-null  object
 4   Resume               10174 non-null  object
 5   decision             10174 non-null  object
 6   Reason_for_decision  10174 non-null  object
 7   Job_Description      10174 non-null  object
dtypes: object(8)
memory usage: 636.0+ KB


In [15]:
df.isnull().sum()

ID                     0
Name                   0
Role                   0
Transcript             0
Resume                 0
decision               0
Reason_for_decision    0
Job_Description        0
dtype: int64

In [16]:
y = df["decision"]


In [19]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [20]:
df["Resume"] = df["Resume"].apply(clean_text)


In [21]:
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df["Resume"])
y = df["Role"]


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [23]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [24]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8968058968058968

Classification Report:
                               precision    recall  f1-score   support

                 AI Engineer       0.00      0.00      0.00         4
               AI Researcher       0.96      1.00      0.98        46
             AR/VR Developer       1.00      1.00      1.00        46
        Blockchain Developer       1.00      1.00      1.00        46
            Business Analyst       1.00      0.91      0.95        45
             Cloud Architect       1.00      1.00      1.00        55
              Cloud Engineer       1.00      1.00      1.00        46
              Content Writer       1.00      1.00      1.00        44
       Cybersecurity Analyst       0.98      1.00      0.99        45
    Cybersecurity Specialist       0.00      0.00      0.00         1
                Data Analyst       0.93      0.83      0.88        65
              Data Architect       1.00      0.89      0.94        44
               Data Engineer       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
def predict_resume(resume_text):
    resume_text = clean_text(resume_text)
    resume_vector = vectorizer.transform([resume_text])
    prediction = model.predict(resume_vector)
    return prediction[0]


In [26]:
sample_resume = """
Python, Machine Learning, Data Science, Pandas, NumPy,
SQL, Data Visualization, NLP
"""

print("Predicted Category:", predict_resume(sample_resume))


Predicted Category: Data Scientist


In [27]:
df["Role"].value_counts()


Role
Data Scientist                  538
Software Engineer               480
Product Manager                 458
Data Engineer                   447
UI Engineer                     375
Data Analyst                    329
data engineer                   307
software engineer               307
product manager                 303
data scientist                  287
E-commerce Specialist           268
DevOps Engineer                 266
Machine Learning Engineer       265
Human Resources Specialist      262
Digital Marketing Specialist    260
Robotics Engineer               257
Cloud Architect                 254
Blockchain Developer            251
QA Engineer                     251
Mobile App Developer            247
Full Stack Developer            246
Database Administrator          243
Cloud Engineer                  240
Game Developer                  239
Content Writer                  238
AR/VR Developer                 237
Cybersecurity Analyst           234
UX Designer            