Install Required Libraries

In [1]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [2]:
nlp = spacy.load("en_core_web_sm")


 Load Sample Dataset

In [3]:
df = pd.read_csv("/content/UpdatedResumeDataSet.csv")


Preprocess Text with spaCy  (Tokenization + Stopword Removal + Lemmatization
python)

In [4]:
def preprocess_text(text):
    doc = nlp(str(text).lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

df['Cleaned_Resume'] = df['Resume'].apply(preprocess_text)

TF-IDF Vectorization

In [5]:
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['Cleaned_Resume'])

In [6]:
le = LabelEncoder()
y = le.fit_transform(df['Category'])


Train-Test Split and Model Training

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

Evaluation

In [13]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)



In [14]:
print("✅ Accuracy:", accuracy)
print(report)


✅ Accuracy: 0.9896373056994818
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      1.00      1.00         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      0.92      0.96        12
                   Hadoop     

Prediction

In [16]:
def predict_resume(text):
    cleaned = preprocess_text(text)
    vec = vectorizer.transform([cleaned])
    prediction = model.predict(vec)[0]
    return le.inverse_transform([prediction])[0]

# Try a new resume
resume = "Developed machine learning models in Python using Pandas and Scikit-learn."
print("Predicted category:", predict_resume(resume))

Predicted category: Data Science
