In [None]:
import pandas as pd

# Load the Resume.csv file
data = pd.read_csv("Resume.csv")
print("File loaded successfully!")
data.head()


File loaded successfully!


Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [None]:
# Display basic information about the dataset
print("Dataset Information:")
print(data.info())

# Display the first 5 rows
print("\nFirst 5 Rows:")
data.head()

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB
None

First 5 Rows:

Missing Values:
ID             0
Resume_str     0
Resume_html    0
Category       0
dtype: int64


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords (if not already downloaded)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_resume(text):
    text = re.sub(r'[^a-zA-Z ]', ' ', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Apply the cleaning function to Resume_str column
data['Cleaned_Resume'] = data['Resume_str'].apply(clean_resume)

# Display the cleaned resumes
print("Cleaned Resumes:")
data[['Resume_str', 'Cleaned_Resume']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaned Resumes:


Unnamed: 0,Resume_str,Cleaned_Resume
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,hr administrator marketing associate hr admini...
1,"HR SPECIALIST, US HR OPERATIONS ...",hr specialist us hr operations summary versati...
2,HR DIRECTOR Summary Over 2...,hr director summary years experience recruitin...
3,HR SPECIALIST Summary Dedica...,hr specialist summary dedicated driven dynamic...
4,HR MANAGER Skill Highlights ...,hr manager skill highlights hr skills hr depar...


**Text Vectorization**

To feed the text data into a machine learning model, we need to convert the cleaned resumes into numerical features. We'll use TF-IDF (Term frequency-Inverse Document Frequency), a common method for text vectorization.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 words for efficiency

# Apply TF-IDF on the 'Cleaned_Resume' column
X = tfidf_vectorizer.fit_transform(data['Cleaned_Resume'])

# Check the shape of the transformed data
print("Shape of TF-IDF Matrix:", X.shape)


Shape of TF-IDF Matrix: (2484, 5000)


In [None]:
#encode target variable
from sklearn.preprocessing import LabelEncoder

# Encode the target variable (Category)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Category'])

# Display encoded labels and their original values
print("Encoded Labels:")
for i, category in enumerate(label_encoder.classes_):
    print(f"{i}: {category}")


Encoded Labels:
0: ACCOUNTANT
1: ADVOCATE
2: AGRICULTURE
3: APPAREL
4: ARTS
5: AUTOMOBILE
6: AVIATION
7: BANKING
8: BPO
9: BUSINESS-DEVELOPMENT
10: CHEF
11: CONSTRUCTION
12: CONSULTANT
13: DESIGNER
14: DIGITAL-MEDIA
15: ENGINEERING
16: FINANCE
17: FITNESS
18: HEALTHCARE
19: HR
20: INFORMATION-TECHNOLOGY
21: PUBLIC-RELATIONS
22: SALES
23: TEACHER


In [None]:
from sklearn.model_selection import train_test_split

# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)


Training Set Shape: (1987, 5000) (1987,)
Testing Set Shape: (497, 5000) (497,)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Model Accuracy: 0.6519114688128773

Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.67      0.83      0.74        24
              ADVOCATE       0.35      0.54      0.43        24
           AGRICULTURE       1.00      0.46      0.63        13
               APPAREL       0.67      0.21      0.32        19
                  ARTS       0.50      0.29      0.36        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.82      0.75      0.78        24
               BANKING       0.83      0.65      0.73        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.46      0.79      0.58        24
                  CHEF       0.81      0.71      0.76        24
          CONSTRUCTION       0.82      0.82      0.82        22
            CONSULTANT       0.44      0.17      0.25        23
              DESIGNER       0.84      0.76 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Resampled training data shape:", X_train_res.shape, y_train_res.shape)




Resampled training data shape: (2304, 5000) (2304,)


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))


Random Forest Accuracy: 0.676056338028169

Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.64      0.96      0.77        24
              ADVOCATE       0.74      0.71      0.72        24
           AGRICULTURE       1.00      0.46      0.63        13
               APPAREL       0.75      0.32      0.44        19
                  ARTS       0.40      0.10      0.15        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.77      0.83      0.80        24
               BANKING       0.79      0.65      0.71        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.55      0.75      0.63        24
                  CHEF       0.83      0.79      0.81        24
          CONSTRUCTION       0.83      0.91      0.87        22
            CONSULTANT       0.50      0.22      0.30        23
              DESIGNER       0.80    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))


SVM Accuracy: 0.6338028169014085

Classification Report:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.68      0.79      0.73        24
              ADVOCATE       0.30      0.54      0.39        24
           AGRICULTURE       1.00      0.38      0.56        13
               APPAREL       0.57      0.21      0.31        19
                  ARTS       0.44      0.38      0.41        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.80      0.67      0.73        24
               BANKING       0.83      0.65      0.73        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.49      0.75      0.59        24
                  CHEF       0.85      0.71      0.77        24
          CONSTRUCTION       0.84      0.73      0.78        22
            CONSULTANT       0.33      0.30      0.32        23
              DESIGNER       0.79      0.71   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from transformers import pipeline

# Load the BERT pipeline
classifier = pipeline("text-classification", model="bert-base-uncased", return_all_scores=True)

# Truncate the input text to fit within the 512-token limit
resume_text = data['Cleaned_Resume'][0][:512]

# Get the classification result
results = classifier(resume_text)
print("Results:", results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


Results: [[{'label': 'LABEL_0', 'score': 0.550598680973053}, {'label': 'LABEL_1', 'score': 0.44940125942230225}]]


In [None]:
import joblib

# Save the trained model and vectorizer
joblib.dump(model, "resume_model.pkl")  # Trained classification model
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")  # TF-IDF vectorizer

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


In [None]:
import os

print("Files in Current Directory:", os.listdir())


Files in Current Directory: ['.config', 'resume_model.pkl', 'resume_reviewer.py', 'Resume.csv', 'tfidf_vectorizer.pkl', 'sample_data']


In [None]:
from google.colab import files
files.download("resume_model.pkl")
files.download("tfidf_vectorizer.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>