<a href="https://colab.research.google.com/github/subhashjprasad/machine-learning-projects/blob/main/resumeClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preparing Data

In [1]:
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict

In [2]:
!pip install -q transformers datasets

In [3]:
resume_data = pd.read_csv("Resume.csv")

resume_data.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
resume_data = pd.DataFrame(resume_data)

resume_data = resume_data.drop(columns = ['ID', 'Resume_html'])

resume_data = pd.get_dummies(resume_data, columns = ['Category'])

In [5]:
resume_data = Dataset.from_pandas(resume_data)

train_testvalid = resume_data.train_test_split(test_size = 0.1)
test_valid = train_testvalid['test'].train_test_split(test_size = 0.5)
resume_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [6]:
resume_dataset

DatasetDict({
    train: Dataset({
        features: ['Resume_str', 'Category_ACCOUNTANT', 'Category_ADVOCATE', 'Category_AGRICULTURE', 'Category_APPAREL', 'Category_ARTS', 'Category_AUTOMOBILE', 'Category_AVIATION', 'Category_BANKING', 'Category_BPO', 'Category_BUSINESS-DEVELOPMENT', 'Category_CHEF', 'Category_CONSTRUCTION', 'Category_CONSULTANT', 'Category_DESIGNER', 'Category_DIGITAL-MEDIA', 'Category_ENGINEERING', 'Category_FINANCE', 'Category_FITNESS', 'Category_HEALTHCARE', 'Category_HR', 'Category_INFORMATION-TECHNOLOGY', 'Category_PUBLIC-RELATIONS', 'Category_SALES', 'Category_TEACHER'],
        num_rows: 2235
    })
    test: Dataset({
        features: ['Resume_str', 'Category_ACCOUNTANT', 'Category_ADVOCATE', 'Category_AGRICULTURE', 'Category_APPAREL', 'Category_ARTS', 'Category_AUTOMOBILE', 'Category_AVIATION', 'Category_BANKING', 'Category_BPO', 'Category_BUSINESS-DEVELOPMENT', 'Category_CHEF', 'Category_CONSTRUCTION', 'Category_CONSULTANT', 'Category_DESIGNER', 'Category_

In [75]:
example = resume_dataset['train'][30]
example

{'Resume_str': "         PROGRAM MANAGER       Professional Summary    Highly-motivated community service professional skilled at networking, media outreach and relationship development. Flexible and versatile team player who maintains a sense of humor under pressure.      Core Qualifications          Citizen engagement  Employee relations  Media relations  Inter-governmental and legislative affairs  Social media  Event planning and logistics  Public speaking  Copywriting and copyediting  Microsoft Word and Excel expertise  Strong communication skills              Experience     10/2015   to   08/2016     Program Manager    Company Name   －   City  ,   State      Contributed to relevant blogs, conferences and events both off-line and online to increase brand awareness.       Managed the complete redesign and launch of the company's website in  [Number]  months.       Created an official company page on Facebook to facilitate interaction with customers.       Managed all social media pr

In [8]:
labels = [label for label in resume_dataset['train'].features.keys() if label not in ['Resume_str']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['Category_ACCOUNTANT',
 'Category_ADVOCATE',
 'Category_AGRICULTURE',
 'Category_APPAREL',
 'Category_ARTS',
 'Category_AUTOMOBILE',
 'Category_AVIATION',
 'Category_BANKING',
 'Category_BPO',
 'Category_BUSINESS-DEVELOPMENT',
 'Category_CHEF',
 'Category_CONSTRUCTION',
 'Category_CONSULTANT',
 'Category_DESIGNER',
 'Category_DIGITAL-MEDIA',
 'Category_ENGINEERING',
 'Category_FINANCE',
 'Category_FITNESS',
 'Category_HEALTHCARE',
 'Category_HR',
 'Category_INFORMATION-TECHNOLOGY',
 'Category_PUBLIC-RELATIONS',
 'Category_SALES',
 'Category_TEACHER']

Preprocessing Data

In [9]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Resume_str"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length = 512)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
encoded_dataset = resume_dataset.map(preprocess_data, batched=True, remove_columns = resume_dataset['train'].column_names)

Map:   0%|          | 0/2235 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/124 [00:00<?, ? examples/s]

In [11]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [12]:
tokenizer.decode(example['input_ids'])

'[CLS] sales summary to obtain a position where i can utilize my skills and work in an environment that will enhance my knowledge and career. great organization and communication skills that will aid in excellent customer service and satisfaction. highlights bi - lingual multi - line system expert superior communication skills data entry claims expert install coordinator proficient in ordering materials payroll cheerful and energetic effective team player superior organization skills dependable and reliable goal oriented self motivated experience sales 03 / 2016 to current company name city, state managing job after sale to completion of install handling claims invoices ordering material exchanges and returns track down custom orders to make sure they arrive on time up - selling customers on special materialsmanaged wide variety of customer services and administrative tasks to resolve customer issues quickly and efficiently install sales coordinator 03 / 2014 to 03 / 2016 company name 

In [13]:
example['labels']

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0]

In [14]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['Category_SALES']

In [15]:
encoded_dataset.set_format("torch")

Defining and Training Model

In [16]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
batch_size = 8
metric_name = "f1"

In [18]:
!pip install -U accelerate
!pip install -U transformers



In [19]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [20]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [21]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [22]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  4341, 12654,  2000,  6855,  1037,  2597,  2073,  1045,  2064,
        16462,  2026,  4813,  1998,  2147,  1999,  2019,  4044,  2008,  2097,
        11598,  2026,  3716,  1998,  2476,  1012,  2307,  3029,  1998,  4807,
         4813,  2008,  2097,  4681,  1999,  6581,  8013,  2326,  1998,  9967,
         1012, 11637, 12170,  1011, 17002,  8787,  4800,  1011,  2240,  2291,
         6739,  6020,  4807,  4813,  2951,  4443,  4447,  6739, 16500, 10669,
        27029,  1999, 13063,  4475, 26854, 18350,  1998, 18114,  4621,  2136,
         2447,  6020,  3029,  4813, 12530,  3085,  1998, 10539,  3125,  8048,
         2969, 12774,  3325,  4341,  6021,  1013,  2355,  2000,  2783,  2194,
         2171,  2103,  1010,  2110,  6605,  3105,  2044,  5096,  2000,  6503,
         1997, 16500,  8304,  4447,  1999,  6767, 23522, 13063,  3430, 15800,
         1998,  5651,  2650,  2091,  7661,  4449,  2000,  2191,  2469,  2027,
         7180,  2006,  2051,  2039,  1011,  4855,  6304,  2006, 

In [23]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.6681, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.4837, -0.3710,  0.2512,  0.3977, -0.7446,  0.4706, -0.7448,  0.3009,
         -0.2457, -0.4882,  0.1009, -0.1964, -0.1705, -0.4283, -0.2520,  0.5075,
         -0.0312,  0.8559, -0.3040, -0.4290,  0.1912,  0.1854,  0.3691, -0.2324]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.170383,0.0,0.5,0.0
2,0.201700,0.128177,0.0,0.5,0.0
3,0.201700,0.097693,0.310811,0.592567,0.185484
4,0.113900,0.084257,0.723618,0.789797,0.580645
5,0.113900,0.080587,0.77512,0.825912,0.653226


TrainOutput(global_step=1400, training_loss=0.1376846068246024, metrics={'train_runtime': 1223.1762, 'train_samples_per_second': 9.136, 'train_steps_per_second': 1.145, 'total_flos': 2940846831820800.0, 'train_loss': 0.1376846068246024, 'epoch': 5.0})

In [26]:
trainer.evaluate()

{'eval_loss': 0.08058725297451019,
 'eval_f1': 0.7751196172248804,
 'eval_roc_auc': 0.8259116409537167,
 'eval_accuracy': 0.6532258064516129,
 'eval_runtime': 4.1268,
 'eval_samples_per_second': 30.047,
 'eval_steps_per_second': 3.877,
 'epoch': 5.0}

Predicting on New Resume

In [90]:
resume_text = '''         PROGRAM MANAGER       Professional Summary    Highly-motivated community service professional skilled at networking, media outreach and relationship development. Flexible and versatile team player who maintains a sense of humor under pressure.      Core Qualifications          Citizen engagement  Employee relations  Media relations  Inter-governmental and legislative affairs  Social media  Event planning and logistics  Public speaking  Copywriting and copyediting  Microsoft Word and Excel expertise  Strong communication skills              Experience     10/2015   to   08/2016     Program Manager    Company Name   －   City  ,   State      Contributed to relevant blogs, conferences and events both off-line and online to increase brand awareness.       Managed the complete redesign and launch of the company's website in  [Number]  months.       Created an official company page on Facebook to facilitate interaction with customers.       Managed all social media programs, including Internet forums, blogs, social networking applications and message boards.      Presented on current promotions to the public at events and tradeshows.      Wrote newsletter marketing copy and presentation materials for special projects.     Established effective working relationships with clients, government officials and media representatives.     Developed and implemented communication strategies and information programs.     Gathered and analyzed data on community needs and interests.     Developed and published a monthly citizen newsletter.       Organized public appearances, lectures, contests and exhibits to increase product awareness.     Designed web and other content, including monthly newsletters and promotional calendars.         Managed the editorial content, design and distribution of the external company newsletter.       Conferred with production, graphic design and web-design personnel to coordinate production of corporate communications materials.         Developed corporate communications strategies and programs, including project timelines.         Coached less experienced public relations staff members on corporate communications practices.         Proofread and reviewed all print and electronic content for correct grammar and adherence to house style.         Revised campaigns in response to feedback from the creative director, account team and clients.         Reviewed and edited colleagues' written work for grammar, tone, voice and creative quality.           Implemented SEO strategy, resulting in  [Number] % increase to website hits.         Used software to manage efficient delivery and track content drafts.           Communicated with designers, graphic producers, video editors and videographers to create cohesive company voice.           Researched industrial and technical information to quickly come up to speed with unfamiliar industries.           Maintained awareness of digital trends and new emerging technologies and platforms.             08/2011   to   Current     Office of State Representative Intern District Director      City  ,   State      Recruited, trained and supervised 8-12 new staff members, interns and volunteers each year.  Reviewed staff work and gave comprehensive and constructive feedback.  Developed a 28-page training manual for new interns and volunteers.  Developed training program for specific, assigned job tasks, including database management and constituent casework.  Drafted meeting agendas, supplied advance materials and executed follow-up for meetings and team conferences.  Planned and publicized events, including securing more than $150,000 in sponsorships.  Edited and distributed press releases and pitches to local and national media outlets, securing positive coverage in multiple publications.  Attended community meetings and forums to answer questions, address complaints and explain procedures.  Collaborated with community members to educate the public regarding issues such as constitutional amendments and newly enacted legislation.  Received and screened a high volume of internal and external communications, including email and mail.  Simplified topics such as healthcare, energy, government and technology through clear, concise and compelling writing.  Wrote newsletter copy and presentation materials for special projects.  Created and maintained spreadsheets using advanced Excel functions and calculations to develop reports and lists.         01/2012   to   05/2012     Intern    Company Name   －   City  ,   State      Identified customer needs through market research and analysis.  Tracked communication regarding clients using print and electronic media.          Education          Bachelorof Science  :   Public Relations    University of Texas at Austin          Public Relations            Bachelor of Journalism  :   Broadcast    University of Texas   －   City        Broadcast          Skills    Excellent interpersonal skills, Strong communication skills, concise, conferences, copyediting, Copywriting, clients, database management, email, Employee relations, Event planning, government, logistics, market research and analysis, materials, Media relations, meetings, Excel, mail, Microsoft Word, newsletter, page, press releases, problem solver, Public speaking, publications, Fast learner, Self-starter, spreadsheets'''

In [91]:
encoding = tokenizer(resume_text, return_tensors="pt", padding="max_length", truncation=True, max_length = 512)
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

logits = outputs.logits

# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
idx = torch.argmax(probs)
# turn predicted id's into actual label names
predicted_label = id2label[idx.item()]
print(predicted_label[9:] + '\n')
print("All Probabilities: ")
all_probs = []
for i in range(len(probs)):
    all_probs.append((id2label[i][9:] + ": ", str(probs[i].item())))
all_probs.sort(key = lambda a: a[1], reverse = True)
for i in all_probs:
    print(i[0] + i[1])

PUBLIC-RELATIONS

All Probabilities: 
PUBLIC-RELATIONS: 0.5161656141281128
DIGITAL-MEDIA: 0.08589410781860352
ARTS: 0.07381365448236465
BUSINESS-DEVELOPMENT: 0.0639791414141655
ADVOCATE: 0.05970282480120659
BANKING: 0.0573815256357193
AGRICULTURE: 0.04858798161149025
APPAREL: 0.046936552971601486
CONSULTANT: 0.04624016582965851
DESIGNER: 0.045550405979156494
FINANCE: 0.0427267886698246
SALES: 0.041872330009937286
FITNESS: 0.04118255153298378
CHEF: 0.03962745890021324
HEALTHCARE: 0.038089483976364136
AVIATION: 0.03517140820622444
HR: 0.0290546715259552
INFORMATION-TECHNOLOGY: 0.02706136181950569
BPO: 0.026406222954392433
ACCOUNTANT: 0.024931924417614937
AUTOMOBILE: 0.021704308688640594
CONSTRUCTION: 0.021369121968746185
ENGINEERING: 0.02099710702896118
TEACHER: 0.017785513773560524
