In [None]:
! pip install pdfminer.six

In [None]:
! pip install  seqeval transformers

In [None]:
import re
import json
import logging
import numpy as np
from tqdm import trange
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


def convert_goldparse(dataturks_JSON_FilePath):
    """Convert a Dataturks JSON-lines NER export into spaCy-style training tuples.

    Every non-blank line of the file is parsed as one JSON record. Newlines in
    the record's ``content`` are replaced by single spaces (the text length is
    unchanged, so annotation offsets stay valid). For each annotation only the
    first entry of ``points`` is used; leading/trailing whitespace inside the
    annotated text shrinks the span accordingly.

    Args:
        dataturks_JSON_FilePath: Path to the export file (one JSON object per line).

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples
        where ``end`` is exclusive, or ``None`` if the file could not be read
        or parsed (the failure is logged with its traceback).
    """
    try:
        training_data = []
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # A stray blank line must not abort the whole conversion.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content'].replace("\n", " ")
                entities = []
                data_annotations = data['annotation']
                if data_annotations is not None:
                    for annotation in data_annotations:
                        point = annotation['points'][0]
                        labels = annotation['label']
                        if not isinstance(labels, list):
                            labels = [labels]
                        if not labels:
                            continue

                        # The trimmed span is the same for every label of this
                        # annotation, so compute it once.
                        point_text = point['text']
                        point_start = point['start'] + \
                            (len(point_text) - len(point_text.lstrip()))
                        point_end = point['end'] - \
                            (len(point_text) - len(point_text.rstrip()))

                        for label in labels:
                            # Source offsets have an inclusive end; store it exclusive.
                            entities.append(
                                (point_start, point_end + 1, label))
                training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        # Best-effort loader: log the failure and signal it with None.
        logging.exception("Unable to process %s\nerror = %s",
                          dataturks_JSON_FilePath, e)
        return None


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Spans are treated as half-open ``[start, end)`` character ranges into the
    accompanying text. Spans are first clamped to the text bounds; a span that
    consists only of whitespace (and therefore trims to nothing) is dropped
    instead of being emitted as an empty or inverted range.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e. an
            iterable of ``(text, {"entities": [(start, end, label), ...]})``.

    Returns:
        list: The cleaned data as ``[text, {"entities": [[start, end, label], ...]}]``
        items, in the input order.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            # Clamp so an out-of-range annotation cannot index past the text.
            valid_start = max(start, 0)
            valid_end = min(end, len(text))
            # Each loop is bounded by the other edge so the two can never cross.
            while valid_start < valid_end and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data


def get_label(offset, labels):
    """Return the entity label covering a token, or ``'O'`` if none does.

    Both the token offset and the entity spans are half-open ``[start, end)``
    character ranges, so two ranges overlap only when each one starts strictly
    before the other ends. A token that merely touches an entity boundary
    (e.g. punctuation directly after a name) is therefore not labelled.

    Args:
        offset: ``(start, end)`` character offsets of the token. ``(0, 0)`` is
            treated as a special/padding token and always maps to ``'O'``.
        labels: Iterable of ``(start, end, label)`` entity spans. The first
            overlapping span wins.

    Returns:
        The label of the first overlapping span, otherwise ``'O'``.
    """
    tok_start, tok_end = offset[0], offset[1]
    if tok_start == 0 and tok_end == 0:
        return 'O'
    for label in labels:
        if tok_end > label[0] and tok_start < label[1]:
            return label[2]
    return 'O'


# Tag inventory for the token classifier; a tag's position in this list is its class id.
tags_vals = [
    "UNKNOWN",
    "O",
    "Name",
    "Degree",
    "Skills",
    "College Name",
    "Email Address",
    "Designation",
    "Companies worked at",
    "Graduation Year",
    "Years of Experience",
    "Location",
]

# Lookup tables in both directions: class id -> tag and tag -> class id.
idx2tag = dict(enumerate(tags_vals))
tag2idx = {tag: index for index, tag in idx2tag.items()}


def process_resume(data, tokenizer, tag2idx, max_len, is_test=False):
    """Tokenize one resume and align its entity annotations to the tokens.

    Args:
        data: ``(text, {"entities": [(start, end, label), ...]})`` sample.
        tokenizer: Tokenizer exposing ``encode_plus`` with offset mapping.
        tag2idx: Mapping from tag name to class id.
        max_len: Sequence length requested from the tokenizer; shorter
            encodings are right-padded with zeros up to this length.
        is_test: When true, no labels are produced (``labels`` and
            ``orig_labels`` stay empty).

    Returns:
        Dict with ``input_ids``, ``token_type_ids``, ``attention_mask``,
        ``labels`` (class ids) and ``orig_labels`` (tag names, not padded).
    """
    # NOTE(review): `pad_to_max_length` is the legacy padding switch and no
    # explicit truncation is requested - confirm against the installed
    # transformers version.
    tok = tokenizer.encode_plus(
        data[0], max_length=max_len, pad_to_max_length=True, return_offsets_mapping=True)
    curr_sent = {'orig_labels': [], 'labels': []}

    padding_length = max_len - len(tok['input_ids'])

    if not is_test:
        # Later annotations take priority on overlap. Work on a reversed COPY:
        # reversing the dataset's own list in place would flip that priority
        # every time the same sample is fetched again (e.g. once per epoch).
        labels = list(reversed(data[1]['entities']))
        for off in tok['offset_mapping']:
            label = get_label(off, labels)
            curr_sent['orig_labels'].append(label)
            curr_sent['labels'].append(tag2idx[label])
        curr_sent['labels'] = curr_sent['labels'] + ([0] * padding_length)

    curr_sent['input_ids'] = tok['input_ids'] + ([0] * padding_length)
    curr_sent['token_type_ids'] = tok['token_type_ids'] + \
        ([0] * padding_length)
    curr_sent['attention_mask'] = tok['attention_mask'] + \
        ([0] * padding_length)
    return curr_sent


class ResumeDataset(Dataset):
    """Dataset that tokenizes and labels resume samples lazily, one per access.

    Each item is produced by ``process_resume`` and returned as a dict of
    ``torch.long`` tensors plus the un-encoded tag names under ``orig_label``.
    """

    # Fields of the processed sample that are converted to long tensors.
    _TENSOR_FIELDS = ('input_ids', 'token_type_ids', 'attention_mask', 'labels')

    def __init__(self, resume, tokenizer, tag2idx, max_len, is_test=False):
        self.resume, self.tokenizer = resume, tokenizer
        self.is_test = is_test
        self.tag2idx, self.max_len = tag2idx, max_len

    def __len__(self):
        return len(self.resume)

    def __getitem__(self, idx):
        sample = process_resume(
            self.resume[idx],
            self.tokenizer,
            self.tag2idx,
            self.max_len,
            self.is_test,
        )
        item = {
            field: torch.tensor(sample[field], dtype=torch.long)
            for field in self._TENSOR_FIELDS
        }
        # Tag names stay a plain Python list.
        item['orig_label'] = sample['orig_labels']
        return item


def get_hyperparameters(model, ff):
    """Build optimizer parameter groups for the model.

    Args:
        model: Module exposing ``named_parameters()`` and, when ``ff`` is
            false, a ``classifier`` sub-module.
        ff: Full fine-tuning flag. If true, every parameter is optimized and
            split into a weight-decayed group and a no-decay group (biases and
            layer-norm parameters). If false, only the classifier head's
            parameters are returned, in a single group.

    Returns:
        A list of parameter-group dicts suitable for a ``torch.optim`` optimizer.
    """
    if ff:
        # "gamma"/"beta" are legacy layer-norm parameter names; current
        # checkpoints name the scale "LayerNorm.weight" ("LayerNorm.bias" is
        # already caught by "bias").
        no_decay = ("bias", "gamma", "beta", "LayerNorm.weight")
        decay_params, no_decay_params = [], []
        for name, param in model.named_parameters():
            if any(nd in name for nd in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        # torch.optim optimizers read the per-group option "weight_decay";
        # "weight_decay_rate" alone is stored but never applied. The old key is
        # kept alongside so any consumer that still reads it sees the same value.
        return [
            {
                "params": decay_params,
                "weight_decay": 0.01,
                "weight_decay_rate": 0.01,
            },
            {
                "params": no_decay_params,
                "weight_decay": 0.0,
                "weight_decay_rate": 0.0,
            },
        ]

    # Head-only training: a single group with optimizer defaults.
    return [{"params": [p for _, p in model.classifier.named_parameters()]}]


def get_special_tokens(tokenizer, tag2idx):
    """Look up the ids of the BERT special tokens and of the ``"O"`` tag.

    Args:
        tokenizer: Object whose ``get_vocab()`` returns a token -> id mapping
            containing ``[PAD]``, ``[SEP]`` and ``[CLS]``.
        tag2idx: Mapping from tag name to class id; must contain ``"O"``.

    Returns:
        Tuple ``(pad_id, sep_id, cls_id, o_label_id)``.
    """
    token_ids = tokenizer.get_vocab()
    pad_id, sep_id, cls_id = (
        token_ids[token] for token in ("[PAD]", "[SEP]", "[CLS]")
    )
    return pad_id, sep_id, cls_id, tag2idx["O"]


def annot_confusion_matrix(valid_tags, pred_tags):
    """
    Render sklearn's `confusion_matrix` as text with label annotations.

    The labels are the sorted union of the tags seen in either list. The
    first output line lists them; each following line is one matrix row
    prefixed with its label.
    """
    tag_names = sorted(set(valid_tags + pred_tags))
    counts = confusion_matrix(valid_tags, pred_tags, labels=tag_names)

    lines = ["\t" + " ".join(tag_names)]
    for tag_name, row in zip(tag_names, counts):
        lines.append(tag_name + "\t\t\t" + str(row))

    return "\n".join(lines)


def flat_accuracy(valid_tags, pred_tags):
    """Return the fraction of positions where the two tag sequences agree."""
    matches = np.array(valid_tags) == np.array(pred_tags)
    return matches.mean()


def _token_predictions(logits, b_labels, b_input_ids, cls_tok, pad_tok, sep_tok):
    """Return (predicted ids, gold ids) as flat numpy arrays, skipping CLS/PAD/SEP positions."""
    preds_mask = (
        (b_input_ids != cls_tok)
        & (b_input_ids != pad_tok)
        & (b_input_ids != sep_tok)
    )
    label_ids = torch.masked_select(b_labels, preds_mask).cpu().numpy()
    # Index with the mask at its full rank. Squeezing it first would drop the
    # batch axis for a batch holding a single sample and break the indexing.
    kept_logits = logits.detach().cpu().numpy()[preds_mask.cpu().numpy()]
    return np.argmax(kept_logits, axis=1), label_ids


def train_and_val_model(
    model,
    tokenizer,
    optimizer,
    epochs,
    idx2tag,
    tag2idx,
    max_grad_norm,
    device,
    train_dataloader,
    valid_dataloader
):
    """Train the model for ``epochs`` epochs, validating after each one.

    Per epoch, every training batch goes through forward, backward, gradient
    clipping and an optimizer step; then the validation set is evaluated
    without gradients. Losses and accuracies are means over batches and are
    printed together with a classification report and a confusion matrix of
    the validation predictions. Positions holding [CLS], [PAD] or [SEP] are
    excluded from all accuracy/report computations.

    Args:
        model: Token-classification model called as
            ``model(input_ids, token_type_ids=None, attention_mask=..., labels=...)``
            whose first two outputs are the loss and the logits.
        tokenizer: Tokenizer used to look up the special token ids.
        optimizer: Optimizer already bound to the model's parameters.
        epochs: Number of epochs to run.
        idx2tag: Mapping from class id to tag name (for the reports).
        tag2idx: Mapping from tag name to class id.
        max_grad_norm: Maximum gradient norm used for clipping.
        device: Device the batches are moved to.
        train_dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        valid_dataloader: Same batch format, used for validation.

    Returns:
        None. All metrics are printed.
    """
    pad_tok, sep_tok, cls_tok, _o_lab = get_special_tokens(tokenizer, tag2idx)

    for _ in trange(epochs, desc="Epoch"):

        # Training loop
        print("Starting training loop.")
        model.train()
        tr_loss, tr_accuracy, nb_tr_steps = 0, 0, 0

        for batch in train_dataloader:
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss, tr_logits = outputs[:2]

            # Backward pass
            loss.backward()

            # Accumulate train loss
            tr_loss += loss.item()
            nb_tr_steps += 1

            # Training accuracy on real (non-special) tokens only
            tr_batch_preds, tr_batch_labels = _token_predictions(
                tr_logits, b_labels, b_input_ids, cls_tok, pad_tok, sep_tok)
            tr_accuracy += flat_accuracy(tr_batch_labels, tr_batch_preds)

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(
                parameters=model.parameters(), max_norm=max_grad_norm
            )

            # Update parameters, then clear gradients for the next batch
            optimizer.step()
            model.zero_grad()

        tr_loss = tr_loss / nb_tr_steps
        tr_accuracy = tr_accuracy / nb_tr_steps

        # Print training loss and accuracy per epoch
        print(f"Train loss: {tr_loss}")
        print(f"Train accuracy: {tr_accuracy}")

        # Validation loop
        print("Starting validation loop.")

        model.eval()
        eval_loss, eval_accuracy, nb_eval_steps = 0, 0, 0
        predictions, true_labels = [], []

        for batch in valid_dataloader:
            b_input_ids = batch['input_ids'].to(device)
            b_input_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)

            with torch.no_grad():
                outputs = model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                )
                tmp_eval_loss, logits = outputs[:2]

            val_batch_preds, val_batch_labels = _token_predictions(
                logits, b_labels, b_input_ids, cls_tok, pad_tok, sep_tok)
            predictions.extend(val_batch_preds)
            true_labels.extend(val_batch_labels)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += flat_accuracy(val_batch_labels, val_batch_preds)
            nb_eval_steps += 1

        # Evaluate loss, acc, conf. matrix, and class. report on devset
        pred_tags = [idx2tag[i] for i in predictions]
        valid_tags = [idx2tag[i] for i in true_labels]
        cl_report = classification_report(valid_tags, pred_tags)
        conf_mat = annot_confusion_matrix(valid_tags, pred_tags)
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_steps

        # Report metrics
        print(f"Validation loss: {eval_loss}")
        print(f"Validation Accuracy: {eval_accuracy}")
        print(f"Classification Report:\n {cl_report}")
        print(f"Confusion Matrix:\n {conf_mat}")


In [None]:
# Colab-only cell: mount Google Drive at /content/drive so the dataset, vocab
# and checkpoint paths used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import numpy as np
from pdfminer.high_level import extract_text



# Newlines and form feeds (page breaks) are each replaced by one space.
_LINE_BREAKS_TO_SPACES = str.maketrans({"\n": " ", "\f": " "})


def preprocess_data(data):
    """Extract the text of a PDF with pdfminer and flatten it onto one line.

    Args:
        data: Whatever ``pdfminer.high_level.extract_text`` accepts as its
            first argument (e.g. a path to the PDF).

    Returns:
        The extracted text with every newline and form feed replaced by a
        single space, so character positions are preserved.
    """
    return extract_text(data).translate(_LINE_BREAKS_TO_SPACES)


def tokenize_resume(text, tokenizer, max_len):
    """Encode raw resume text into model-ready tensors.

    Args:
        text: The resume text.
        tokenizer: Tokenizer exposing ``encode_plus`` with offset mapping.
        max_len: Sequence length requested from the tokenizer; shorter
            encodings are right-padded with zeros up to this length.

    Returns:
        Dict with ``input_ids``, ``token_type_ids`` and ``attention_mask`` as
        1-D ``torch.long`` tensors, plus the tokenizer's ``offset_mapping``
        passed through unchanged.
    """
    encoding = tokenizer.encode_plus(
        text,
        max_length=max_len,
        pad_to_max_length=True,
        return_offsets_mapping=True,
    )

    # Zero-fill whatever the tokenizer left short of max_len.
    zero_fill = [0] * (max_len - len(encoding['input_ids']))

    final_data = {
        field: torch.tensor(encoding[field] + zero_fill, dtype=torch.long)
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }
    final_data['offset_mapping'] = encoding['offset_mapping']
    return final_data


# Tag inventory; a tag's position in this list is its class id.
tags_vals = [
    "UNKNOWN",
    "O",
    "Name",
    "Degree",
    "Skills",
    "College Name",
    "Email Address",
    "Designation",
    "Companies worked at",
    "Graduation Year",
    "Years of Experience",
    "Location",
]
idx2tag = dict(enumerate(tags_vals))

# Tags that are NOT reported at inference time: every tag except "Name".
resticted_lables = [tag for tag in tags_vals if tag != "Name"]


def predict(model, tokenizer, idx2tag, device, test_resume, max_len):
    """Run the token classifier on one resume and return the extracted entities.

    Tokens whose predicted tag is listed in the module-level
    ``resticted_lables`` are ignored. Consecutive kept tokens with the same
    tag are merged into one entity when the next token starts at, or one
    character after, the end of the previous one.

    Args:
        model: Token-classification model; called without labels, so its
            first output is taken as the logits.
        tokenizer: Tokenizer passed on to ``tokenize_resume``.
        idx2tag: Mapping from class id to tag name.
        device: Device the inputs are moved to.
        test_resume: The resume text.
        max_len: Sequence length passed on to ``tokenize_resume``.

    Returns:
        List of dicts with ``entity``, ``start``, ``end`` (character offsets
        into ``test_resume``, end exclusive) and ``text``.
    """
    model.eval()
    data = tokenize_resume(test_resume, tokenizer, max_len)
    input_ids = data['input_ids'].unsqueeze(0).to(device)
    input_mask = data['attention_mask'].unsqueeze(0).to(device)

    # No labels are passed: inference needs only the logits, and a dummy label
    # tensor that does not match the sequence length is at best ignored.
    with torch.no_grad():
        outputs = model(
            input_ids,
            token_type_ids=None,
            attention_mask=input_mask,
        )
    logits = outputs[0]

    label_ids = np.argmax(logits.cpu().numpy(), axis=2)

    entities = []
    for label_id, offset in zip(label_ids[0], data['offset_mapping']):
        curr_start, curr_end = offset[0], offset[1]
        # Zero-width offsets cover no text (special/padding tokens); they must
        # not produce empty entities.
        if curr_start == curr_end:
            continue

        curr_id = idx2tag[int(label_id)]
        if curr_id in resticted_lables:
            continue

        last = entities[-1] if entities else None
        if (last is not None and last['entity'] == curr_id
                and curr_start - last['end'] in (0, 1)):
            # Same tag and (nearly) contiguous: extend the running entity.
            last['end'] = curr_end
        else:
            entities.append(
                {'entity': curr_id, 'start': curr_start, 'end': curr_end})

    for ent in entities:
        ent['text'] = test_resume[ent['start']:ent['end']]
    return entities


In [None]:
import argparse
import numpy as np
import torch
from transformers import BertForTokenClassification, BertTokenizerFast
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam

In [None]:
# Training driver cell: loads the annotated resumes, builds the datasets and
# loaders, fine-tunes BERT for token classification and saves the weights.
#
# Command-line handling is disabled in the notebook; the settings below are
# fixed constants instead.
# parser = argparse.ArgumentParser(description='Train Bert-NER')
# parser.add_argument('-e', type=int, default=5, help='number of epochs')
# parser.add_argument('-o', type=str, default='.',
#                     help='output path to save model state')

# args = parser.parse_args().__dict__

# output_path = args['o']

# Sequence length passed to the tokenizer, epoch count and clipping threshold.
MAX_LEN = 512
EPOCHS = 20
MAX_GRAD_NORM = 1.0
MODEL_NAME = 'bert-base-uncased'
# NOTE(review): the keyword here is `lowercase`; the documented
# BertTokenizerFast argument appears to be `do_lower_case` - confirm it has
# the intended effect with the installed transformers version.
TOKENIZER = BertTokenizerFast('/content/drive/MyDrive/Resume Data/Copy of vocab.txt', lowercase=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): convert_goldparse returns None on failure, which
# trim_entity_spans cannot iterate - a bad path fails here with a TypeError.
data = trim_entity_spans(convert_goldparse('/content/drive/MyDrive/Resume Data/Copy of Resumes.json'))

# `total` is not used in the rest of this cell.
total = len(data)
# Fixed split in file order: first 180 samples train, the remainder validate
# (no shuffling before the split).
train_data, val_data = data[:180], data[180:]

train_d = ResumeDataset(train_data, TOKENIZER, tag2idx, MAX_LEN)
val_d = ResumeDataset(val_data, TOKENIZER, tag2idx, MAX_LEN)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_d)
train_dl = DataLoader(train_d, sampler=train_sampler, batch_size=8)

val_dl = DataLoader(val_d, batch_size=8)

# One output class per entry of tag2idx.
model = BertForTokenClassification.from_pretrained(
    MODEL_NAME, num_labels=len(tag2idx))
model.to(DEVICE)
# True = full fine-tuning (all parameters), not just the classifier head.
optimizer_grouped_parameters = get_hyperparameters(model, True)
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

train_and_val_model(
    model,
    TOKENIZER,
    optimizer,
    EPOCHS,
    idx2tag,
    tag2idx,
    MAX_GRAD_NORM,
    DEVICE,
    train_dl,
    val_dl
)

# Persist only the model weights (state dict) to Drive.
torch.save(
    {
        "model_state_dict": model.state_dict()
    },
    f'/content/drive/MyDrive/Resume Data/model-state.bin',
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Starting training loop.
Train loss: 0.9055620457815088
Train accuracy: 0.7834522757238771
Starting validation loop.


  _warn_prf(average, modifier, msg_start, len(result))
Epoch:   5%|▌         | 1/20 [00:13<04:14, 13.39s/it]

Validation loss: 0.6617066979408264
Validation Accuracy: 0.8316991799301041
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.00      0.00      0.00       208
Companies worked at       0.00      0.00      0.00       236
             Degree       0.00      0.00      0.00       152
        Designation       0.00      0.00      0.00       323
      Email Address       0.67      0.64      0.66      1142
    Graduation Year       0.00      0.00      0.00        20
           Location       0.21      0.03      0.06       175
               Name       1.00      0.38      0.56       190
                  O       0.84      0.98      0.91     12880
             Skills       0.82      0.14      0.24       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.83     16243
          macro avg       0.32      0.20      0.22     16243
       weighted avg       0.78      0.83    

Epoch:  10%|█         | 2/20 [00:26<03:59, 13.28s/it]

Validation loss: 0.442998480796814
Validation Accuracy: 0.8445956919619755
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.39      0.35      0.37       208
Companies worked at       0.68      0.12      0.21       245
             Degree       1.00      0.18      0.30       152
        Designation       0.98      0.14      0.24       314
      Email Address       0.85      0.75      0.79      1142
    Graduation Year       0.00      0.00      0.00        20
           Location       0.85      0.27      0.41       175
               Name       0.74      0.95      0.83       190
                  O       0.89      0.93      0.91     12880
             Skills       0.42      0.60      0.49       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.84     16243
          macro avg       0.62      0.39      0.41     16243
       weighted avg       0.85      0.84     

Epoch:  15%|█▌        | 3/20 [00:39<03:42, 13.09s/it]

Validation loss: 0.4626018822193146
Validation Accuracy: 0.829826163000137
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.48      0.89      0.63       208
Companies worked at       0.50      0.47      0.48       236
             Degree       0.80      0.57      0.66       152
        Designation       0.81      0.51      0.62       323
      Email Address       0.88      0.72      0.79      1142
    Graduation Year       0.00      0.00      0.00        20
           Location       0.72      0.59      0.65       175
               Name       0.87      0.98      0.92       190
                  O       0.93      0.86      0.89     12880
             Skills       0.34      0.78      0.47       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.83     16243
          macro avg       0.57      0.58      0.56     16243
       weighted avg       0.87      0.83     

Epoch:  20%|██        | 4/20 [00:51<03:27, 12.96s/it]

Validation loss: 0.3637231796979904
Validation Accuracy: 0.876437739489627
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.54      0.87      0.67       208
Companies worked at       0.50      0.45      0.47       245
             Degree       0.91      0.61      0.73       152
        Designation       0.86      0.46      0.60       314
      Email Address       0.87      0.74      0.80      1142
    Graduation Year       0.00      0.00      0.00        20
           Location       0.66      0.75      0.70       175
               Name       0.94      0.98      0.96       190
                  O       0.93      0.93      0.93     12880
             Skills       0.52      0.68      0.59       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.88     16243
          macro avg       0.61      0.59      0.59     16243
       weighted avg       0.88      0.88     

Epoch:  25%|██▌       | 5/20 [01:04<03:13, 12.90s/it]

Validation loss: 0.3854489535093307
Validation Accuracy: 0.862582882265199
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.56      0.87      0.68       208
Companies worked at       0.43      0.69      0.53       236
             Degree       0.82      0.81      0.81       152
        Designation       0.73      0.65      0.68       323
      Email Address       0.80      0.79      0.80      1142
    Graduation Year       0.00      0.00      0.00        20
           Location       0.62      0.78      0.69       175
               Name       0.94      0.98      0.96       190
                  O       0.95      0.88      0.91     12880
             Skills       0.47      0.79      0.59       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.86     16243
          macro avg       0.57      0.66      0.61     16243
       weighted avg       0.89      0.86     

Epoch:  30%|███       | 6/20 [01:17<03:00, 12.91s/it]

Validation loss: 0.5148054778575897
Validation Accuracy: 0.8341089205539742
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.55      0.87      0.67       208
Companies worked at       0.50      0.60      0.55       245
             Degree       0.81      0.83      0.82       152
        Designation       0.76      0.65      0.70       314
      Email Address       0.83      0.76      0.79      1142
    Graduation Year       0.00      0.00      0.00        20
           Location       0.66      0.78      0.71       175
               Name       0.96      0.98      0.97       190
                  O       0.94      0.85      0.89     12880
             Skills       0.35      0.84      0.50       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.83     16243
          macro avg       0.58      0.65      0.60     16243
       weighted avg       0.88      0.83    

Epoch:  35%|███▌      | 7/20 [01:30<02:48, 12.98s/it]

Validation loss: 0.3851779639720917
Validation Accuracy: 0.8920879224708111
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.67      0.79      0.72       208
Companies worked at       0.51      0.50      0.50       236
             Degree       0.88      0.86      0.87       152
        Designation       0.81      0.60      0.69       323
      Email Address       0.84      0.77      0.80      1142
    Graduation Year       1.00      0.05      0.10        20
           Location       0.73      0.71      0.72       175
               Name       0.97      0.98      0.98       190
                  O       0.93      0.94      0.93     12880
             Skills       0.60      0.66      0.63       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.89     16243
          macro avg       0.72      0.62      0.63     16243
       weighted avg       0.89      0.89    

Epoch:  40%|████      | 8/20 [01:43<02:34, 12.91s/it]

Validation loss: 0.46301079988479615
Validation Accuracy: 0.878938669140506
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.64      0.79      0.71       208
Companies worked at       0.54      0.47      0.50       245
             Degree       0.98      0.79      0.87       152
        Designation       0.89      0.52      0.65       314
      Email Address       0.87      0.70      0.77      1142
    Graduation Year       0.75      0.15      0.25        20
           Location       0.73      0.74      0.73       175
               Name       0.97      0.98      0.98       190
                  O       0.92      0.93      0.93     12880
             Skills       0.51      0.72      0.60       902
Years of Experience       0.00      0.00      0.00        15

           accuracy                           0.88     16243
          macro avg       0.71      0.62      0.64     16243
       weighted avg       0.89      0.88    

Epoch:  45%|████▌     | 9/20 [01:55<02:21, 12.83s/it]

Validation loss: 0.5326636373996735
Validation Accuracy: 0.8528441615120587
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.59      0.87      0.70       208
Companies worked at       0.42      0.67      0.52       236
             Degree       0.90      0.86      0.88       152
        Designation       0.74      0.62      0.68       323
      Email Address       0.79      0.79      0.79      1142
    Graduation Year       0.33      0.15      0.21        20
           Location       0.70      0.77      0.73       175
               Name       0.96      0.98      0.97       190
                  O       0.94      0.87      0.91     12880
             Skills       0.41      0.79      0.54       902
Years of Experience       0.67      0.27      0.38        15

           accuracy                           0.85     16243
          macro avg       0.68      0.69      0.66     16243
       weighted avg       0.88      0.85    

Epoch:  50%|█████     | 10/20 [02:08<02:07, 12.79s/it]

Validation loss: 0.5577559053897858
Validation Accuracy: 0.8580799844603251
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.56      0.89      0.69       208
Companies worked at       0.45      0.66      0.54       245
             Degree       0.86      0.88      0.87       152
        Designation       0.75      0.62      0.68       314
      Email Address       0.75      0.84      0.79      1142
    Graduation Year       0.26      0.30      0.28        20
           Location       0.71      0.79      0.75       175
               Name       0.94      0.98      0.96       190
                  O       0.95      0.88      0.91     12880
             Skills       0.44      0.72      0.54       902
Years of Experience       0.53      0.53      0.53        15

           accuracy                           0.86     16243
          macro avg       0.65      0.74      0.69     16243
       weighted avg       0.88      0.86    

Epoch:  55%|█████▌    | 11/20 [02:21<01:56, 12.91s/it]

Validation loss: 0.5489378273487091
Validation Accuracy: 0.870209489562846
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.56      0.89      0.69       208
Companies worked at       0.43      0.66      0.52       236
             Degree       0.88      0.89      0.89       152
        Designation       0.79      0.61      0.69       323
      Email Address       0.80      0.79      0.80      1142
    Graduation Year       0.28      0.35      0.31        20
           Location       0.70      0.77      0.73       175
               Name       0.95      1.00      0.98       190
                  O       0.94      0.90      0.92     12880
             Skills       0.49      0.72      0.58       902
Years of Experience       0.80      0.53      0.64        15

           accuracy                           0.87     16243
          macro avg       0.69      0.74      0.70     16243
       weighted avg       0.89      0.87     

Epoch:  60%|██████    | 12/20 [02:34<01:43, 12.90s/it]

Validation loss: 0.5769694089889527
Validation Accuracy: 0.8615741478486371
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.65      0.84      0.74       208
Companies worked at       0.51      0.55      0.53       245
             Degree       0.89      0.89      0.89       152
        Designation       0.81      0.62      0.70       314
      Email Address       0.82      0.77      0.80      1142
    Graduation Year       0.22      0.30      0.26        20
           Location       0.74      0.80      0.77       175
               Name       0.99      0.98      0.98       190
                  O       0.94      0.89      0.91     12880
             Skills       0.42      0.77      0.55       902
Years of Experience       0.83      0.33      0.48        15

           accuracy                           0.86     16243
          macro avg       0.71      0.70      0.69     16243
       weighted avg       0.89      0.86    

Epoch:  65%|██████▌   | 13/20 [02:47<01:30, 12.87s/it]

Validation loss: 0.677402138710022
Validation Accuracy: 0.8528887375328008
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.58      0.88      0.70       208
Companies worked at       0.51      0.41      0.45       236
             Degree       0.88      0.90      0.89       152
        Designation       0.83      0.52      0.64       323
      Email Address       0.79      0.82      0.81      1142
    Graduation Year       0.28      0.40      0.33        20
           Location       0.78      0.72      0.75       175
               Name       0.97      0.99      0.98       190
                  O       0.94      0.87      0.91     12880
             Skills       0.39      0.79      0.52       902
Years of Experience       1.00      0.40      0.57        15

           accuracy                           0.85     16243
          macro avg       0.72      0.70      0.69     16243
       weighted avg       0.88      0.85     

Epoch:  70%|███████   | 14/20 [03:00<01:17, 12.84s/it]

Validation loss: 0.6539891421794891
Validation Accuracy: 0.8538651253566474
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.59      0.88      0.71       208
Companies worked at       0.43      0.60      0.50       245
             Degree       0.88      0.88      0.88       152
        Designation       0.80      0.60      0.68       314
      Email Address       0.78      0.80      0.79      1142
    Graduation Year       0.23      0.35      0.28        20
           Location       0.59      0.79      0.68       175
               Name       0.94      1.00      0.97       190
                  O       0.94      0.88      0.91     12880
             Skills       0.42      0.72      0.53       902
Years of Experience       0.53      0.53      0.53        15

           accuracy                           0.85     16243
          macro avg       0.65      0.73      0.68     16243
       weighted avg       0.88      0.85    

Epoch:  75%|███████▌  | 15/20 [03:13<01:04, 12.85s/it]

Validation loss: 0.663453608751297
Validation Accuracy: 0.8622551423698225
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.58      0.86      0.69       208
Companies worked at       0.40      0.57      0.47       236
             Degree       0.88      0.89      0.88       152
        Designation       0.84      0.52      0.64       323
      Email Address       0.75      0.89      0.82      1142
    Graduation Year       0.24      0.40      0.30        20
           Location       0.69      0.82      0.75       175
               Name       0.94      1.00      0.97       190
                  O       0.95      0.88      0.91     12880
             Skills       0.44      0.74      0.55       902
Years of Experience       0.40      0.53      0.46        15

           accuracy                           0.86     16243
          macro avg       0.65      0.74      0.68     16243
       weighted avg       0.89      0.86     

Epoch:  80%|████████  | 16/20 [03:25<00:51, 12.77s/it]

Validation loss: 0.6642807126045227
Validation Accuracy: 0.8611886041901139
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.63      0.86      0.73       208
Companies worked at       0.44      0.54      0.48       245
             Degree       0.90      0.89      0.90       152
        Designation       0.82      0.60      0.69       314
      Email Address       0.77      0.82      0.79      1142
    Graduation Year       0.27      0.30      0.29        20
           Location       0.70      0.80      0.75       175
               Name       1.00      0.99      0.99       190
                  O       0.94      0.88      0.91     12880
             Skills       0.43      0.77      0.56       902
Years of Experience       0.42      0.53      0.47        15

           accuracy                           0.86     16243
          macro avg       0.67      0.73      0.69     16243
       weighted avg       0.89      0.86    

Epoch:  85%|████████▌ | 17/20 [03:38<00:38, 12.75s/it]

Validation loss: 0.6542452692985534
Validation Accuracy: 0.8650340089828445
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.58      0.88      0.69       208
Companies worked at       0.48      0.58      0.53       236
             Degree       0.89      0.89      0.89       152
        Designation       0.84      0.57      0.68       323
      Email Address       0.85      0.78      0.81      1142
    Graduation Year       0.30      0.35      0.33        20
           Location       0.74      0.77      0.75       175
               Name       0.97      0.99      0.98       190
                  O       0.94      0.89      0.92     12880
             Skills       0.43      0.75      0.54       902
Years of Experience       0.86      0.40      0.55        15

           accuracy                           0.86     16243
          macro avg       0.72      0.71      0.70     16243
       weighted avg       0.89      0.86    

Epoch:  90%|█████████ | 18/20 [03:51<00:25, 12.71s/it]

Validation loss: 0.6656005442142486
Validation Accuracy: 0.8601722273514605
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.60      0.89      0.72       208
Companies worked at       0.49      0.60      0.54       245
             Degree       0.87      0.86      0.86       152
        Designation       0.80      0.55      0.66       314
      Email Address       0.83      0.75      0.78      1142
    Graduation Year       0.17      0.40      0.24        20
           Location       0.67      0.79      0.73       175
               Name       0.98      0.99      0.98       190
                  O       0.94      0.89      0.91     12880
             Skills       0.44      0.79      0.56       902
Years of Experience       0.35      0.40      0.38        15

           accuracy                           0.86     16243
          macro avg       0.65      0.72      0.67     16243
       weighted avg       0.89      0.86    

Epoch:  95%|█████████▌| 19/20 [04:03<00:12, 12.68s/it]

Validation loss: 0.6782584965229035
Validation Accuracy: 0.8606052299969758
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.59      0.88      0.70       208
Companies worked at       0.48      0.54      0.51       236
             Degree       0.74      0.89      0.81       152
        Designation       0.80      0.61      0.69       323
      Email Address       0.79      0.82      0.80      1142
    Graduation Year       0.19      0.40      0.25        20
           Location       0.72      0.81      0.77       175
               Name       0.97      0.98      0.97       190
                  O       0.94      0.88      0.91     12880
             Skills       0.43      0.77      0.55       902
Years of Experience       0.60      0.40      0.48        15

           accuracy                           0.86     16243
          macro avg       0.66      0.73      0.68     16243
       weighted avg       0.89      0.86    

Epoch: 100%|██████████| 20/20 [04:16<00:00, 12.83s/it]

Validation loss: 0.6240813970565796
Validation Accuracy: 0.8770425761358054
Classification Report:
                      precision    recall  f1-score   support

       College Name       0.66      0.81      0.73       208
Companies worked at       0.49      0.47      0.48       245
             Degree       0.87      0.86      0.86       152
        Designation       0.84      0.53      0.65       314
      Email Address       0.80      0.76      0.78      1142
    Graduation Year       0.17      0.35      0.23        20
           Location       0.63      0.81      0.71       175
               Name       0.96      0.98      0.97       190
                  O       0.94      0.91      0.92     12880
             Skills       0.52      0.77      0.62       902
Years of Experience       0.69      0.60      0.64        15

           accuracy                           0.88     16243
          macro avg       0.69      0.71      0.69     16243
       weighted avg       0.89      0.88    




In [None]:
import io
import argparse
import torch
from transformers import BertTokenizerFast, BertForTokenClassification
 
# Inference-time configuration.
# NOTE(review): MAX_LEN and NUM_LABELS presumably have to match the values used
# when the checkpoint was trained — confirm against the training cells.
MAX_LEN = 500
NUM_LABELS = 12
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = 'bert-base-uncased'

# Load the fine-tuned checkpoint directly onto the selected device.
STATE_DICT = torch.load("/content/drive/MyDrive/Resume Data/model-state.bin", map_location=DEVICE)

# `do_lower_case` is the documented BertTokenizerFast argument; the previous
# `lowercase=True` keyword is not part of its signature.
TOKENIZER = BertTokenizerFast("/content/drive/MyDrive/Resume Data/Copy of vocab.txt", do_lower_case=True)

# Build the token-classification head on top of the base model and overwrite
# its weights with the fine-tuned state dict stored under 'model_state_dict'.
model = BertForTokenClassification.from_pretrained(
    MODEL_PATH, state_dict=STATE_DICT['model_state_dict'], num_labels=NUM_LABELS)
model.to(DEVICE)

# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/Android-iOS-Flutter/Shariful Islam.pdf', 'rb') as pdf_file:
    data = io.BytesIO(pdf_file.read())

# preprocess_data / predict / idx2tag are defined elsewhere in this notebook.
resume_text = preprocess_data(data)
entities = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text, MAX_LEN)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Display the entities predicted for the Shariful Islam resume.
entities

[{'end': 4, 'entity': 'Name', 'start': 0, 'text': 'Name'},
 {'end': 24, 'entity': 'Name', 'start': 6, 'text': 'Md. Shariful Islam'}]

In [None]:
import io
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Akash kundu.pdf', 'rb') as pdf_file:
    data1 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text1 = preprocess_data(data1)
entities1 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text1, MAX_LEN)



In [None]:
# Display the entities predicted for the Akash kundu resume.
entities1

[{'end': 5, 'entity': 'Name', 'start': 0, 'text': 'P E R'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Akif Ahmed.pdf', 'rb') as pdf_file:
    data3 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text3 = preprocess_data(data3)
entities3 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text3, MAX_LEN)



In [None]:
# Display the entities predicted for the Akif Ahmed resume.
entities3

[{'end': 129, 'entity': 'Name', 'start': 108, 'text': 'Syed Akif Ahmed Karim'}]

In [None]:
# NOTE(review): this cell reuses the names data3 / resume_text3 / entities3 that
# an earlier cell assigned for a different resume, so the earlier result is
# overwritten once this cell runs — confirm that is intended.

# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Ariful Islam.pdf', 'rb') as pdf_file:
    data3 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text3 = preprocess_data(data3)
entities3 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text3, MAX_LEN)



In [None]:
# Display the current value of entities3.
# NOTE(review): the recorded output for this cell is identical to the output
# recorded for the Akif Ahmed resume; it may be stale — re-run and verify.
entities3

[{'end': 129, 'entity': 'Name', 'start': 108, 'text': 'Syed Akif Ahmed Karim'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Bidhan kumar Bhownick.pdf', 'rb') as pdf_file:
    data4 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text4 = preprocess_data(data4)
entities4 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text4, MAX_LEN)



In [None]:
# Display the entities predicted for the Bidhan kumar Bhownick resume.
entities4

[{'end': 12, 'entity': 'Name', 'start': 11, 'text': 'B'},
 {'end': 44, 'entity': 'Name', 'start': 23, 'text': 'BIDHAN KUMAR BHOWMICK'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Diponkor Roy.pdf', 'rb') as pdf_file:
    data5 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text5 = preprocess_data(data5)
entities5 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text5, MAX_LEN)



In [None]:
# Display the entities predicted for the Diponkor Roy resume.
entities5

[{'end': 12, 'entity': 'Name', 'start': 0, 'text': 'Diponkor Roy'},
 {'end': 224, 'entity': 'Name', 'start': 222, 'text': 'on'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Jannatul Ferdous Meem.pdf', 'rb') as pdf_file:
    data6 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text6 = preprocess_data(data6)
entities6 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text6, MAX_LEN)



In [None]:
# Display the entities predicted for the Jannatul Ferdous Meem resume.
entities6

[{'end': 21, 'entity': 'Name', 'start': 0, 'text': 'Jannatul Ferdous[Meem'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Kamrul Islam.pdf', 'rb') as pdf_file:
    data7 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text7 = preprocess_data(data7)
entities7 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text7, MAX_LEN)



In [None]:
# Display the entities predicted for the Kamrul Islam resume.
entities7

[{'end': 508, 'entity': 'Name', 'start': 500, 'text': '. KAMRUL'},
 {'end': 1678, 'entity': 'Name', 'start': 1676, 'text': 'Ma'},
 {'end': 1685, 'entity': 'Name', 'start': 1682, 'text': 'Ash'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/Kibtia Chowdhury.pdf', 'rb') as pdf_file:
    data8 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text8 = preprocess_data(data8)
entities8 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text8, MAX_LEN)



In [None]:
# Display the entities predicted for the Kibtia Chowdhury resume.
entities8

[{'end': 43, 'entity': 'Name', 'start': 27, 'text': 'KIBTIA CHOWDHURY'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/MARZAHAN_SULTANA_CV_SQAT.pdf', 'rb') as pdf_file:
    data9 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text9 = preprocess_data(data9)
entities9 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text9, MAX_LEN)



In [None]:
# Display the entities predicted for the MARZAHAN_SULTANA_CV_SQAT resume.
entities9

[{'end': 24, 'entity': 'Name', 'start': 3, 'text': 'MARZAHAN SULTANA PILI'}]

In [None]:
# Read the resume PDF into an in-memory buffer; the context manager closes the
# file handle as soon as the bytes have been read.
with open('/content/drive/MyDrive/Applicants Resume/DevOps-IT-QA/MD. ABU MAS-UD SAYEED.pdf', 'rb') as pdf_file:
    data10 = io.BytesIO(pdf_file.read())

# preprocess_data / predict are defined elsewhere in this notebook.
resume_text10 = preprocess_data(data10)
entities10 = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text10, MAX_LEN)



In [None]:
# Print the text of every predicted span longer than 10 characters
# (presumably to drop short spurious matches — confirm the threshold).
# NOTE(review): this filters entities9 although the preceding cell computed
# entities10 — confirm which result was intended.
long_texts = [entity["text"] for entity in entities9 if len(entity["text"]) > 10]
for text in long_texts:
    print(text)

MARZAHAN SULTANA PILI
