In [None]:
# Mount Google Drive at /content/gdrive; the project folder used by the rest of
# this notebook lives under MyDrive on that mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%%capture
pip install transformers

In [None]:
import os
import re
import torch 
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.optim import Adam
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import BertTokenizerFast, BertForTokenClassification, BertTokenizer, BertConfig, GPT2Tokenizer, GPT2TokenizerFast
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader

In [None]:
# Project root on the mounted Drive; passed to NERconfig as `rootdir`, under
# which the Trainer reads and writes Models/BERT_NER/BERT_NER.pth.
ROOT_DIR = "/content/gdrive/MyDrive/ResumeRanker"

In [None]:
def _word_spans(text):
    """Return the ``(start, end)`` character span of every word in ``text.split()``."""
    spans = []
    cursor = 0
    for word in text.split():
        # Only whitespace separates `cursor` from the next word, so the first
        # match at or after `cursor` is exactly that word.
        start = text.index(word, cursor)
        cursor = start + len(word)
        spans.append((start, cursor))
    return spans


def _label_for_span(start, end, entities, default="Empty"):
    """Return the label of the last-listed entity that fully covers ``[start, end)``."""
    for ent_start, ent_end, ent_label in reversed(entities):
        if start >= int(ent_start) and end <= int(ent_end):
            return ent_label
    return default


def extract_data_from_json(filepath):
    """Read a JSON-lines annotation file and derive one label per word.

    Every non-blank line must be a JSON object with a ``content`` string and an
    ``annotation`` entry (a list, or ``null``). Each annotation carries a
    ``label`` (a list whose first item is used, or a scalar) and a ``points``
    list whose first item holds ``start``, ``end`` (treated as inclusive) and
    ``text``. Annotations with an empty label list are ignored.

    Newlines in ``content`` are replaced by spaces, which keeps the character
    offsets valid. Entity spans are shrunk so they neither start nor end on
    whitespace. A word (as produced by ``str.split()``) receives an entity's
    label when the word lies completely inside the entity span; when several
    entities qualify, the one listed last wins. All other words are labelled
    ``"Empty"``.

    Args:
        filepath: Path to the JSON-lines file.

    Returns:
        ``(texts, labels)`` where ``texts[i]`` is the cleaned document and
        ``labels[i]`` is a list with exactly ``len(texts[i].split())`` labels.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    text_dataset = []
    dataset = []
    for line in tqdm(lines,desc='Extracting Data    '):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        text = data['content'].replace("\n", " ")
        entities = []
        for annotation in data['annotation'] or []:
            raw_label = annotation['label']
            if isinstance(raw_label, list):
                if not raw_label:
                    continue
                label = raw_label[0]
            else:
                label = raw_label

            point = annotation['points'][0]
            point_text = point['text']
            # Shift the offsets inwards by the whitespace padding of the
            # annotated snippet.
            lspace = len(point_text) - len(point_text.lstrip())
            rspace = len(point_text) - len(point_text.rstrip())
            point_start = point['start'] + lspace
            point_end = point['end'] - rspace
            # Stored end is exclusive from here on.
            entities.append((point_start, point_end + 1, label))
        dataset.append((text, entities))
        text_dataset.append(text)

    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, entities in tqdm(dataset,desc='Processing Entities'):
        valid_entities = []
        for start, end, label in entities:
            # Clamp first so malformed offsets cannot index past the text.
            valid_start = max(start, 0)
            valid_end = min(end, len(text))
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append((valid_start, valid_end, label))
        cleaned_data.append((text, valid_entities))

    labels = []
    for text, entities in tqdm(cleaned_data,desc='Creating Labels    '):
        # Spans come from the same split() that sizes the label list, so the
        # two can never drift apart (leading, trailing or repeated whitespace).
        labels.append([_label_for_span(word_start, word_end, entities)
                       for word_start, word_end in _word_spans(text)])
    return text_dataset, labels

In [None]:
# Build the dataset path from ROOT_DIR so the Drive location is configured in one place.
json_file_path = os.path.join(ROOT_DIR, "Dataset", "Entity Recognition in Resumes.json")
text, labels = extract_data_from_json(json_file_path)

Extracting Data    : 100%|██████████| 220/220 [00:00<00:00, 8007.11it/s]
Processing Entities: 100%|██████████| 220/220 [00:00<00:00, 36463.56it/s]
Creating Labels    : 100%|██████████| 220/220 [00:01<00:00, 215.54it/s]


In [None]:
# Entity tag -> class index. "Empty" marks words that belong to no entity and is
# also the label used for special and padding positions.
tags_id = {
    "Name": 0,
    "College Name": 1,
    "Degree": 2,
    "Graduation Year": 3,
    "Years of Experience": 4,
    "Companies worked at": 5,
    "Designation": 6,
    "Skills": 7,
    "Location": 8,
    "Email Address": 9,
    "UNKNOWN": 10,
    "Empty": 11,
}

In [None]:
class NERconfig:
    """Settings shared by the dataset and the trainer.

    Attributes:
        maxlen: Length of every model input window, special tokens included.
        batch_size: Batch size handed to the data loaders.
        rootdir: Directory under which model weights are stored.
        tag_id: Mapping from tag name to class index.
        id_tag: Inverse of ``tag_id`` (class index to tag name).
        pretrained: Whether saved weights should be loaded when present.
        use_scheduler: Whether a learning-rate scheduler is used.
        device: ``"cuda"`` when a GPU is visible to torch, otherwise ``"cpu"``.
    """

    def __init__(self, maxlen, batch_size, rootdir, tag_id, pretrained = True, use_scheduler = False):
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.maxlen, self.batch_size = maxlen, batch_size
        self.rootdir = rootdir
        self.pretrained, self.use_scheduler = pretrained, use_scheduler
        self.tag_id = tag_id
        # Reverse lookup used when turning predicted indices back into tag names.
        self.id_tag = {index: tag for tag, index in self.tag_id.items()}

In [None]:
class NER_Dataset(Dataset):
    """Token-classification dataset that cuts each document into fixed-size windows.

    Every document is tokenized without special tokens, its word-level labels
    are expanded to one label per token, and the token/label sequences are
    split into pieces of ``config.maxlen - 2`` tokens. Each piece is wrapped in
    the start/end ids, padded to ``config.maxlen`` and stored as one item.

    Items are ``(inputs, labels)`` where ``inputs`` is a dict with
    ``input_ids`` (int64) and ``attention_mask`` (int32) and ``labels`` is an
    int64 tensor, all of length ``config.maxlen``. Special and padding
    positions are labelled with ``config.tag_id["Empty"]``.

    Args:
        text_data: List of document strings.
        entity_labels: Per document, one tag name per ``str.split()`` word.
        config: Object exposing ``maxlen`` and ``tag_id``.
        tokenizer: Fast Hugging Face tokenizer (offset mappings are required).
    """

    # Ids wrapped around / appended to every window: [CLS], [SEP] and [PAD]
    # in the BERT vocabulary.
    CLS_ID = 101
    SEP_ID = 102
    PAD_ID = 0

    def __init__(self, text_data, entity_labels, config,  tokenizer):
        self.text_data = text_data
        self.entity_labels = entity_labels
        self.tokenizer = tokenizer
        self.config = config
        self.text = [self.tokenizer(i, add_special_tokens=False, return_tensors="pt") for i in self.text_data]
        self.labels = [self.align_label(i,j) for i,j in zip(self.text_data, self.entity_labels)]
        self.X = []
        self.Y = []
        self.restructure_data()

    def __len__(self):
        # One item per window, not per document: a long document contributes
        # several windows and all of them must be reachable by index.
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

    def restructure_data(self):
        """Fill ``self.X`` / ``self.Y`` with padded windows for every document."""
        window = self.config.maxlen - 2
        empty_id = self.config.tag_id["Empty"]
        for encoding, label_ids in zip(self.text, self.labels):
            input_ids = encoding['input_ids'][0]
            attention = encoding['attention_mask'][0]
            if len(label_ids) != input_ids.shape[0]:
                raise ValueError(
                    "label/token count mismatch: {} labels for {} tokens".format(
                        len(label_ids), input_ids.shape[0]))
            label_tensor = torch.tensor(label_ids, dtype=torch.long)
            pieces = zip(input_ids.split(window), attention.split(window), label_tensor.split(window))
            for ids, mask, labs in pieces:
                pad_len = window - ids.shape[0]
                # Build integer tensors directly instead of relying on
                # float promotion followed by a cast back to integers.
                inp_chunk = torch.cat([
                    torch.tensor([self.CLS_ID], dtype=ids.dtype),
                    ids,
                    torch.tensor([self.SEP_ID], dtype=ids.dtype),
                    torch.full((pad_len,), self.PAD_ID, dtype=ids.dtype),
                ])
                msk_chunk = torch.cat([
                    torch.ones(1, dtype=mask.dtype),
                    mask,
                    torch.ones(1, dtype=mask.dtype),
                    torch.zeros(pad_len, dtype=mask.dtype),
                ])
                lab_chunk = torch.cat([
                    torch.tensor([empty_id], dtype=torch.long),
                    labs,
                    torch.full((1 + pad_len,), empty_id, dtype=torch.long),
                ])
                self.X.append({
                    'input_ids': inp_chunk.long(),
                    'attention_mask': msk_chunk.int(),
                })
                self.Y.append(lab_chunk.long())

    @staticmethod
    def _word_ends(text):
        """Return the exclusive end offset of every word in ``text.split()``."""
        ends = []
        cursor = 0
        for word in text.split():
            # Only whitespace lies between `cursor` and the next word.
            cursor = text.index(word, cursor) + len(word)
            ends.append(cursor)
        return ends

    def align_label(self, texts, labels):
        """Expand word-level tag names to one tag id per token.

        Tokens are assigned to whitespace-separated words through their
        character offsets, so the result always has exactly one entry per
        token, including for words the tokenizer maps to an unknown token or
        splits at punctuation.

        Args:
            texts: The document string.
            labels: One tag name per word of ``texts.split()``.

        Returns:
            List of tag ids, one per token of ``texts``.
        """
        encoding = self.tokenizer(texts, add_special_tokens=False, return_offsets_mapping=True)
        offsets = encoding["offset_mapping"]
        word_ends = self._word_ends(texts)
        last_word = len(word_ends) - 1
        label_ids = []
        word_idx = 0
        for span in offsets:
            tok_start = span[0]
            # Offsets are non-decreasing, so the word pointer only moves forward.
            while word_idx < last_word and tok_start >= word_ends[word_idx]:
                word_idx += 1
            label_ids.append(self.config.tag_id[labels[word_idx]])
        return label_ids

In [None]:
class Trainer:
    """Trains, evaluates and runs inference with a token-classification model.

    The model is called as ``model(input_ids, attention_mask, labels)`` and
    must return an object whose item 0 is the loss and item 1 the logits when
    labels are given, and which exposes ``.logits``.

    Reported accuracies are the fraction of matching positions over the whole
    window, i.e. special and padding positions are counted as well.

    Args:
        model: The torch module to train.
        config: Object exposing ``batch_size``, ``device``, ``pretrained``,
            ``use_scheduler``, ``rootdir``, ``maxlen``, ``tag_id``, ``id_tag``.
        optimizer: Optimizer over the model parameters.
        tokenizer: Fast Hugging Face tokenizer (offset mappings are required
            by ``predict``).
        train_dataset: Dataset yielding ``(inputs_dict, labels)`` items.
        test_dataset: Dataset evaluated by ``test()``.
        val_dataset: Optional dataset evaluated after every epoch in ``fit``.
    """

    # Ids wrapped around / appended to every window in predict(): [CLS], [SEP]
    # and [PAD] in the BERT vocabulary.
    CLS_ID = 101
    SEP_ID = 102
    PAD_ID = 0

    def __init__(self, model, config, optimizer, tokenizer, train_dataset, test_dataset,
                 val_dataset = None):

        self.model = model
        self.optimizer = optimizer
        self.config = config
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.train_dataloader = DataLoader(self.train_dataset, batch_size = self.config.batch_size, shuffle = True)
        # Evaluation loaders are only built for datasets that were supplied,
        # so `valid_dataloader is None` really means "no validation set".
        self.valid_dataloader = self._eval_loader(self.val_dataset)
        self.test_dataloader = self._eval_loader(self.test_dataset)
        self.device = self.config.device
        self.best_loss = float('inf')
        self.load_pretrained = self.config.pretrained
        self.use_scheduler = self.config.use_scheduler
        if self.use_scheduler:
            # NOTE(review): `verbose` is reported as deprecated by recent
            # PyTorch releases - confirm against the installed version.
            self.scheduler = StepLR(optimizer, 1, gamma=0.9, verbose=True)

        weights_path = self._weights_path()
        if os.path.exists(weights_path) and self.load_pretrained:
            print("Pretrained Weights found. Loading Pretrained weights...")
            self.load_weights(weights_path)
        self.model.to(self.device)

    def _eval_loader(self, dataset):
        """Return an ordered loader for ``dataset``, or ``None`` when it is ``None``."""
        if dataset is None:
            return None
        # No shuffling: evaluation metrics should not depend on batch composition.
        return DataLoader(dataset, batch_size = self.config.batch_size, shuffle = False)

    def _weights_path(self):
        """Return the checkpoint location below ``config.rootdir``."""
        return os.path.join(self.config.rootdir,"Models","BERT_NER","BERT_NER.pth")

    def _forward_batch(self, X, Y):
        """Run the model on one batch and return ``(loss, accuracy)`` tensors."""
        labels = Y.to(self.device)
        mask = X['attention_mask'].to(self.device)
        input_id = X['input_ids'].to(self.device)
        output = self.model(input_id, mask, labels)
        loss = output[0]
        logits = output[1]
        acc = (logits.argmax(dim=2) == labels).float().mean()
        return loss, acc

    def _evaluate(self, dataloader):
        """Return mean loss and accuracy over ``dataloader`` without tracking gradients."""
        if dataloader is None:
            raise ValueError("no dataset was provided for this evaluation")
        total_loss = 0
        total_acc = 0
        num_batches = 0
        self.model.eval()
        with torch.no_grad():
            for X, Y in tqdm(dataloader):
                loss, acc = self._forward_batch(X, Y)
                total_loss += loss.item()
                total_acc += acc.item()
                num_batches += 1
        self.model.train()
        # max(..., 1) keeps an empty loader from raising; it then reports 0.
        return total_loss / max(num_batches, 1), total_acc / max(num_batches, 1)

    def train_one_epoch(self):
        """Run one optimisation pass over the training data.

        Returns:
            ``(mean_loss, mean_accuracy)`` averaged over batches.
        """
        # predict() leaves the model in eval mode, so switch back explicitly.
        self.model.train()
        train_loss = 0
        train_acc = 0
        num_batches = 0
        for X, Y in tqdm(self.train_dataloader):
            self.optimizer.zero_grad()
            loss, acc = self._forward_batch(X, Y)
            loss.backward()
            self.optimizer.step()
            train_acc += acc.item()
            train_loss += loss.item()
            num_batches += 1
        return train_loss / max(num_batches, 1), train_acc / max(num_batches, 1)

    def valid_one_epoch(self):
        """Return ``(mean_loss, mean_accuracy)`` on the validation set."""
        return self._evaluate(self.valid_dataloader)

    def test(self):
        """Return ``(mean_loss, mean_accuracy)`` on the test set."""
        return self._evaluate(self.test_dataloader)

    def fit(self, num_epochs):
        """Train for ``num_epochs`` epochs, validating after each when possible.

        The weights are saved whenever the training loss improves on the best
        value seen so far. Without a validation set, the validation entries of
        the history are filled with 0.

        Returns:
            Dict with the per-epoch lists ``train_loss``, ``train_accuracy``,
            ``valid_loss`` and ``valid_accuracy``.
        """
        train_losses = []
        train_accuracies = []
        valid_losses = []
        valid_accuracies = []
        has_validation = self.valid_dataloader is not None
        for epoch in range(num_epochs):
            train_loss, train_acc = self.train_one_epoch()
            if has_validation:
                valid_loss, valid_acc = self.valid_one_epoch()
            else:
                valid_loss = 0
                valid_acc = 0
            train_losses.append(train_loss)
            train_accuracies.append(train_acc)
            valid_losses.append(valid_loss)
            valid_accuracies.append(valid_acc)
            if has_validation:
                print("epoch {} | train loss : {} | train acc : {} | valid loss : {} | valid acc : {}".format(epoch, train_loss, train_acc, valid_loss, valid_acc))
            else:
                print("epoch {} | train loss : {} | train acc : {}".format(epoch, train_loss, train_acc))
            if self.use_scheduler:
                self.scheduler.step()
            if train_loss < self.best_loss:
                self.best_loss = train_loss
                self.save_weights()
        return {"train_loss":train_losses, "train_accuracy":train_accuracies, "valid_loss":valid_losses, "valid_accuracy":valid_accuracies}

    def save_weights(self, ):
        """Write the model state dict to the checkpoint path, creating folders as needed."""
        weights_path = self._weights_path()
        os.makedirs(os.path.dirname(weights_path), exist_ok=True)
        torch.save(self.model.state_dict(), weights_path)
        print("weights saved!!")

    def load_weights(self, dir):
        """Load a state dict from the file at ``dir`` onto the configured device."""
        # torch.load unpickles the file; only load checkpoints from a trusted source.
        self.model.load_state_dict(torch.load(dir, map_location=self.device))
        print("weights loaded!!")

    @staticmethod
    def _word_ends(text):
        """Return the exclusive end offset of every word in ``text.split()``."""
        ends = []
        cursor = 0
        for word in text.split():
            # Only whitespace lies between `cursor` and the next word.
            cursor = text.index(word, cursor) + len(word)
            ends.append(cursor)
        return ends

    def predict(self, sentence):
        """Tag ``sentence`` and group consecutive words that share a tag.

        The text is tokenized without special tokens, cut into windows of
        ``config.maxlen - 2`` tokens, and each window is wrapped, padded and
        classified on its own. A word receives the tag predicted for its last
        sub-token. The model is left in eval mode.

        Args:
            sentence: The raw text to analyse.

        Returns:
            Dict mapping every tag name in ``config.tag_id`` to a list of
            phrases; each phrase is a run of adjacent words with that tag,
            joined by single spaces.
        """
        self.model.eval()
        window = self.config.maxlen - 2
        text = self.tokenizer(sentence, add_special_tokens=False, return_tensors="pt")
        offsets = self.tokenizer(sentence, add_special_tokens=False, return_offsets_mapping=True)["offset_mapping"]
        input_id_chunks = text['input_ids'][0].split(window)
        mask_chunks = text['attention_mask'][0].split(window)

        output_entities = []
        with torch.no_grad():
            for ids, msk in zip(input_id_chunks, mask_chunks):
                pad_len = window - ids.shape[0]
                inp_chunk = torch.cat([
                    torch.tensor([self.CLS_ID], dtype=ids.dtype),
                    ids,
                    torch.tensor([self.SEP_ID], dtype=ids.dtype),
                    torch.full((pad_len,), self.PAD_ID, dtype=ids.dtype),
                ]).long()
                msk_chunk = torch.cat([
                    torch.ones(1, dtype=msk.dtype),
                    msk,
                    torch.ones(1, dtype=msk.dtype),
                    torch.zeros(pad_len, dtype=msk.dtype),
                ]).int()
                input_id = inp_chunk.unsqueeze(0).to(self.device)
                mask = msk_chunk.unsqueeze(0).to(self.device)
                logits = self.model(input_id, mask, None).logits
                # Drop the first and last position; every full window then
                # contributes exactly `window` entries, so list index == token index.
                output_entities.extend(logits.argmax(dim=2).tolist()[0][1:-1])

        entities = {}
        for tag in self.config.tag_id:
            entities[tag] = []

        split_tokens = sentence.split()
        if not split_tokens:
            return entities

        # Assign tokens to whitespace words through their character offsets.
        word_ends = self._word_ends(sentence)
        last_word = len(word_ends) - 1
        word_labels = [None] * len(split_tokens)
        word_idx = 0
        for tok_idx, span in enumerate(offsets):
            tok_start = span[0]
            while word_idx < last_word and tok_start >= word_ends[word_idx]:
                word_idx += 1
            # Later sub-tokens overwrite earlier ones: the last one decides.
            word_labels[word_idx] = self.config.id_tag[output_entities[tok_idx]]

        prev_label = None
        for word, curr_label in zip(split_tokens, word_labels):
            if curr_label is None:
                # The tokenizer produced no token for this word.
                continue
            if curr_label != prev_label:
                entities[curr_label].append(word)
            else:
                entities[curr_label][-1] = entities[curr_label][-1] + " " + word
            prev_label = curr_label
        return entities

In [None]:
# Hold out the first 20 documents for validation; the remainder is used for training.
val_text, train_text = text[:20], text[20:]
val_labels, train_labels = labels[:20], labels[20:]

In [None]:
class BERTClass(torch.nn.Module):
    """Wrapper around a ``bert-base-cased`` token-classification model.

    The number of output classes equals the number of entries in
    ``config.tag_id``.
    """

    def __init__(self, config):
        super().__init__()
        num_tags = len(config.tag_id)
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_tags)

    def forward(self, ids, mask, labels):
        """Forward ``ids`` and ``mask`` to the wrapped model, passing ``labels`` by keyword."""
        return self.bert(ids, mask, labels = labels)

In [None]:
# Shared settings: 512-position windows, batches of 8, reuse saved weights when
# present, and enable the learning-rate scheduler.
config = NERconfig(
    maxlen=512,
    batch_size=8,
    rootdir=ROOT_DIR,
    tag_id=tags_id,
    pretrained=True,
    use_scheduler=True,
)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Tokenization and windowing happen eagerly when the datasets are constructed.
train_dataset = NER_Dataset(train_text, train_labels, config, tokenizer)
val_dataset = NER_Dataset(val_text, val_labels, config, tokenizer)

BERTmodel = BERTClass(config)
optimizer = Adam(BERTmodel.parameters(), lr=1e-4)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [None]:
# The validation split is passed as both the test and the validation dataset.
trainer = Trainer(
    model=BERTmodel,
    config=config,
    optimizer=optimizer,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    test_dataset=val_dataset,
    val_dataset=val_dataset,
)

Adjusting learning rate of group 0 to 1.0000e-04.
Pretrained Weights found. Loading Pretrained weights...
weights loaded!!


In [None]:
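# Training is skipped in this run because saved weights were loaded when the
# Trainer was created; uncomment to fine-tune for 20 epochs.
# history = trainer.fit(20)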

In [None]:
# Run entity extraction on a single document (index 5 of the loaded texts).
pred = trainer.predict(text[5])

In [None]:
# Print one line per extracted phrase, skipping words tagged as non-entities.
for tag, phrases in pred.items():
    if tag == "Empty":
        continue
    for phrase in phrases:
        print('{:<20}{:<}'.format(tag, phrase))

Name                Anvitha Rao Automation
College Name        Arizona State University
College Name        Ramaiah Institute
Degree              Masters in Computer Science
Degree              Bachelor of Engineering in Computer Science M S
Degree              of Technology
Companies worked at SAP Labs
Companies worked at SAP
Companies worked at SAP Labs
Skills              (1 year), C++ (Less than 1 year), Hadoop (Less than 1 year),
Skills              (Less than 1 year),
Skills              (Less than 1
Skills              Technical Skills: Programming Languages: C, C++, HTML/CSS, Java, Python, Javascript Technologies: IoT, MySQL, PostgreSQL, D3js, Hadoop and Spark, Gephi
