<a href="https://colab.research.google.com/github/priyaaa705/BE/blob/master/Priya_Roberta_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [139]:
!pip install seqeval
!pip install pytorch_pretrained_bert
!pip install transformers



In [140]:
import pandas as pd
import logging
import re
import json
import numpy as np
import torch
from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from transformers import RobertaTokenizerFast, RobertaForTokenClassification
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

In [141]:
df = pd.read_json('Entity Recognition in Resumes.json', lines=True)

In [142]:
df.head()

Unnamed: 0,content,annotation,extras
0,Abhishek Jha\nApplication Development Associat...,"[{'label': ['Skills'], 'points': [{'start': 12...",
1,Afreen Jamadar\nActive member of IIIT Committe...,"[{'label': ['Email Address'], 'points': [{'sta...",
2,"Akhil Yadav Polemaina\nHyderabad, Telangana - ...","[{'label': ['Skills'], 'points': [{'start': 37...",
3,Alok Khandai\nOperational Analyst (SQL DBA) En...,"[{'label': ['Skills'], 'points': [{'start': 80...",
4,Ananya Chavan\nlecturer - oracle tutorials\n\n...,"[{'label': ['Degree'], 'points': [{'start': 20...",


In [143]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content'].replace("\n", " ")
            entities = []
            data_annotations = data['annotation']
            if data_annotations is not None:
                for annotation in data_annotations:
                   
                    point = annotation['points'][0]
                    labels = annotation['label']
                   
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        point_start = point['start']
                        point_end = point['end']
                        point_text = point['text']
                        
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        if lstrip_diff != 0:
                            point_start = point_start + lstrip_diff
                        if rstrip_diff != 0:
                            point_end = point_end - rstrip_diff
                        entities.append((point_start, point_end + 1 , label))
            training_data.append((text, {"entities" : entities}))
        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

def trim_entity_spans(data: list) -> list:
  invalid_span_tokens = re.compile(r'\s')

  cleaned_data = []
  for text, annotations in data:
      entities = annotations['entities']
      valid_entities = []
      for start, end, label in entities:
          valid_start = start
          valid_end = end
          while valid_start < len(text) and invalid_span_tokens.match(
                  text[valid_start]):
              valid_start += 1
          while valid_end > 1 and invalid_span_tokens.match(
                  text[valid_end - 1]):
              valid_end -= 1
          valid_entities.append([valid_start, valid_end, label])
      cleaned_data.append([text, {'entities': valid_entities}])
  return cleaned_data

In [144]:
data = trim_entity_spans(convert_dataturks_to_spacy('Entity Recognition in Resumes.json'))

In [145]:
def get_label(offset, labels):
    if offset[0] == 0 and offset[1] == 0:
        return 'O'
    for label in labels:
        if offset[1] >= label[0] and offset[0] <= label[1]:
            return label[2]
    return 'O'

In [146]:
tags_vals = ["O", "Name", "Degree","Skills","College Name","Email Address","Designation","Companies worked at","Graduation Year","Years of Experience","Location"]
tag2idx = {t: i for i, t in enumerate(tags_vals)}
idx2tag = {i:t for i, t in enumerate(tags_vals)}

In [147]:
def process_resume(data, tokenizer, tag2idx, max_len, is_test=False):
    tok = tokenizer.encode_plus(data[0], max_length=max_len, return_offsets_mapping=True, truncation=True)
    curr_sent = {'orig_labels':[], 'labels': []}
    
    padding_length = max_len - len(tok['input_ids'])
    
    if not is_test:
        labels = data[1]['entities']
        labels.reverse()
        for off in tok['offset_mapping']:
            label = get_label(off, labels)
            curr_sent['orig_labels'].append(label)
            curr_sent['labels'].append(tag2idx[label])
        curr_sent['labels'] = curr_sent['labels'] + ([0] * padding_length)
    
    curr_sent['input_ids'] = tok['input_ids'] + ([0] * padding_length)
    curr_sent['attention_mask'] = tok['attention_mask'] + ([0] * padding_length)
    return curr_sent


In [148]:
class ResumeDataset(Dataset):
    def __init__(self, resume, tokenizer, tag2idx, max_len, is_test=False):
        self.resume = resume
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.tag2idx = tag2idx
        self.max_len = max_len
    
    def __len__(self):
        return len(self.resume)
    
    def __getitem__(self, idx):
        data = process_resume(self.resume[idx], self.tokenizer, self.tag2idx, self.max_len, self.is_test)
        return {
            'input_ids': torch.tensor(data['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(data['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(data['labels'], dtype=torch.long),
            'orig_label': data['orig_labels']
        }

In [149]:
def get_hyperparameters(model, ff):

    
  if ff:
      param_optimizer = list(model.named_parameters())
      no_decay = ["bias", "gamma", "beta"]
      optimizer_grouped_parameters = [
          {
              "params": [
                  p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
              ],
              "weight_decay_rate": 0.01,
          },
          {
              "params": [
                  p for n, p in param_optimizer if any(nd in n for nd in no_decay)
              ],
              "weight_decay_rate": 0.0,
          },
      ]
  else:
      param_optimizer = list(model.classifier.named_parameters())
      optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

  return optimizer_grouped_parameters

In [150]:
def get_special_tokens(tokenizer, tag2idx):
  vocab = tokenizer.get_vocab()
  #pad_tok = vocab["[PAD]"]
  #sep_tok = vocab["[SEP]"]
  #cls_tok = vocab["[CLS]"]
  o_lab = tag2idx["O"]

  return o_lab

In [151]:
def annot_confusion_matrix(valid_tags, pred_tags):

  
 
    header = sorted(list(set(valid_tags + pred_tags)))

    matrix = confusion_matrix(valid_tags, pred_tags, labels=header)

    mat_formatted = [header[i] + "\t\t\t" + str(row) for i, row in enumerate(matrix)]
    content = "\t" + " ".join(header) + "\n" + "\n".join(mat_formatted)

    return content

In [152]:
def flat_accuracy(valid_tags, pred_tags):
  return (np.array(valid_tags) == np.array(pred_tags)).mean()

In [153]:
def train_and_save_model(
    model,
    tokenizer,
    optimizer,
    epochs,
    idx2tag,
    tag2idx,
    max_grad_norm,
    device,
    train_dataloader,
    valid_dataloader
):

  o_lab = get_special_tokens(tokenizer, tag2idx)

  epoch = 0
  for _ in trange(epochs, desc="Epoch"):
    epoch += 1

    print("Starting training loop.")
    model.train()
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []

    for step, batch in enumerate(train_dataloader):
        
        b_input_ids, b_input_mask, b_labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        b_input_ids, b_input_mask, b_labels = b_input_ids.to(device), b_input_mask.to(device), b_labels.to(device)

        
        outputs = model(
            b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss, tr_logits = outputs[:2]

        
        loss.backward()

      
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        
        preds_mask = (
            (b_input_ids != o_lab)
            #& (b_input_ids != pad_tok)
            #& (b_input_ids != sep_tok)
        )
        #preds_mask = 1
        tr_logits = tr_logits.cpu().detach().numpy()
        tr_label_ids = torch.masked_select(b_labels, (preds_mask==1))
        preds_mask = preds_mask.cpu().detach().numpy()
        tr_batch_preds = np.argmax(tr_logits[preds_mask.squeeze()], axis=1)
        tr_batch_labels = tr_label_ids.to("cpu").numpy()
        tr_preds.extend(tr_batch_preds)
        tr_labels.extend(tr_batch_labels)

    
        tmp_tr_accuracy = flat_accuracy(tr_batch_labels, tr_batch_preds)
        tr_accuracy += tmp_tr_accuracy

        
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )

        
        optimizer.step()
        model.zero_grad()

    tr_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps

    
    print(f"Train loss: {tr_loss}")
    print(f"Train accuracy: {tr_accuracy}")
    
    

    print("Starting validation loop.")

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []

    for batch in valid_dataloader:

        b_input_ids, b_input_mask, b_labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        b_input_ids, b_input_mask, b_labels = b_input_ids.to(device), b_input_mask.to(device), b_labels.to(device)

        with torch.no_grad():
            outputs = model(
                b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            tmp_eval_loss, logits = outputs[:2]

        
        preds_mask = (
            (b_input_ids != o_lab)
            #& (b_input_ids != pad_tok)
            #& (b_input_ids != sep_tok)
        )

        logits = logits.cpu().detach().numpy()
        label_ids = torch.masked_select(b_labels, (preds_mask == 1))
        preds_mask = preds_mask.cpu().detach().numpy()
        val_batch_preds = np.argmax(logits[preds_mask.squeeze()], axis=1)
        val_batch_labels = label_ids.to("cpu").numpy()
        predictions.extend(val_batch_preds)
        true_labels.extend(val_batch_labels)

        tmp_eval_accuracy = flat_accuracy(val_batch_labels, val_batch_preds)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1

    
    pred_tags = [idx2tag[i] for i in predictions]
    valid_tags = [idx2tag[i] for i in true_labels]
    cl_report = classification_report(valid_tags, pred_tags)
    conf_mat = annot_confusion_matrix(valid_tags, pred_tags)
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps

    
    #print(f"Validation loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    #print(f"Classification Report:\n {cl_report}")
    #print(f"Confusion Matrix:\n {conf_mat}")

  return(pred_tags,valid_tags,eval_accuracy)


In [154]:
MAX_LEN = 500
EPOCHS = 5
MODEL_PATH = '../input/roberta-base'
TOKENIZER = RobertaTokenizerFast.from_pretrained('roberta-base',lowercase=True)
#DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [160]:
train_data = data[:132] + data[176:]
val_data = data[132:176]
total = len(data)
MAX_GRAD_NORM = 1.0

train_d = ResumeDataset(train_data, TOKENIZER, tag2idx, MAX_LEN)
val_d = ResumeDataset(val_data, TOKENIZER, tag2idx, MAX_LEN)

train_sampler = RandomSampler(train_d)
train_dl = DataLoader(train_d, sampler=train_sampler, batch_size=8)
val_dl = DataLoader(val_d, batch_size=4)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=len(tag2idx))
model.to(DEVICE);
optimizer_grouped_parameters = get_hyperparameters(model, True)
optimizer = Adam(optimizer_grouped_parameters,lr=3e-5)

pred_tags, valid_tags,eval_accuracy = train_and_save_model(
  model, 
  TOKENIZER, 
  optimizer, 
  EPOCHS, 
  idx2tag, 
  tag2idx, 
  MAX_GRAD_NORM, 
  DEVICE, 
  train_dl, 
  val_dl
)

#oos_y.append(valid_tags)
#oos_pred.append(pred_tags)
#pred = np.argmax(pred_tags)
#y_compare = np.argmax(valid_tags)
#score = metrics.accuracy_score(y_compare,pred)
print(f'fold score (accuracy): {eval_accuracy}\n\n')

- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Starting training loop.
Train loss: 0.9440876651893962
Train accuracy: 0.7627678486349393
Starting validation loop.


Epoch:  20%|██        | 1/5 [00:18<01:15, 18.85s/it]

Validation Accuracy: 0.865280705035885
Starting training loop.
Train loss: 0.43012277307835495
Train accuracy: 0.8646527898961441
Starting validation loop.


Epoch:  40%|████      | 2/5 [00:37<00:56, 18.92s/it]

Validation Accuracy: 0.9028813118636015
Starting training loop.
Train loss: 0.2747299535707994
Train accuracy: 0.9116976844227905
Starting validation loop.


Epoch:  60%|██████    | 3/5 [00:57<00:38, 19.04s/it]

Validation Accuracy: 0.9137785403875239
Starting training loop.
Train loss: 0.2526176382194866
Train accuracy: 0.9125694614498517
Starting validation loop.


Epoch:  80%|████████  | 4/5 [01:16<00:19, 19.14s/it]

Validation Accuracy: 0.9228345228160136
Starting training loop.
Train loss: 0.18643423508514056
Train accuracy: 0.9379778606464387
Starting validation loop.


Epoch: 100%|██████████| 5/5 [01:36<00:00, 19.22s/it]

Validation Accuracy: 0.9256204278805732
fold score (accuracy): 0.9256204278805732







In [166]:
from google.colab import drive
drive.mount('/content/drive')
drive.mount("/content/drive", force_remount=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Mounted at /content/drive


In [167]:
path = '/content/drive/My Drive/'
torch.save(
    {
        "epoch": EPOCHS,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    },
    path + 'model_e6.tar',
)

In [168]:
model = torch.load('model_e6.tar')