In [8]:
class LangModelWithDense(nn.Module):
    """Language-model encoder followed by a three-layer dense head with a sigmoid output.

    Args:
        lang_model: Encoder object. It must expose ``resize_token_embeddings(n)``
            and be callable as ``lang_model(x, attention_mask=mask)``, returning a
            sequence whose first element is the tensor that gets pooled over dim 1.
        vocab_size: Base size passed to ``resize_token_embeddings``.
        input_size: Width of the pooled encoder output fed to the first linear layer.
        hidden_size: Width of the two hidden dense layers.
        fine_tune: When true, the encoder runs with gradients available, two extra
            embedding rows are requested and dropout is 0.8; otherwise the encoder
            runs under ``torch.no_grad()`` in eval mode and dropout is 0.1.
    """

    def __init__(self, lang_model, vocab_size, input_size, hidden_size, fine_tune):
        super().__init__()

        # Fine-tuning asks for two embedding rows on top of vocab_size
        # (presumably for the "<link>" / "<equation>" placeholders - confirm).
        extra_rows = 2 if fine_tune else 0
        self.lang_model = lang_model
        self.lang_model.resize_token_embeddings(vocab_size + extra_rows)

        # Both dropout layers share the same probability.
        drop_prob = 0.8 if fine_tune else 0.1

        # Attribute names are part of the state_dict keys; keep them stable.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.dropout1 = nn.Dropout(drop_prob)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.dropout2 = nn.Dropout(drop_prob)
        self.linear3 = nn.Linear(hidden_size, 1)

        self.fine_tune = fine_tune

    def forward(self, x, mask):
        """Encode ``x``, pool over dim 1 and return the sigmoid of the dense head.

        Args:
            x: Input passed positionally to the language model.
            mask: Forwarded to the language model as ``attention_mask``.

        Returns:
            Tensor produced by ``linear3`` followed by a sigmoid (last dim of size 1).
        """
        if not self.fine_tune:
            # Frozen encoder: no gradients, and force eval mode on every call.
            with torch.no_grad():
                self.lang_model.eval()
                token_states = self.lang_model(x, attention_mask=mask)[0]
        else:
            token_states = self.lang_model(x, attention_mask=mask)[0]

        is_xlnet = "xlnet" in str(type(self.lang_model))
        if is_xlnet:
            # XLNet-type encoders: take the state at position 0.
            pooled = token_states[:, 0, :]
        else:
            # Plain average over dim 1; the mask is not applied to this average.
            pooled = token_states.mean(dim=1)

        hidden = self.linear1(pooled)
        hidden = self.dropout1(F.gelu(hidden))
        hidden = self.linear2(hidden)
        hidden = self.dropout2(F.gelu(hidden))
        logits = self.linear3(hidden)

        return torch.sigmoid(logits)

In [9]:
# Literal substitutions applied after the regex rules; order matters because
# later pairs operate on the output of earlier ones.
_SPACING_FIXES = (
    (" .", "."),
    (" ,", ","),
    (" ?", "?"),
    (" - ", "-"),
    ("( ", "("),
    (" )", ")"),
    (" & ", "&"),
    (" ;", ";"),
    (" '", "'"),
    (" :", ":"),
    (" $", "$"),
    (" %", "%"),
)


def _clean_sentence(text, link_token, equation_token):
    """Apply every cleaning rule to one sentence, in a fixed order."""
    # Link placeholders such as "[ link ]", "[link]a" or "[ link ] ( b )".
    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\[ \] \( \)), which newer Python versions warn about.
    text = re.sub(r"\[ ?link ?\][a-z]?( \( [a-z] \))?", link_token, text)

    # Strip raw URLs.
    # NOTE(review): "[a-zA-z0-9]" also covers the punctuation between "Z" and
    # "a"; probably meant "A-Z". Left untouched so the cleaned text stays
    # identical to what existing checkpoints saw - confirm before changing.
    text = re.sub(r" ?https?:.+(\)|/|(\.pdf)|(\.PDF)|(\.html)|#| - U |aspx?|-[a-zA-z0-9]+|\.htm|\?.+)", "", text)
    text = re.sub(r"www.+?( |\))", "", text)
    text = text.replace(".  .", ".").replace(". .", ".").replace(", .", ".")

    # Normalise typographic quotes and the dot operator.
    # NOTE(review): the (",", ",") pair maps a comma to itself as written;
    # kept verbatim in case the first character was meant to be non-ASCII.
    text = (
        text.replace("“ ", "\"")
        .replace(" ”", "\"")
        .replace("’", "'")
        .replace("‘", "'")
        .replace(",", ",")
        .replace("⋅", "*")
    )

    # Leftover markup fragments.
    text = re.sub(r" size 12.+}", "", text)
    text = re.sub(r"5 \" MeV/\"c.+}", "", text)
    text = re.sub(r" } { }", "", text)

    # Any whitespace-free run containing +, =, Δ or * counts as an equation.
    text = re.sub(r"[^\s]+(\+|=|Δ|\*){1}[^\s]+", equation_token, text)

    # Leading space, optionally followed by an enumeration prefix like "1 . ".
    text = re.sub(r"^ (\d+ . )?", "", text)

    text = text.replace("do n't", "don't").replace("Do n't", "Don't")

    # Re-attach punctuation that tokenisation separated with spaces.
    for old, new in _SPACING_FIXES:
        text = text.replace(old, new)

    text = re.sub(r"(_ )+", "", text)
    text = text.replace(",\"", "\"")

    return text


def clean_data(df, tokenizer, fine_tune):
    """Clean the sentences stored in column ``0`` of ``df``.

    The frame is modified in place (column ``0`` is overwritten) and the same
    object is returned.

    Args:
        df: DataFrame whose column ``0`` holds the sentences as strings.
        tokenizer: Only read when ``fine_tune`` is false, for ``unk_token``.
        fine_tune: When true, links and equations are replaced with the literal
            placeholders ``"<link>"`` and ``"<equation>"``; otherwise both are
            replaced with ``tokenizer.unk_token``.

    Returns:
        The same ``df`` object with column ``0`` cleaned.
    """
    if fine_tune:
        link_token, equation_token = "<link>", "<equation>"
    else:
        link_token = equation_token = tokenizer.unk_token

    # One pass over the column instead of one pass per rule.
    df[0] = df[0].apply(lambda text: _clean_sentence(text, link_token, equation_token))

    return df

In [10]:
def process_test_sentences(df, tokenizer, fine_tune, device):
    """Clean, tokenize and pad the sentences in column ``0`` of ``df``.

    Args:
        df: DataFrame handed to ``clean_data``; its column ``0`` is tokenized.
        tokenizer: Object providing ``encode(text, add_special_tokens=True)``
            and ``pad_token_id``; also forwarded to ``clean_data``.
        fine_tune: Forwarded to ``clean_data``.
        device: Device both returned tensors are moved to.

    Returns:
        Tuple ``(X, mask)``: the token ids padded with ``tokenizer.pad_token_id``
        and a matching mask holding 1 for real tokens and 0 for padding, both
        batch-first.
    """
    sentences = clean_data(df, tokenizer, fine_tune)[0].values

    # One id tensor per sentence; lengths differ until padding below.
    encoded = [
        torch.tensor(tokenizer.encode(sentences[row], add_special_tokens=True))
        for row in range(sentences.shape[0])
    ]
    # Every real token starts with mask value 1.
    attention = [torch.ones_like(ids) for ids in encoded]

    X = pad_sequence(encoded, batch_first=True, padding_value=tokenizer.pad_token_id)
    mask = pad_sequence(attention, batch_first=True, padding_value=0)

    return X.to(device), mask.to(device)

In [11]:
from transformers import RobertaTokenizer, RobertaModel
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd
import torch
import math
import re
import os

lang_model_name = 'roberta-base'
# Fall back to CPU so the cell still runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = RobertaTokenizer.from_pretrained(lang_model_name)
lang_model = RobertaModel.from_pretrained(lang_model_name)

# 768 is the encoder output width fed to the head, 512 the head's hidden width.
model = LangModelWithDense(lang_model, len(tokenizer), 768, 512, True).to(device)

# map_location lets the checkpoint load onto whichever device is active.
model.load_state_dict(torch.load("./fine-tuned/model-finetuned_new.pth", map_location=device))
# Inference only: without eval() the head's Dropout(0.8) stays active and
# the predictions change from run to run.
model.eval()

# Each row is [sentence, label]; labels are unknown for these samples.
a = [["This NER work is just a small part of the work we are doing at the UN Emerging Technology Lab (ETL) to accelerate the UNs mission.", None],
    ["A one time offline processing creates a mapping from the sets of context independent signatures harvested from BERTs vocabulary to a single descriptor/label.", None],
    ["BERTs cased vocabulary is a mixture of common nouns, proper nouns, subwords and symbols. The resulting set of 21,418 terms a mixture of common nouns and proper nouns serve as descriptors characterizing an entity type.", None],
    ["A tokenizer is responsible for preparing the text inputs as inputs to the transformer model", None],
    ["NERDA is open-sourced and available on the Python Package Index (PyPI). It can be installed with:", None],
    ["We will use the English CoNLL-2003 data set with NER annotations for training and validation of our model.", None],
    ["Recurrent Neural Networks (RNN) are designed to work with sequential data. Sequential data(can be time-series) can be in form of text, audio, video etc.", None],
    ["RNN uses the previous information in the sequence to produce the current output. To understand this better Im taking an example sentence.", None],
    ["The workflow of GRU is same as RNN but the difference is in the operations inside the GRU unit. Lets see the architecture of it.", None],
    ["In the LSTM layer, I used 5 neurons and it is the first layer (hidden layer) of the neural network, so the input_shape is the shape of the input which we will pass.", None],
    ["A Recurrent Neural Network is a network with a loop. It processes information sequentially and the output from every time step is fed back to the network which acts as a sort of memory.", None],
    ["the concatenated vector is passed through a fully connected layer to get a new vector. This vector is passed through the softmax activation.", None]]

test_df = pd.DataFrame(a)
tokens, masks = process_test_sentences(test_df, tokenizer, True, device)

# Call the module itself (not .forward) so hooks run; no autograd graph is
# needed for prediction.
with torch.no_grad():
    output = model(tokens, masks)

# Threshold the sigmoid scores at 0.5 to get 0/1 labels on the CPU.
pred = (output.squeeze(-1) >= 0.5).long().cpu()
print(pred)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0])
