In [1]:
from transformers import BertModel,BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


#### Creating BERT Neural Network

In [2]:
import torch 
import torch.nn as nn

In [3]:
class ToxicityModel(nn.Module):
    """Encoder plus a small classification head producing one score per label.

    The head takes the hidden state at the first token position (the [CLS]
    token when the inputs come from a BERT tokenizer), projects it down to
    ``head_dim`` features, applies dropout and maps the result to
    ``num_labels`` raw scores. No activation is applied to the output.

    Args:
        bert_model: Encoder module. It is called as
            ``bert_model(**kwargs, return_dict=False)`` and must return a
            tuple whose first element holds the per-token hidden states.
        hidden_size: Width of the encoder hidden states. When ``None`` it is
            read from ``bert_model.config.hidden_size`` and falls back to 768
            if the encoder exposes no such attribute.
        head_dim: Width of the intermediate projection (default 256).
        num_labels: Number of output scores (default 6).
        dropout: Dropout probability applied after the projection (default 0.2).
    """

    def __init__(self, bert_model, hidden_size=None, head_dim=256, num_labels=6, dropout=0.2):
        super().__init__()

        self.bert_model = bert_model

        ## Derive the encoder width from its config instead of assuming 768
        if hidden_size is None:
            encoder_config = getattr(bert_model, "config", None)
            hidden_size = getattr(encoder_config, "hidden_size", 768)

        self.l1 = nn.Linear(hidden_size, head_dim)  ## Reducing the Vector Dimension
        self.dropout = nn.Dropout(dropout)

        ## ['target','severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
        self.toxicity = nn.Linear(head_dim, num_labels)  ## one score per class

        self.bert_model.train()  ## Setting up bert model on training mode by default

    def forward(self, **kwargs):
        """Return a ``(batch, num_labels)`` tensor of raw scores.

        All keyword arguments are forwarded unchanged to the encoder.
        """
        ## Index instead of unpacking: the returned tuple can hold more than
        ## two entries (e.g. when hidden states or attentions are requested).
        encoder_outputs = self.bert_model(**kwargs, return_dict=False)
        token_states = encoder_outputs[0]

        first_token_state = token_states[:, 0, :]
        projected = self.dropout(self.l1(first_token_state))

        return self.toxicity(projected)
        

In [4]:
## Load the pretrained BERT encoder from the local checkpoint directory
bert_model = BertModel.from_pretrained(
    pretrained_model_name_or_path="../bert_model",
)

In [8]:
## Wrap the loaded encoder with the toxicity classification head
model = ToxicityModel(bert_model)

In [9]:
## Load the matching tokenizer; `do_lower_case` is the option name that
## BertTokenizer recognises for lowercasing the text before tokenization.
tokenizer = BertTokenizer.from_pretrained("../bert_model", do_lower_case=True)

In [10]:
## Encode one sample sentence as PyTorch tensors, padded to 128 tokens.
## `truncation=True` makes `max_length` an enforced upper bound as well,
## matching how ToxicityDataset calls the tokenizer.
text = tokenizer(
    "Hello! How are you!",
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)

In [11]:
## Smoke test: run one forward pass, feeding the tokenizer output as keyword arguments
out = model(**text)

In [18]:
## One input sentence must yield one row with six scores
assert tuple(out.shape) == (1, 6)

### Data Loading

In [20]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [35]:
class ToxicityDataset(Dataset):
    """Map-style dataset yielding one tokenized comment and its six labels.

    Rows are read from a CSV file that must provide a ``comment_text`` column
    plus every column named in ``LABEL_COLUMNS``.

    Args:
        data_path: Path of the CSV file, handed to ``pandas.read_csv``.
        tokenizer: Callable used to encode each comment. It is invoked with
            ``padding="max_length"``, ``max_length``, ``truncation=True`` and
            ``return_tensors="pt"`` and must return a mapping of tensors.
        max_length: Sequence length passed to the tokenizer (default 128).
    """

    ## Column order of the entries in the returned ``labels`` tensor
    LABEL_COLUMNS = (
        "target_label",
        "severe_toxicity",
        "obscene",
        "identity_attack",
        "insult",
        "threat",
    )

    def __init__(self, data_path, tokenizer, max_length=128):
        ## Initializing some variables in the constructor
        self.data = pd.read_csv(data_path)
        self.tokenizer = tokenizer
        ## Honour the caller-supplied length instead of a fixed 128
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{"input": <dict of tensors>, "labels": <long tensor>}`` for row ``idx``."""
        ## Accessing the single item
        item = self.data.iloc[idx]

        ## The input comment text
        comment_text = item["comment_text"]

        ## Tokenizing with the tokenizer given to this dataset, so the result
        ## never depends on a notebook-global variable of the same name
        encoded = self.tokenizer(
            comment_text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )

        ## Reducing a dimension for each key (drops the leading size-1 axis)
        input_tensors = {key: value.squeeze(0) for key, value in encoded.items()}

        ## Processing the output labels, in LABEL_COLUMNS order
        label_values = [item[column] for column in self.LABEL_COLUMNS]
        labels = torch.tensor(label_values).long()

        ## returning the result
        return {"input": input_tensors, "labels": labels}

In [36]:
## Build the dataset on the training split for a quick sanity check
unittest_dataset = ToxicityDataset(
    data_path="../data/train_split.csv",
    tokenizer=tokenizer,
    max_length=128,
)

In [37]:
## Fetch the first sample of the dataset.
## NOTE(review): this rebinds `out`, which an earlier cell used for the model output.
out = unittest_dataset[0]

In [41]:
## The token ids of a single sample must be a flat vector of 128 entries
assert tuple(out["input"]["input_ids"].shape) == (128,), \
    "Incorrect Max length generated from Dataloader"

In [44]:
## Each sample must carry exactly six label values
assert tuple(out["labels"].shape) == (6,), \
    "Incorrect Number of labels generated from the Dataloader"

In [46]:
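# Disabled cell, kept for reference only: when it was run, this call produced the OSError shown below.
# model = BertModel.from_pretrained("../bert_model/",)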

OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ../bert_model/.