### Setting filter words sytem using NLTK libary by defining Loader class architecture

In [65]:
import nltk
from nltk.corpus import words
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from gensim.models import Word2Vec
import pandas as pd
import pyarrow.parquet as pq


class Loader:
    def __init__(self,path ='./0.parquet'):
        nltk.download("words")
        self.english_words_set = set(words.words())
        self.df = pd.DataFrame()
        self.list0 = []
        self.list1 = []
        self.path = path
        
        
    def filter_lists(self):
        if(len(self.list0)==0):
            self.parquet_traverse()
        for i in range(len(self.list0)):
            paragraph_list0 = self.list0[i][0]
            label_list0 = self.list0[i][1]
            paragraph_list1 = []
            label_list1 = []
            for i in range(len(paragraph_list0)):
                word_tokens = nltk.word_tokenize(paragraph_list0[i])
                dummy_list = [word for word in word_tokens if word.lower() in self.english_words_set]
                paragraph_list1.append(dummy_list)
                word_tokens = nltk.word_tokenize(" ".join(label_list0[i]))
                dummy_list = [word for word in word_tokens if word.lower() in self.english_words_set]
                label_list1.append(dummy_list)
            self.list1.append([paragraph_list1, label_list1])
    
    def load_parquet(self ):
        try:
            table = pq.read_table(self.path)
            self.df = table.to_pandas()
        except Exception as x:
            print(x)
            return
        
    def parquet_traverse(self):
        if(len(self.df)==0):
            self.load_parquet()
        for i in range(len(self.df)):
            dict1 = self.df.iloc[i,i]
            paragraph_list1 = [i for i in dict1.keys()]
            label_list1 = [dict1[i] for i in dict1.keys()]
            label_list1 = [list(i) for i in label_list1]
            self.list0.append([paragraph_list1, label_list1])

    
    

In [66]:
data_loader = Loader(path ='./0.parquet')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [67]:
data_loader.filter_lists()

In [68]:
list1 = data_loader.list1

### Defining architecture for preparing the data for Model (model oriented architecture)
#### Included: HuggingFace

In [69]:
len(list1[0][1])

28

In [169]:
class PrepareData:
    def __init__(self):
        self.paragraphs = []
        self.highlighted_words = []
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.input_ids_tensor = torch.tensor([])
        self.attention_masks_tensor = torch.tensor([])
        self.labels_tensor = torch.tensor([])
    
    # def __init__(self, paragraphs, highlighted_words):
    #     self.paragraphs = paragraphs
    #     self.highlighted_words = highlighted_words
    #     self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    #     self.input_ids_tensor = torch.tensor([])
    #     self.attention_masks_tensor = torch.tensor([])
    #     self.labels_tensor = torch.tensor([])

    def prepare_tensors(self):
        input_ids = []
        attention_masks = []
        labels = []
        for paragraph_list, highlighted_list in zip(list(self.paragraphs), list(self.highlighted_words)):
            paragraph = " ".join(paragraph_list)
            tokenized_inputs = self.tokenizer.encode_plus(
                paragraph,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            input_ids.append(tokenized_inputs['input_ids'].squeeze())
            attention_masks.append(tokenized_inputs['attention_mask'].squeeze())

            label = np.zeros(tokenized_inputs['input_ids'].shape[1])
            for word in highlighted_list:
                if word in self.tokenizer.tokenize(paragraph):
                    token_indices = self.tokenizer.encode(word, add_special_tokens=False)
                    label[np.where(np.isin(tokenized_inputs['input_ids'].squeeze(), token_indices))] = 1
            labels.append(torch.tensor(label))

            self.input_ids_tensor = torch.stack(input_ids)
            self.attention_masks_tensor = torch.stack(attention_masks)
            self.labels_tensor = torch.stack(labels)
    
    def reset(self):
        self.paragraphs = []
        self.highlighted_words = []
        self.input_ids_tensor = torch.tensor([])
        self.attention_masks_tensor = torch.tensor([])
        self.labels_tensor = torch.tensor([])

In [190]:
d = PrepareData()
input_ids = torch.tensor([])
attention_masks = torch.tensor([])
labels = torch.tensor([])
for i in range(len(list1)):
    d.paragraphs = list1[i][0]
    d.highlighted_words = list1[i][1]
    d.prepare_tensors()
    print(d.input_ids_tensor)
    input_ids=torch.cat((input_ids,d.input_ids_tensor),dim=0)
    attention_masks=torch.cat((attention_masks,d.attention_masks_tensor),dim=0)
    labels = torch.cat((labels,d.labels_tensor),dim=0)
    d.reset()

tensor([[ 101,  102,    0,  ...,    0,    0,    0],
        [ 101, 5237, 3330,  ...,    0,    0,    0],
        [ 101, 4162, 2671,  ...,    0,    0,    0],
        ...,
        [ 101, 5193, 3665,  ...,    0,    0,    0],
        [ 101,  102,    0,  ...,    0,    0,    0],
        [ 101, 8417, 2974,  ...,    0,    0,    0]])
tensor([[ 101,  102,    0,  ...,    0,    0,    0],
        [ 101, 2529, 2606,  ...,    0,    0,    0],
        [ 101, 1037, 4677,  ...,    0,    0,    0],
        ...,
        [ 101, 2096, 2116,  ...,    0,    0,    0],
        [ 101, 2096, 2053,  ...,    0,    0,    0],
        [ 101, 2306, 1996,  ...,    0,    0,    0]])
tensor([[  101,   102,     0,  ...,     0,     0,     0],
        [  101, 19061,  2047,  ..., 20648,   102,     0],
        [  101, 14412,  6679,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  2744,  ...,     0,     0,     0],
        [  101,  2802,  1996,  ...,     0,     0,     0],
        [  101,  2096,  6967,  ...,     0,   

In [191]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.BCEWithLogitsLoss() 

num_epochs = 1
num_classes = 1 # ?
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(input_ids, attention_masks)
    last_hidden_state = outputs.last_hidden_state

    classifier_layer = torch.nn.Linear(last_hidden_state.shape[-1], num_classes)
    logits = classifier_layer(last_hidden_state)
    loss = loss_fn(logits.view(-1), labels.view(-1).float())
    loss.backward()
    optimizer.step()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [98]:
len(d.labels_tensor.view(-1).float())

3584

In [133]:
d.input_ids_tensor,d.input_ids_tensor.shape

(tensor([[ 101,  102,    0,  ...,    0,    0,    0],
         [ 101, 5237, 3330,  ...,    0,    0,    0],
         [ 101, 4162, 2671,  ...,    0,    0,    0],
         ...,
         [ 101, 5193, 3665,  ...,    0,    0,    0],
         [ 101,  102,    0,  ...,    0,    0,    0],
         [ 101, 8417, 2974,  ...,    0,    0,    0]]),
 torch.Size([28, 128]))

In [132]:

d.attention_masks_tensor,d.attention_masks_tensor.shape

(tensor([[1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 torch.Size([28, 128]))

In [109]:
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.1824,  0.1586, -0.5024,  ..., -0.1377,  0.4758,  0.2198],
         [ 0.2908,  0.7439,  0.1100,  ..., -0.1500,  0.4059, -0.2830],
         [ 0.5167,  0.4700,  0.0084,  ..., -0.2805,  0.1692,  0.1640],
         ...,
         [ 0.3771,  0.3160,  0.2751,  ..., -0.1222, -0.0376, -0.1603],
         [ 0.2608,  0.2487,  0.0208,  ..., -0.0359, -0.0027,  0.0102],
         [ 0.2529,  0.2428, -0.0207,  ..., -0.0156,  0.0306,  0.0073]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)

In [110]:
last_hidden_state = outputs.last_hidden_state

classifier_layer = torch.nn.Linear(last_hidden_state.shape[-1], num_classes)
logits = classifier_layer(last_hidden_state)

In [126]:
len((logits[0]))

128

In [118]:
logits

tensor(0.2080, grad_fn=<SelectBackward0>)
tensor(-0.1619, grad_fn=<SelectBackward0>)
tensor(-0.2951, grad_fn=<SelectBackward0>)
tensor(-0.0688, grad_fn=<SelectBackward0>)
tensor(-0.1373, grad_fn=<SelectBackward0>)
tensor(-0.1651, grad_fn=<SelectBackward0>)
tensor(-0.1700, grad_fn=<SelectBackward0>)
tensor(-0.3186, grad_fn=<SelectBackward0>)
tensor(-0.3458, grad_fn=<SelectBackward0>)
tensor(-0.3280, grad_fn=<SelectBackward0>)
tensor(-0.3037, grad_fn=<SelectBackward0>)
tensor(-0.3199, grad_fn=<SelectBackward0>)
tensor(-0.3452, grad_fn=<SelectBackward0>)
tensor(-0.4444, grad_fn=<SelectBackward0>)
tensor(-0.2095, grad_fn=<SelectBackward0>)
tensor(-0.3288, grad_fn=<SelectBackward0>)
tensor(-0.0951, grad_fn=<SelectBackward0>)
tensor(-0.1396, grad_fn=<SelectBackward0>)
tensor(-0.0310, grad_fn=<SelectBackward0>)
tensor(-0.0336, grad_fn=<SelectBackward0>)
tensor(-0.0515, grad_fn=<SelectBackward0>)
tensor(-0.0387, grad_fn=<SelectBackward0>)
tensor(-0.0383, grad_fn=<SelectBackward0>)
tensor(-0.10

In [121]:
tokenizer.decode([12945])

'automotive'

In [128]:
len(d.input_ids_tensor[5:6][0])

128

In [None]:
d