### Setting up the word-filtering system using the NLTK library by defining the Loader class architecture

In [65]:
import nltk
from nltk.corpus import words
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from gensim.models import Word2Vec
import pandas as pd
import pyarrow.parquet as pq


class Loader:
    """Read a parquet file and build English-word-filtered paragraph/label lists.

    The stages are chained lazily: ``filter_lists`` calls ``parquet_traverse``
    when ``list0`` is empty, and ``parquet_traverse`` calls ``load_parquet``
    when ``df`` has no rows.

    Attributes:
        english_words_set: set of entries from the NLTK ``words`` corpus.
        df: DataFrame read from ``path`` (empty until loaded).
        list0: raw ``[paragraphs, labels]`` pairs, one per traversed row.
        list1: filtered ``[paragraph_tokens, label_tokens]`` pairs.
        path: location of the parquet file.
    """

    def __init__(self, path='./0.parquet'):
        """Download the NLTK ``words`` corpus and initialise empty state.

        Args:
            path: parquet file to read when data is first needed.
        """
        nltk.download("words")
        self.english_words_set = set(words.words())
        self.df = pd.DataFrame()
        self.list0 = []
        self.list1 = []
        self.path = path

    def _keep_english(self, text):
        """Tokenise ``text`` and keep tokens whose lowercase form is a known word."""
        return [
            token
            for token in nltk.word_tokenize(text)
            if token.lower() in self.english_words_set
        ]

    def filter_lists(self):
        """Append one filtered ``[paragraphs, labels]`` pair to ``list1`` per ``list0`` entry.

        Each paragraph string is tokenised and filtered; each label collection
        is joined with spaces, then tokenised and filtered the same way.
        Results are appended, so calling this twice duplicates the entries.
        """
        if not self.list0:
            self.parquet_traverse()
        for entry in self.list0:
            filtered_paragraphs = []
            filtered_labels = []
            # Paragraphs and labels are parallel lists built by parquet_traverse.
            for paragraph, label_words in zip(entry[0], entry[1]):
                filtered_paragraphs.append(self._keep_english(paragraph))
                filtered_labels.append(self._keep_english(" ".join(label_words)))
            self.list1.append([filtered_paragraphs, filtered_labels])

    def load_parquet(self):
        """Read ``self.path`` into ``self.df``.

        Best effort: any exception is printed and ``self.df`` is left unchanged.
        """
        try:
            table = pq.read_table(self.path)
            self.df = table.to_pandas()
        except Exception as x:
            print(x)

    def parquet_traverse(self):
        """Append one ``[paragraphs, labels]`` pair to ``list0`` per DataFrame row.

        Each visited cell is used as a mapping: its keys become the paragraph
        list and each value is converted to a list of labels.
        """
        if len(self.df) == 0:
            self.load_parquet()
        for row in range(len(self.df)):
            # NOTE(review): this reads the diagonal cell (row, row), which raises
            # IndexError once there are more rows than columns. A fixed column
            # may have been intended — confirm against the parquet schema.
            cell = self.df.iloc[row, row]
            paragraphs = list(cell.keys())
            labels = [list(cell[key]) for key in paragraphs]
            self.list0.append([paragraphs, labels])

    
    

In [66]:
# Build the loader for the local parquet file; the constructor also triggers
# the NLTK "words" download.
data_loader = Loader(path ='./0.parquet')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [67]:
# Fill data_loader.list1; because list0 is still empty here, this first loads
# and traverses the parquet file.
data_loader.filter_lists()

In [68]:
# Notebook-level handle on the filtered [paragraph_tokens, label_tokens] pairs.
list1 = data_loader.list1

### Defining the architecture for preparing the data for the model (model-oriented architecture)
#### Included: HuggingFace

In [69]:
# Number of filtered label lists in the first entry of list1.
len(list1[0][1])

28

In [79]:
class PrepareData:
    """Convert token lists and highlighted words into padded DistilBERT tensors.

    Attributes:
        paragraphs: list of token lists, one per paragraph.
        highlighted_words: list of word lists, parallel to ``paragraphs``.
        tokenizer: the ``distilbert-base-uncased`` tokenizer.
        input_ids_tensor: stacked input ids (empty until ``prepare_tensors``).
        attention_masks_tensor: stacked attention masks (empty until prepared).
        labels_tensor: stacked per-position 0/1 labels (empty until prepared).
    """

    def __init__(self, paragraphs=None, highlighted_words=None):
        """Store the inputs and load the tokenizer.

        A class body keeps only the last ``__init__`` it defines, so one
        constructor with optional arguments covers both the empty and the
        populated case.

        Args:
            paragraphs: list of token lists; defaults to an empty list.
            highlighted_words: list of word lists; defaults to an empty list.
        """
        self.paragraphs = [] if paragraphs is None else paragraphs
        self.highlighted_words = [] if highlighted_words is None else highlighted_words
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.input_ids_tensor = torch.tensor([])
        self.attention_masks_tensor = torch.tensor([])
        self.labels_tensor = torch.tensor([])

    def prepare_tensors(self, max_length=128):
        """Tokenise every paragraph and build the id, mask and label tensors.

        For each paragraph the tokens are joined with spaces and encoded to a
        fixed ``max_length`` (padded and truncated). A label position is set
        to 1 where the input id equals any id produced by encoding a
        highlighted word that also appears among the paragraph's tokens.

        When there are no paragraphs the three tensors are left unchanged.

        Args:
            max_length: padded/truncated sequence length per paragraph.
        """
        input_ids = []
        attention_masks = []
        labels = []
        for paragraph_list, highlighted_list in zip(self.paragraphs, self.highlighted_words):
            paragraph = " ".join(paragraph_list)
            tokenized_inputs = self.tokenizer.encode_plus(
                paragraph,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            # Drop only the batch axis so the row stays 1-D for any max_length.
            ids = tokenized_inputs['input_ids'].squeeze(0)
            input_ids.append(ids)
            attention_masks.append(tokenized_inputs['attention_mask'].squeeze(0))

            # Tokenise the paragraph once instead of once per highlighted word.
            paragraph_tokens = set(self.tokenizer.tokenize(paragraph))
            ids_np = ids.numpy()
            # np.zeros yields float64, which carries through to labels_tensor.
            label = np.zeros(tokenized_inputs['input_ids'].shape[1])
            for word in highlighted_list:
                # NOTE(review): the membership test is case-sensitive while the
                # uncased tokenizer lowercases its tokens, so capitalised or
                # multi-piece words never match — confirm this is intended.
                if word in paragraph_tokens:
                    token_indices = self.tokenizer.encode(word, add_special_tokens=False)
                    label[np.where(np.isin(ids_np, token_indices))] = 1
            labels.append(torch.tensor(label))

        if not input_ids:
            # torch.stack rejects an empty list; keep the current tensors.
            return

        # Stack once after the loop rather than re-stacking every iteration.
        self.input_ids_tensor = torch.stack(input_ids)
        self.attention_masks_tensor = torch.stack(attention_masks)
        self.labels_tensor = torch.stack(labels)

    def reset(self):
        """Clear the stored inputs and tensors; the tokenizer is kept."""
        self.paragraphs = []
        self.highlighted_words = []
        self.input_ids_tensor = torch.tensor([])
        self.attention_masks_tensor = torch.tensor([])
        self.labels_tensor = torch.tensor([])

In [71]:
# NOTE(review): labels_tensor and attention_masks_tensor are not assigned in any
# cell shown here — presumably left over from an earlier kernel run; confirm.
# input_ids_tensor,
# attention_masks_tensor,  
len(labels_tensor),attention_masks_tensor

(28,
 tensor([[1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]))

In [72]:
# NOTE(review): input_ids_tensor is not assigned in any cell shown here —
# presumably kernel state from an earlier run; confirm.
input_ids_tensor

tensor([[ 101,  102,    0,  ...,    0,    0,    0],
        [ 101, 5237, 3330,  ...,    0,    0,    0],
        [ 101, 4162, 2671,  ...,    0,    0,    0],
        ...,
        [ 101, 5193, 3665,  ...,    0,    0,    0],
        [ 101,  102,    0,  ...,    0,    0,    0],
        [ 101, 8417, 2974,  ...,    0,    0,    0]])

In [73]:
# NOTE(review): paragraphs is not assigned in any cell shown here — presumably
# kernel state from an earlier run; confirm.
paragraphs

[[],
 ['Agriculture',
  'engineering',
  'Air',
  'pollution',
  'dispersion',
  'terminology',
  'Engineering',
  'Fuel',
  'cell',
  'equipment',
  'Heating',
  'ventilation',
  'Metalworking',
  'terminology',
  'Mill',
  'machinery',
  'Telephony',
  'Textile',
  'Woodworking',
  'Joinery'],
 ['Applied',
  'science',
  'is',
  'the',
  'application',
  'of',
  'knowledge',
  'from',
  'one',
  'or',
  'more',
  'natural',
  'scientific',
  'to',
  'practical',
  'For',
  'example',
  'of',
  'engineering',
  'are',
  'applied',
  'Applied',
  'science',
  'is',
  'important',
  'for',
  'technology',
  'development',
  'Its',
  'use',
  'in',
  'industrial',
  'is',
  'usually',
  'to',
  'as',
  'research',
  'and',
  'development',
  'R',
  'D'],
 ['Architecture', 'and', 'Construction'],
 ['Artificial',
  'intelligence',
  'Computer',
  'Computer',
  'science',
  'and',
  'IT',
  'and',
  'IT',
  'and',
  'data',
  'Alternative',
  'for',
  'free',
  'Unified',
  'Modeling',
  'L

In [64]:
# Decode a single token id back to text.
# NOTE(review): tokenizer is not assigned in any cell shown here, and this
# cell's execution count (64) predates the class definitions — confirm.
tokenizer.decode([5237])

'agriculture'

In [78]:
# Label row at index 2.
# NOTE(review): labels_tensor is not assigned in any cell shown here —
# presumably kernel state from an earlier run; confirm.
labels_tensor[2]

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=torch.float64)

In [80]:
# Wrap the first filtered entry: list1[0][0] holds the token list per
# paragraph, list1[0][1] the matching filtered label words.
d = PrepareData(list1[0][0],list1[0][1])

In [81]:
# Tokenise the paragraphs and fill d.input_ids_tensor,
# d.attention_masks_tensor and d.labels_tensor.
d.prepare_tensors()

In [82]:
# Label row for the paragraph at index 2; positions set to 1 matched a
# highlighted word's token id.
d.labels_tensor[2]

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], dtype=torch.float64)