In [6]:
from tqdm import tqdm
import numpy as np

In [1]:
# BanglaBert Feature Extraction

import torch
from transformers import ElectraTokenizer, ElectraForTokenClassification, AutoConfig
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Where downloaded artefacts are cached, and which checkpoint to load.
cache_dir = "cache"
model_name_or_path = "checkpoint-7500"

# output_hidden_states=True makes every forward pass return the per-layer
# hidden states, which the feature-extraction code below reads.
config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir, output_hidden_states=True)

# Tokenizer is loaded from the same checkpoint directory as the model.
tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

# Token-classification model, built with the config defined above.
model = ElectraForTokenClassification.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def extract_sentences_from_file(file_name):
    """Rebuild space-joined sentences from a one-token-per-line text file.

    For every non-blank line, only the first whitespace-separated field is
    kept. A blank (or whitespace-only) line ends the sentence in progress.
    A sentence still open at end of file is emitted as well.

    Args:
        file_name: Path of the UTF-8 text file to read.

    Returns:
        List of sentences, each a string of tokens joined by single spaces.
    """
    sentences = []
    pending_tokens = []

    def flush_pending():
        # Emit the sentence collected so far, if there is one.
        if pending_tokens:
            sentences.append(" ".join(pending_tokens))
            pending_tokens.clear()

    with open(file_name, "r", encoding="utf-8") as file:
        for raw_line in tqdm(file):
            stripped = raw_line.strip()
            if not stripped:
                flush_pending()
                continue
            # Keep only the first column of the line.
            pending_tokens.append(stripped.split()[0])

    flush_pending()
    return sentences


# Read the dev, test and train files (in that order) into sentence lists.
dev_lines, test_lines, train_lines = map(
    extract_sentences_from_file,
    ('dev.txt', 'test.txt', 'train.txt'),
)

11131it [00:00, 1171563.31it/s]
826917it [00:00, 1671936.97it/s]
207127it [00:00, 1517150.24it/s]


In [7]:
def concat_hidden_states(text, layer_index=23):
    """Return one hidden-state vector per word-initial token of ``text``.

    The text is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model``. Rows belonging to ``[CLS]``, ``[SEP]`` and
    WordPiece continuation pieces (tokens starting with ``##``) are dropped,
    so each remaining row corresponds to the first piece of a token.

    Args:
        text: Input sentence as a single string.
        layer_index: Index into ``outputs.hidden_states`` to extract. Per the
            Hugging Face docs, entry 0 is the embedding output and entry ``i``
            is the output of the i-th encoder layer, so the default of 23
            (kept for backward compatibility) is NOT the final layer of a
            24-layer encoder; pass ``-1`` to get the last layer.
            NOTE(review): confirm which layer was actually intended.

    Returns:
        ``numpy.ndarray`` of shape ``(n_kept_tokens, hidden_size)``. When no
        token is kept the result has shape ``(0, hidden_size)``.
    """
    encoded = tokenizer(text)
    token_ids = encoded['input_ids']
    # Add the batch dimension expected by the model.
    input_ids = torch.tensor(token_ids).unsqueeze(0)
    attention_mask = torch.tensor(encoded['attention_mask']).unsqueeze(0)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # Inference only: skip building the autograd graph to save time and memory.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Drop the batch dimension: (seq_len, hidden_size).
    layer_output = outputs.hidden_states[layer_index][0].detach().numpy()

    # Continuation pieces are marked with '##'; a bare '#' is a real token
    # and must keep its vector.
    keep = np.array(
        [not (tok.startswith('##') or tok in ('[CLS]', '[SEP]')) for tok in tokens],
        dtype=bool,
    )
    return layer_output[keep]

In [23]:
# Smoke test: extract features for a single example sentence.
ss = concat_hidden_states('সমস্ত বেতন নিলামের সাধারণ ব্যবহারিক উদাহরণ বিভিন্ন পেনি নিলাম / বিডিং ফি নিলাম ওয়েবসাইটে পাওয়া যাবে।')

In [24]:
# Display the feature matrix returned for the example sentence.
ss

array([[-0.58154154,  0.6531722 , -1.1140741 , ..., -0.00872078,
        -0.22480261, -0.1402453 ],
       [-0.46665108,  0.7439235 , -1.3360343 , ...,  0.01900467,
         0.07851079, -0.12635817],
       [-0.54372823,  0.57310385, -1.187688  , ...,  0.02851821,
         0.12637195, -0.15423541],
       ...,
       [-0.54957277,  0.4999115 , -1.1502099 , ...,  0.00285783,
         0.12933405, -0.14344274],
       [-0.52617085,  0.53122383, -0.902822  , ...,  0.04445428,
         0.03610161, -0.22419369],
       [-0.27066225,  0.47984028, -0.91990006, ...,  0.1047343 ,
         0.02818093, -0.02490915]], dtype=float32)

In [26]:
# Flat list holding every kept token vector from every training sentence.
huge_array = []

for sentence in tqdm(train_lines):
    # Each row returned for the sentence becomes one entry in the list.
    huge_array.extend(concat_hidden_states(sentence))

100%|██████████| 15300/15300 [38:53<00:00,  6.56it/s]


In [37]:
# Total number of token vectors collected from the training sentences.
len(huge_array)

216921

In [35]:
# `pickle` is otherwise only imported in a later cell, so on a fresh
# top-to-bottom run this cell would raise NameError without this import.
import pickle

# Persist the collected training vectors so the slow extraction loop does
# not have to be repeated.
with open('huge_array_train.pickle', 'wb') as handle:
    pickle.dump(huge_array, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
# Sanity check: shape of a single collected token vector.
huge_array[0].shape

(1024,)

In [34]:
from sklearn.cluster import KMeans
import pickle

In [29]:
# Number of k-means clusters to fit over the token vectors.
num_cluster = 1000

# Fixed seed so centroid initialisation, and therefore the cluster ids
# stored in the pickled model, is reproducible across runs.
kmeans_seed = 42

kmodel = KMeans(n_clusters=num_cluster, random_state=kmeans_seed)
kmodel.fit(huge_array)




In [30]:
# Cluster id assigned to each training vector by the fitted model.
kmodel.labels_

array([989, 998, 712, ..., 746, 794, 896])

In [31]:
def get_kvalue(x):
    """Predict the k-means cluster for the first element of ``x``.

    Only ``x[0]`` is used; it is wrapped in a one-item list and passed to the
    module-level ``kmodel.predict``, whose result is returned unchanged.
    Presumably ``x`` is a sequence of feature vectors; verify against callers.
    """
    first_vector = x[0]
    single_row_batch = [first_vector]
    return kmodel.predict(single_row_batch)

In [33]:
# Save the fitted k-means model to disk for later reuse.
with open('kmodel.pickle', mode='wb') as kmodel_file:
    pickle.dump(
        kmodel,
        kmodel_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )