In [1]:
import pickle
import numpy
from bnlp import POS
from tqdm import tqdm
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import metrics

In [2]:
with open('train_sents.pickle', 'rb') as handle:
    train_sents = pickle.load(handle)

with open('dev_sents.pickle', 'rb') as handle:
    dev_sents = pickle.load(handle)

with open('test_sents.pickle', 'rb') as handle:
    test_sents = pickle.load(handle)

In [3]:
# word frequency information

def get_all_words(sents):
    words = []
    for i in range(len(sents)):
        for w in sents[i]:
            words += [w[0]]
    return words

all_train_words = get_all_words(train_sents)
len(all_train_words)

word_freq = {}
for w in all_train_words:
    if w in word_freq:
        word_freq[w] += 1
    else:
        word_freq[w] = 1

In [4]:
# Gazzeter Implementation

GRP_bn = {}
LOC_bn = {}
PER_bn = {}
PROD_bn = {}
CORP_bn = {}
CW_bn = {}


with open('gazz_main/CORP.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        CORP_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                CORP_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/CW.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        CW_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                CW_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/GRP.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        GRP_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                GRP_bn[" ".join(words[i: i+size])] = 1


with open('gazz_main/LOC.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        LOC_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                LOC_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/PER.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        PER_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                PER_bn[" ".join(words[i: i+size])] = 1


with open('gazz_main/PROD.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        PROD_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                PROD_bn[" ".join(words[i: i+size])] = 1

In [5]:
def extract_sentences_from_file(file_name):
    sentences = []
    current_sentence = ""

    with open(file_name, "r", encoding="utf-8") as file:
        for line in tqdm(file):
            line = line.strip()
            if line:
                if current_sentence:
                    current_sentence += " "
                current_sentence += line.split()[0]
            else:
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = ""

    if current_sentence:
        sentences.append(current_sentence)

    return sentences


# Example usage
dev_lines = extract_sentences_from_file('dev.txt')
test_lines = extract_sentences_from_file('test.txt')
train_lines = extract_sentences_from_file('train.txt')

11131it [00:00, 1390978.36it/s]
826917it [00:00, 1682142.80it/s]
207127it [00:00, 1513265.48it/s]


In [6]:
dev_lines

['সমস্ত বেতন নিলামের সাধারণ ব্যবহারিক উদাহরণ বিভিন্ন পেনি নিলাম / বিডিং ফি নিলাম ওয়েবসাইটে পাওয়া যাবে।',
 'সংস্করণ ৩ প্রকাশের আগে, গ্রাফিক্যাল ইউজার ইন্টারফেস সিমুলেটেড এম্প্লিফায়ার লোগো এবং ডিজাইন নির্মাণ; যাইহোক, সংস্করণ ৩, এটি একটি জেনেরিক নাম দিয়ে প্রতিস্থাপিত হয়েছিল।',
 'একটি পাইলট (হাটন দ্বারা চিত্রিত) লস অ্যাঞ্জেলেস এ অবতরণ করে কিছু ফ্লাইট সমস্যার পরে এবং শহরটিকে প্রায় নির্জন দেখায়।',
 'প্রাইমাভেরা ঝুঁকি বিশ্লেষণ - প্রকল্প ঝুঁকি বিশ্লেষণ সিমুলেশন টুল।',
 '১৮০০ -এর দশকে ম্যাটভে কাজাকভ দ্বারা ডিজাইন করা, এটি ১৩৯ সালে তার ডানা কেটে ফেলেছিল এবং রোলারগুলিতে চৌদ্দ মিটার পিছনে সরানো হয়েছিল।',
 'চারজন বিজয়ী ডিফেন্ডিং চ্যাম্পিয়ন জ্যাক নিকলাস সহ আরও পাঁচজন ছিল ২৮০ এর সমান।',
 'এটি ছিল নাটকীয় বিতরণের জন্য কারিগর বিনোদন প্রযোজিত চূড়ান্ত চলচ্চিত্র।',
 'এর মধ্যে একটি রিয়েল কানাডিয়ান সুপারস্টোর ।',
 'এরলং সুযোগ সমাধানের জন্য একক কোলন ব্যবহার করে।',
 'যদিও গির্জার সবসময় রাজকীয় পিউ থাকত, তবে গির্জায় রাজকীয়ভাবে এটিই ছিল প্রথম দেখা।',
 'দ্য ম্যাগনিফিসেন্ট অ্যাম্বারসনস (১৯৪২, সহকা

In [10]:
# BanglaBert Feature Extraction

import torch
from transformers import ElectraTokenizer, ElectraForTokenClassification, AutoConfig
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cache_dir = "cache"
model_name_or_path = "checkpoint-7500"
config = AutoConfig.from_pretrained(
    model_name_or_path,
    cache_dir=cache_dir,
    output_hidden_states = True
)   

tokenizer = ElectraTokenizer.from_pretrained(
    model_name_or_path,
)

model = ElectraForTokenClassification.from_pretrained(
    model_name_or_path,
    config=config,
    cache_dir=cache_dir,
)

def concat_hidden_states(text):
    text = tokenizer(text)
    input_ids = torch.tensor(text['input_ids']).unsqueeze(0)  # Convert to tensor and add batch dimension
    words = tokenizer.convert_ids_to_tokens(input_ids[0])
    # print(words)
    attention_mask = torch.tensor(text['attention_mask']).unsqueeze(0)  # Convert to tensor and add batch dimension

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    hidden_states = outputs.hidden_states

    # layer1 = hidden_states[0]
    # layer2 = hidden_states[1]
    # layer3 = hidden_states[2]
    # layer4 = hidden_states[3]
    # layer5 = hidden_states[4]
    # layer6 = hidden_states[5]
    # layer7 = hidden_states[6]
    # layer8 = hidden_states[7]
    # layer9 = hidden_states[8]
    # layer10 = hidden_states[9]
    # layer11 = hidden_states[10]
    # layer12 = hidden_states[11]
    # layer13 = hidden_states[12]
    # layer14 = hidden_states[13]
    # layer15 = hidden_states[14]
    # layer16 = hidden_states[15]
    # layer17 = hidden_states[16]
    # layer18 = hidden_states[17]
    # layer19 = hidden_states[18]
    # layer20 = hidden_states[19]
    # layer21 = hidden_states[20]
    # layer22 = hidden_states[21]
    # layer23 = hidden_states[22]
    layer24 = hidden_states[23]

    # hidden = [layer1, layer2, layer3, layer4, layer5, 
    #           layer6, layer7, layer8, layer9, layer10, 
    #           layer11, layer12, layer13, layer14, layer15, 
    #           layer16, layer17, layer18, layer19, layer20, 
    #           layer21, layer22, layer23, layer24]

    hidden = [layer24]
    
    concatenated_tensor = torch.cat(hidden, dim=2)

    # averaged_hidden_states = torch.mean(torch.stack(hidden), dim=0)
    concatenated_tensor = concatenated_tensor.detach().numpy()
    
    new_concatenated_tensor = []

    count = 0
    for i in concatenated_tensor[0]:
        if (words[count].startswith('#') or words[count] in ['[CLS]','[SEP]'])  == False:
            new_concatenated_tensor.append(i)
        count+=1
        
    return numpy.asarray(new_concatenated_tensor)
    # return numpy.asarray(concatenated_tensor)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
with open('train_sents.pickle', 'rb') as handle:
    train_sents = pickle.load(handle)

In [41]:
dev_bert = []
count = 0
for text in tqdm(dev_lines):
    concatenated_states = concat_hidden_states(text)
    dev_bert.append(concatenated_states)


dev_bert = numpy.array(dev_bert)
len(dev_bert)

100%|██████████| 800/800 [02:05<00:00,  6.40it/s]
  dev_bert = numpy.array(dev_bert)


800

In [11]:
train_bert = []
count = 0
for text in tqdm(train_lines):
    concatenated_states = concat_hidden_states(text)
    train_bert.append(concatenated_states)

len(train_bert)
train_bert = numpy.array(train_bert)

100%|██████████| 15300/15300 [38:08<00:00,  6.69it/s]
  train_bert = numpy.array(train_bert)


In [47]:
test_bert = []
count = 0
for text in tqdm(test_lines):
    concatenated_states = concat_hidden_states(text)
    test_bert.append(concatenated_states)

test_ber = numpy.array(test_bert)
print(len(test_bert))

 17%|█▋        | 22120/133119 [43:02<3:35:59,  8.56it/s]


KeyboardInterrupt: 

In [42]:
dev_bert[0].shape

(17, 1024)

In [62]:
train_bert = numpy.array(train_bert)

  train_bert = numpy.array(train_bert)


In [18]:
train_bert[0][11]

array([-0.14047375,  0.21264611, -1.2051288 , ...,  0.10306524,
       -0.3488621 , -0.11280416], dtype=float32)

In [43]:
global_count = 0

In [30]:
with open('kmodel.pickle', 'rb') as handle:
    kmodel = pickle.load(handle)

In [33]:
# feature extraction for Conditional Random Field - Bangla NER

def wordToFeatures(sent, bert_emb, idx):
    word = sent[idx][0]
    postag = sent[idx][1]

    # try:
    #     # bert_word_embeddings = train_bert[global_count][idx]
    #     bert_word_embeddings = dev_bert[global_count][idx]
    # except:
    #     # bert_word_embeddings = train_bert[global_count][idx]
    #     bert_word_embeddings = dev_bert[-1][-1]

    bert_cluster = kmodel.predict([bert_emb[idx]])[0]

    features = {
        'bias': 1.0,
        'word': word,
        'bert': bert_cluster,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word.isdigit': word.isdigit(),
        'index': idx,
        'length': len(word),
        'postag': postag,
        'freq': word_freq[word] if word in word_freq else 0,
        'is_corp': int(word in CORP_bn), 
        'is_cw': int(word in CW_bn), 
        'is_grp': int(word in GRP_bn), 
        'is_loc': int(word in LOC_bn), 
        'is_per': int(word in PER_bn), 
        'is_prod': int(word in PROD_bn),
    }
    
    if idx > 0:
        sub = sent[idx-1][0] + " "+ sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx < len(sent) - 1:
        sub = sent[idx][0] + " "+ sent[idx+1][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx > 1:
        sub = sent[idx-2][0] + " "+ sent[idx-1][0] + " " + sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    if idx < len(sent) - 2:
        sub = sent[idx][0] + " "+ sent[idx+1][0] + " " + sent[idx+2][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
        
    
    for i in range(1, 3):
        if idx < i:
            break
        wordi = sent[idx-i][0]
        postagi = sent[idx-i][1]
        # cluster_id_i = kmeans.predict([w2v.get_vector(wordi)])[0]
        features.update({
            '-{}:word'.format(i): wordi,
            '-{}:bert'.format(i): bert_cluster,
            '-{}:word[-3:]'.format(i): wordi[-3:],
            '-{}:word[-2:]'.format(i): wordi[-2:],
            '-{}:word[:3]'.format(i): wordi[:3],
            '-{}:word[:2]'.format(i): wordi[:2],
            '-{}:word.isdigit'.format(i): wordi.isdigit(),
            '-{}:postag'.format(i): postagi,
            '-{}:is_corp'.format(i): int(word in CORP_bn), 
            '-{}:is_cw'.format(i): int(word in CW_bn), 
            '-{}:is_grp'.format(i): int(word in GRP_bn), 
            '-{}:is_loc'.format(i): int(word in LOC_bn), 
            '-{}:is_per'.format(i): int(word in PER_bn), 
            '-{}:is_prod'.format(i): int(word in PROD_bn),
        })
    
    for i in range(1, 3):
        if (idx+i) >= len(sent):
            break
        wordi = sent[idx+i][0]
        postagi = sent[idx+i][1]
        # cluster_id_i = kmeans.predict([w2v.get_vector(wordi)])[0]
        features.update({
            '{}:word'.format(i): wordi,
            '{}:bert'.format(i): bert_cluster,
            '{}:word[-3:]'.format(i): wordi[-3:],
            '{}:word[-2:]'.format(i): wordi[-2:],
            '{}:word[:3]'.format(i): wordi[:3],
            '{}:word[:2]'.format(i): wordi[:2],
            '{}:word.isdigit'.format(i): wordi.isdigit(),
            '{}:postag'.format(i): postagi,
            '{}:is_corp'.format(i): int(word in CORP_bn), 
            '{}:is_cw'.format(i): int(word in CW_bn), 
            '{}:is_grp'.format(i): int(word in GRP_bn), 
            '{}:is_loc'.format(i): int(word in LOC_bn), 
            '{}:is_per'.format(i): int(word in PER_bn), 
            '{}:is_prod'.format(i): int(word in PROD_bn),
        })
        
    if idx == 0:
        features['BOS'] = True
    if idx == len(sent) - 1:
        features['EOS'] = True
    
    return features

def sentTofeatures(sent):
    global global_count

    sentence = dev_lines[global_count]
    # print(sentence)
    bert_word_embeddings = concat_hidden_states(sentence)
    # print(bert_word_embeddings)
    print(bert_word_embeddings.shape)

    x = [wordToFeatures(sent,bert_word_embeddings, i) for i in range(len(sent))]
    global_count+=1
    return x

def sentTolabels(sent):
    return [label for token, postag, label in sent]

In [34]:
global_count = 0

In [None]:
%%time
X_train = [sentTofeatures(s) for s in train_sents]
y_train = [sentTolabels(s) for s in train_sents]

In [35]:
%%time
X_dev = [sentTofeatures(s) for s in dev_sents]
y_dev = [sentTolabels(s) for s in dev_sents]

(17, 1024)
(28, 1024)
(22, 1024)
(10, 1024)


IndexError: index 10 is out of bounds for axis 0 with size 10

In [23]:
%%time
X_test = [sentTofeatures(s) for s in test_sents]
y_test = [sentTolabels(s) for s in test_sents]

CPU times: total: 4.58 s
Wall time: 4.58 s


In [28]:
global_count

15301

In [63]:
crf = crfsuite.CRF(
    verbose='true',
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=150,
    all_possible_transitions=True
)

crf.fit(X_train, y_train)

loading training data to CRFsuite:   0%|          | 0/15301 [00:00<?, ?it/s]


TypeError: only size-1 arrays can be converted to Python scalars

In [62]:
labels = list(crf.classes_)
labels

ValueError: Invalid model file 'C:\\Users\\User\\AppData\\Local\\Temp\\modelnakqwarr.crfsuite'

In [42]:
y_pred = crf.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='macro', labels=labels)

0.814325281001763

In [43]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='macro', labels=labels)

0.7893297754302829

In [49]:
y_test[69]

['B-CORP', 'O', 'O']

In [50]:
y_pred[69]

['B-CORP', 'I-CORP', 'O']