In [1]:
import pickle
from bnlp import POS
from tqdm import tqdm
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import metrics

In [2]:
def load_data(filename):
    with open(filename, 'r',encoding='utf-8') as f:
        data = [line.strip().split(' _ _ ') for line in f.readlines()]
        # return data
        # empty lines are used to separate sentences
        # separate them into sentences
        sentences = []
        cur = []
        for line in data:
            if line == ['']:
                cur = [tuple(line) for line in cur]
                sentences.append(cur)
                cur = []
            else:
                # for p in punctuations:
                #     line[0] = line[0].replace(p, '')
                if len(line[0]) == 0:
                    line[0] = ' '
                cur.append(line)
        # # convert each list to a tuple
        sentences.append(cur)
        return sentences

In [3]:
train = load_data('train.txt')
dev = load_data('dev.txt')
test = load_data('test.txt')

In [4]:
!git clone https://github.com/sagorbrur/bnlp

fatal: destination path 'bnlp' already exists and is not an empty directory.


In [5]:
bn_pos = POS()
model_path = "bnlp/model/bn_pos.pkl"

In [6]:
def add_pos(sentence):
    # first one of the tuple is the word and the second one is the ner  
    words = []
    for word in sentence:
        words.append(word[0])
    pos = bn_pos.tag(model_path, words)
    ## add ner back to the tuple
    ret = []
    for i in range(len(pos)):
        word = sentence[i][0]
        ner = sentence[i][1]
        ret.append((word, pos[i][1], ner))
    return ret

def add_pos_to_all(sents):
    ret = []
    for i in tqdm(range(len(sents))):
        # if i % 1000 == 0:
        #     print(i)
        ret.append(add_pos(sents[i]))
    return ret

In [7]:
train_sents =  add_pos_to_all(train)
train_sents.pop()

  1%|▏         | 216/15301 [00:02<02:26, 102.94it/s]


KeyboardInterrupt: 

In [17]:
train_sents[0]

[('২০১৮', 'RDF', 'O'),
 ('এর', 'PPR', 'O'),
 ('সেরা', 'PPR', 'O'),
 ('(বর্ণানুক্রমিকভাবে', 'AMN', 'O'),
 ('তালিকাভুক্ত,', 'NC', 'O'),
 ('র\u200d্যাঙ্ক', 'NC', 'O'),
 ('করা', 'NV', 'O'),
 ('হয়নি),', 'VM', 'O'),
 ('এনপিআর', 'PU', 'B-CORP')]

In [9]:
dev_sents = add_pos_to_all(dev)

100%|██████████| 801/801 [00:06<00:00, 123.39it/s]


In [16]:
dev_sents[0]

[('সমস্ত', 'JQ', 'O'),
 ('বেতন', 'JJ', 'O'),
 ('নিলামের', 'NC', 'O'),
 ('সাধারণ', 'JJ', 'O'),
 ('ব্যবহারিক', 'JJ', 'O'),
 ('উদাহরণ', 'NC', 'O'),
 ('বিভিন্ন', 'JJ', 'O'),
 ('পেনি', 'NC', 'O'),
 ('নিলাম', 'VM', 'O'),
 ('/', 'PU', 'O'),
 ('বিডিং', 'NC', 'B-CORP'),
 ('ফি', 'NC', 'I-CORP'),
 ('নিলাম', 'VM', 'I-CORP'),
 ('ওয়েবসাইটে', 'VAUX', 'O'),
 ('পাওয়া', 'NC', 'O'),
 ('যাবে।', 'PU', 'O')]

In [11]:
test_sents = add_pos_to_all(test)

100%|██████████| 133120/133120 [17:11<00:00, 129.01it/s]


In [12]:
test_sents[0]

[('প্রিয়া', 'NC', 'B-PER'),
 ('ঝিঙ্গান', 'NC', 'I-PER'),
 ('ক্যালোরি', 'NC', 'O')]

In [13]:
with open('saved_pickles/train_sents_for_final.pickle', 'wb') as handle:
    pickle.dump(train_sents, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('saved_pickles/dev_sents_for_final.pickle', 'wb') as handle:
    pickle.dump(dev_sents, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('test_sents_for_final.pickle', 'wb') as handle:
    pickle.dump(test_sents, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
with open(r'E:\Thesis MAIN CRF\saved_pickles\train_sents_for_final.pickle', 'rb') as handle:
    train_sents = pickle.load(handle)

with open(r'E:\Thesis MAIN CRF\saved_pickles\dev_sents_for_final.pickle', 'rb') as handle:
    dev_sents = pickle.load(handle)

with open(r'test_main_sents.pickle', 'rb') as handle:
    test_sents = pickle.load(handle)

In [9]:
test_sents

[[('প্রিয়া', 'NC', 'B-PER'),
  ('ঝিঙ্গান', 'NC', 'I-PER'),
  ('ক্যালোরি', 'NC', 'O')],
 [('টিক্কা', 'NC', 'B-PROD'),
  ('(খাবার)', 'NC', 'I-PROD'),
  ('নিয়ামক', 'JJ', 'O'),
  ('ষ্টলেন', 'VM', 'B-PROD'),
  ('এর', 'PPR', 'O'),
  ('সাথে', 'PP', 'O'),
  ('কিভাবে', 'AMN', 'O'),
  ('সিঙ্ক', 'NC', 'O'),
  ('করবেন', 'VM', 'O')],
 [('পেটিকোট', 'NC', 'B-PROD'),
  ('এর', 'PPR', 'O'),
  ('জন্য', 'PP', 'O'),
  ('সঠিক', 'JJ', 'O'),
  ('তাপমাত্রা', 'NC', 'O')],
 [('টপ', 'NC', 'B-PROD'), ('তথ্য', 'NC', 'O')],
 [('প্রবর্তক', 'JJ', 'B-GRP'),
  ('সংঘ', 'NC', 'I-GRP'),
  ('এর', 'PPR', 'O'),
  ('ছবি', 'NC', 'O')],
 [('সব', 'JQ', 'O'), ('১৯৮৪', 'RDF', 'B-CW'), ('অক্ষর', 'NC', 'O')],
 [('টিলাগড়', 'NP', 'B-LOC'),
  ('এ', 'DAB', 'O'),
  ('সময়', 'NC', 'O'),
  ('কত', 'JQ', 'O')],
 [('২০০৮', 'RDF', 'B-GRP'),
  ('গ্রীষ্মকালীন', 'JJ', 'I-GRP'),
  ('অলিম্পিকে', 'NC', 'I-GRP'),
  ('ইরিত্রিয়া', 'NC', 'I-GRP'),
  ('একটি', 'JQ', 'O'),
  ('ভাল', 'JJ', 'O'),
  ('স্কুল', 'NC', 'O')],
 [('বৈদ্যুতিক', 'JJ', 'B-PROD'),
 

In [10]:
# word frequency information

def get_all_words(sents):
    words = []
    for i in range(len(sents)):
        for w in sents[i]:
            words += [w[0]]
    return words

all_train_words = get_all_words(train_sents)
len(all_train_words)

word_freq = {}
for w in all_train_words:
    if w in word_freq:
        word_freq[w] += 1
    else:
        word_freq[w] = 1

In [11]:
# Gazzeter Implementation

GRP_bn = {}
LOC_bn = {}
PER_bn = {}
PROD_bn = {}
CORP_bn = {}
CW_bn = {}


with open('gazz_main/CORP.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        CORP_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                CORP_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/CW.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        CW_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                CW_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/GRP.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        GRP_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                GRP_bn[" ".join(words[i: i+size])] = 1


with open('gazz_main/LOC.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        LOC_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                LOC_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/PER.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        PER_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                PER_bn[" ".join(words[i: i+size])] = 1


with open('gazz_main/PROD.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        PROD_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                PROD_bn[" ".join(words[i: i+size])] = 1

In [12]:
import string
punctuation = string.punctuation + '।'

In [13]:
def ispunct(word):
    word = word.strip()
    if len(word) == 1 and any(char in punctuation for char in word):
        return 1
    else:
        return 0

In [14]:
ispunct("; ")

1

In [15]:
# BanglaBert Feature Extraction

import torch
from transformers import ElectraTokenizer, ElectraForTokenClassification, AutoConfig
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cache_dir = "cache"
model_name_or_path = "checkpoint-7500"
config = AutoConfig.from_pretrained(
    model_name_or_path,
    cache_dir=cache_dir,
    output_hidden_states = True
)   

tokenizer = ElectraTokenizer.from_pretrained(
    model_name_or_path,
)

model = ElectraForTokenClassification.from_pretrained(
    model_name_or_path,
    config=config,
    cache_dir=cache_dir,
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def concat_hidden_states(text):
    temp = ''
    for i in text:
        if i not in string.punctuation:
            temp+=i


    text = tokenizer(temp)
    input_ids = torch.tensor(text['input_ids']).unsqueeze(0)  # Convert to tensor and add batch dimension
    words = tokenizer.convert_ids_to_tokens(input_ids[0])
    attention_mask = torch.tensor(text['attention_mask']).unsqueeze(0)  # Convert to tensor and add batch dimension

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    hidden_states = outputs.hidden_states

    # layer1 = hidden_states[0]
    # layer2 = hidden_states[1]
    # layer3 = hidden_states[2]
    # layer4 = hidden_states[3]
    # layer5 = hidden_states[4]
    # layer6 = hidden_states[5]
    # layer7 = hidden_states[6]
    # layer8 = hidden_states[7]
    # layer9 = hidden_states[8]
    # layer10 = hidden_states[9]
    # layer11 = hidden_states[10]
    # layer12 = hidden_states[11]
    # layer13 = hidden_states[12]
    # layer14 = hidden_states[13]
    # layer15 = hidden_states[14]
    # layer16 = hidden_states[15]
    # layer17 = hidden_states[16]
    # layer18 = hidden_states[17]
    # layer19 = hidden_states[18]
    # layer20 = hidden_states[19]
    # layer21 = hidden_states[20]
    # layer22 = hidden_states[21]
    # layer23 = hidden_states[22]
    layer24 = hidden_states[23]

    # hidden = [layer1, layer2, layer3, layer4, layer5, 
    #           layer6, layer7, layer8, layer9, layer10, 
    #           layer11, layer12, layer13, layer14, layer15, 
    #           layer16, layer17, layer18, layer19, layer20, 
    #           layer21, layer22, layer23, layer24]

    hidden = [layer24]
    
    concatenated_tensor = torch.cat(hidden, dim=2)

    # averaged_hidden_states = torch.mean(torch.stack(hidden), dim=0)
    concatenated_tensor = concatenated_tensor.detach().numpy()
    
    new_concatenated_tensor = []

    count = 0
    for i in concatenated_tensor[0]:
        if (words[count].startswith('#') or words[count] in ['[CLS]','[SEP]'])  == False:
            new_concatenated_tensor.append(i)
        count+=1
        
    return numpy.asarray(new_concatenated_tensor)

In [16]:
text = tokenizer('প্রাইমাভেরা ঝুঁকি বিশ্লেষণ-রকল্প ঝুঁকি বিশ্লেষণ সিমুলেশন টুল।')
input_ids = torch.tensor(text['input_ids']).unsqueeze(0)  # Convert to tensor and add batch dimension
words = tokenizer.convert_ids_to_tokens(input_ids[0])
attention_mask = torch.tensor(text['attention_mask']).unsqueeze(0)  # Convert to tensor and add batch dimension

outputs = model(input_ids=input_ids, attention_mask=attention_mask)
# outputs['logits'].shape
torch.argmax(outputs['logits'], dim = 2).numpy()

array([[ 5,  1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  5]],
      dtype=int64)

In [17]:
def extract_sentences_from_file(file_name):
    sentences = []
    current_sentence = ""

    with open(file_name, "r", encoding="utf-8") as file:
        for line in tqdm(file):
            line = line.strip()
            if line:
                if current_sentence:
                    current_sentence += " "
                current_sentence += line.split()[0]
            else:
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = ""

    if current_sentence:
        sentences.append(current_sentence)

    return sentences


# Example usage
dev_lines = extract_sentences_from_file('dev.txt')
test_lines = extract_sentences_from_file('test.txt')
train_lines = extract_sentences_from_file('train.txt')

11131it [00:00, 1309330.51it/s]
826917it [00:00, 1656856.73it/s]
207127it [00:00, 1528330.70it/s]


In [18]:
import numpy as np

In [19]:
def bert_output(sentence):
    text = tokenizer(sentence)
    input_ids = torch.tensor(text['input_ids']).unsqueeze(0)  # Convert to tensor and add batch dimension
    words = tokenizer.convert_ids_to_tokens(input_ids[0])
    attention_mask = torch.tensor(text['attention_mask']).unsqueeze(0)  # Convert to tensor and add batch dimension
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    outputs =  torch.argmax(outputs['logits'], dim = 2).numpy()

    count = 0
    ans = []
    w = tokenizer.tokenize(sentence)
    for word in sentence.split():
        while True:
            try:
                if word.startswith(w[count]) or w[count] == '[UNK]':
                    ans.append(outputs[0][count])
                    break
            
                else:
                    count+=1
                
                if count == len(w):
                    break
            except:
                ans = [12]*len(sentence.split())
                break

    return np.array(ans)

In [20]:
bert_output(dev_lines[319])

array([12, 12, 12, 12, 12, 12, 12, 12, 12])

In [21]:
tokenizer.tokenize(dev_lines[319])

['গানটি',
 'প্রযোজনা',
 'করেছেন',
 'এবং',
 'এতে',
 'র্যা',
 '##পার',
 'জের',
 '##মাই',
 '##ন',
 'ডু',
 '##পরি',
 'রয়েছে',
 '।']

In [21]:
dev_lines[319].split()

['গানটি',
 'প্রযোজনা',
 'করেছেন',
 'এবং',
 'এতে',
 'র\u200d্যাপার',
 'জেরমাইন',
 'ডুপরি',
 'রয়েছে।']

In [22]:
print(*dev_lines[56].split(), sep = '|')

লি|সে-ডল|৯|ড্যান|-|১৮|টি|আন্তর্জাতিক|শিরোপা|জয়ী।


In [23]:
tokenizer.tokenize(dev_lines[4])

['১৮০',
 '##০',
 '-',
 'এর',
 'দশকে',
 'ম্যাট',
 '##ভে',
 'কাজ',
 '##াক',
 '##ভ',
 'দ্বারা',
 'ডিজাইন',
 'করা',
 ',',
 'এটি',
 '১৩',
 '##৯',
 'সালে',
 'তার',
 'ডানা',
 'কেটে',
 'ফেলেছিল',
 'এবং',
 'রোল',
 '##ার',
 '##গুলিতে',
 'চৌদ্দ',
 'মিটার',
 'পিছনে',
 'সরানো',
 'হয়েছিল',
 '।']

In [22]:
# Feature Extraction

def wordToFeatures(sent, bert_out, idx):
    word = sent[idx][0]
    postag = sent[idx][1]
    try:
        bert = bert_out[idx]
    except:
        bert = 12

    features = {
        'bias': 1.0,
        'word': word,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[:1]': word[:1],
        'word.isdigit': word.isdigit(),
        'word.ispunctuation': ispunct(word),
        'index': idx,
        'length': len(word),
        'postag': postag,
        'freq': word_freq[word] if word in word_freq else 0,
        'is_corp': int(word in CORP_bn), 
        'is_cw': int(word in CW_bn), 
        'is_grp': int(word in GRP_bn), 
        'is_loc': int(word in LOC_bn), 
        'is_per': int(word in PER_bn), 
        'is_prod': int(word in PROD_bn),
        'bert': bert,
    }
    
    if idx > 0:
        sub = sent[idx-1][0] + " "+ sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx < len(sent) - 1:
        sub = sent[idx][0] + " "+ sent[idx+1][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx > 1:
        sub = sent[idx-2][0] + " "+ sent[idx-1][0] + " " + sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 3
        if sub in CW_bn:
            features['is_cw'] = 3
        if sub in GRP_bn:
            features['is_grp'] = 3
        if sub in LOC_bn:
            features['is_loc'] = 3
        if sub in PER_bn:
            features['is_per'] = 3
        if sub in PROD_bn:
            features['is_prod'] = 3
    if idx < len(sent) - 2:
        sub = sent[idx][0] + " "+ sent[idx+1][0] + " " + sent[idx+2][0]
        if sub in CORP_bn:
            features['is_corp'] = 3
        if sub in CW_bn:
            features['is_cw'] = 3
        if sub in GRP_bn:
            features['is_grp'] = 3
        if sub in LOC_bn:
            features['is_loc'] = 3
        if sub in PER_bn:
            features['is_per'] = 3
        if sub in PROD_bn:
            features['is_prod'] = 3
        
    
    for i in range(1, 3):
        if idx < i:
            break
        wordi = sent[idx-i][0]
        postagi = sent[idx-i][1]
        try:
            berti = bert_out[idx-i]
        except:
            berti = 12

        features.update({
            '-{}:word'.format(i): wordi,
            '-{}:word[-3:]'.format(i): wordi[-3:],
            '-{}:word[-2:]'.format(i): wordi[-2:],
            '-{}:word[-1:]'.format(i): wordi[-1:],
            '-{}:word[:3]'.format(i): wordi[:3],
            '-{}:word[:2]'.format(i): wordi[:2],
            '-{}:word[:1]'.format(i): wordi[:1],
            '-{}:word.isdigit'.format(i): wordi.isdigit(),
            '-{}:word.ispunctuation'.format(i): ispunct(wordi),
            '-{}:postag'.format(i): postagi,
            '-{}:is_corp'.format(i): int(wordi in CORP_bn), 
            '-{}:is_cw'.format(i): int(wordi in CW_bn), 
            '-{}:is_grp'.format(i): int(wordi in GRP_bn), 
            '-{}:is_loc'.format(i): int(wordi in LOC_bn), 
            '-{}:is_per'.format(i): int(wordi in PER_bn), 
            '-{}:is_prod'.format(i): int(wordi in PROD_bn),
            '-{}:is_prod'.format(i): int(wordi in PROD_bn),
            '-{}:bert'.format(i): berti,
        })
    
    for i in range(1, 3):
        if (idx+i) >= len(sent):
            break
        wordi = sent[idx+i][0]
        postagi = sent[idx+i][1]
        try:
            berti = bert_out[idx+i]
        except:
            berti = 12

        features.update({
            '{}:word'.format(i): wordi,
            '{}:word[-3:]'.format(i): wordi[-3:],
            '{}:word[-2:]'.format(i): wordi[-2:],
            '{}:word[-1:]'.format(i): wordi[-1:],
            '{}:word[:3]'.format(i): wordi[:3],
            '{}:word[:2]'.format(i): wordi[:2],
            '{}:word[:1]'.format(i): wordi[:1],
            '{}:word.isdigit'.format(i): wordi.isdigit(),
            '{}:word.ispunctuation'.format(i): ispunct(wordi),
            '{}:postag'.format(i): postagi,
            '{}:is_corp'.format(i): int(wordi in CORP_bn), 
            '{}:is_cw'.format(i): int(wordi in CW_bn), 
            '{}:is_grp'.format(i): int(wordi in GRP_bn), 
            '{}:is_loc'.format(i): int(wordi in LOC_bn), 
            '{}:is_per'.format(i): int(wordi in PER_bn), 
            '{}:is_prod'.format(i): int(wordi in PROD_bn),
            '{}:bert'.format(i): berti,
        })
        
    if idx == 0:
        features['BOS'] = True
    if idx == len(sent) - 1:
        features['EOS'] = True
    
    return features

def sentTofeatures(sent):
    global global_count
    sentence = test_lines[global_count]
    bert_word_embeddings = bert_output(sentence)

    x = [wordToFeatures(sent, bert_word_embeddings, i) for i in range(len(sent))]
    global_count+=1
    return x

def sentTolabels(sent):
    return [label for token, postag, label in sent]

In [23]:
global_count = 0

In [28]:
len(test_sents)

133119

In [27]:
len(test_sents.pop())

0

In [24]:
with open('X_train_BERT.pickle', 'wb') as handle:
    pickle.dump(X_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('y_train_BERT.pickle', 'wb') as handle:
    pickle.dump(y_train, handle, protocol=pickle.HIGHEST_PROTOCOL)

NameError: name 'X_train' is not defined

In [41]:
%%time
X_train = [sentTofeatures(s) for s in tqdm(train_sents)]
y_train = [sentTolabels(s) for s in train_sents]


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

CPU times: total: 5h 47min 29s
Wall time: 43min 49s





In [29]:
from tqdm import tqdm

In [55]:
with open('X_dev_BERT.pickle', 'wb') as handle:
    pickle.dump(X_dev, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('y_dev_BERT.pickle', 'wb') as handle:
    pickle.dump(y_dev, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
%%time
X_dev = [sentTofeatures(s) for s in tqdm(dev_sents)]
y_dev = [sentTolabels(s) for s in dev_sents]


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

CPU times: total: 17min 35s
Wall time: 2min 12s





In [31]:
from normalizer import normalize # pip install git+https://github.com/csebuetnlp/normalizer

In [32]:
test_sents[1383]

[('ডিমের', 'NC', 'B-PROD'),
 ('রোল', 'NC', 'I-PROD'),
 ('মেইল', 'VM', 'O'),
 ('\u200b\u200bলগইন', 'PU', 'O')]

In [33]:
bert_output(test_lines[1383])

array([10,  5, 11], dtype=int64)

In [25]:
%%time
X_test = [sentTofeatures(s) for s in tqdm(test_sents)]
y_test = [sentTolabels(s) for s in test_sents]

  9%|▉         | 11954/133120 [26:28<4:29:55,  7.48it/s]

KeyboardInterrupt: 

In [None]:
with open('X_test_BERT.pickle', 'wb') as handle:
    pickle.dump(X_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('y_test_BERT.pickle', 'wb') as handle:
    pickle.dump(y_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [56]:
with open('CRF_BERT.pickle', 'wb') as handle:
    pickle.dump(crf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
crf = crfsuite.CRF(
    verbose='true',
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=150,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, y_train)
except:
    pass




[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


[A[A[A


loading training data to CRFsuite: 100%|██████████| 15300/15300 [00:07<00:00, 1976.91it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 403670
Seconds required: 2.288

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 150
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=5.18  loss=377864.07 active=397340 feature_norm=0.12
Iter 2   time=1.04  loss=373609.78 active=388667 feature_norm=0.12
Iter 3   time=1.03  loss=371213.92 active=327923 feature_norm=0.61
Iter 4   time=10.21 loss=368415.32 active=357438 feature_norm=0.61
Iter 5   time=0.99  loss=196717.36 active=298574 feature_norm=0.33
Iter 6   time=5.00  loss=149261.68 active=325482 feature_norm=0.18
Iter 7   time=0.95  loss=145271.32 active=347445 feature_norm=0.19
Iter 8   time=1.00  loss=143707.49 active=358940 feature_norm=0.19
Iter 9   time=3.98  loss=142459.93 active=365100 feature_norm=

In [46]:
labels = list(crf.classes_)
labels

['O',
 'B-CORP',
 'B-GRP',
 'I-GRP',
 'B-PER',
 'I-PER',
 'I-CORP',
 'B-CW',
 'I-CW',
 'B-PROD',
 'I-PROD',
 'B-LOC',
 'I-LOC']

In [47]:
y_pred = crf.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='macro', labels=labels)

# 0.814325281001763
# 0.8075812824435331

0.8492314888974698

In [38]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='macro', labels=labels)

# 0.8009467990614277

0.8203321623402492

In [44]:
y_test[33]

['B-CW', 'O']

In [43]:
y_pred[33]

['O', 'O']