In [6]:
import pickle
from bnlp import POS
from tqdm import tqdm
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import metrics

In [7]:
def load_data(filename):
    with open(filename, 'r',encoding='utf-8') as f:
        data = [line.strip().split(' _ _ ') for line in f.readlines()]
        # return data
        # empty lines are used to separate sentences
        # separate them into sentences
        sentences = []
        cur = []
        for line in data:
            if line == ['']:
                cur = [tuple(line) for line in cur]
                sentences.append(cur)
                cur = []
            else:
                # for p in punctuations:
                #     line[0] = line[0].replace(p, '')
                if len(line[0]) == 0:
                    line[0] = ' '
                cur.append(line)
        # # convert each list to a tuple
        sentences.append(cur)
        return sentences

In [8]:
train = load_data('train.txt')
dev = load_data('dev.txt')
test = load_data('test.txt')

In [9]:
!git clone https://github.com/sagorbrur/bnlp

fatal: destination path 'bnlp' already exists and is not an empty directory.


In [10]:
bn_pos = POS()
model_path = "bnlp/model/bn_pos.pkl"

In [11]:
def add_pos(sentence):
    # first one of the tuple is the word and the second one is the ner  
    words = []
    for word in sentence:
        words.append(word[0])
    pos = bn_pos.tag(model_path, words)
    ## add ner back to the tuple
    ret = []
    for i in range(len(pos)):
        word = sentence[i][0]
        ner = sentence[i][1]
        ret.append((word, pos[i][1], ner))
    return ret

def add_pos_to_all(sents):
    ret = []
    for i in tqdm(range(len(sents))):
        # if i % 1000 == 0:
        #     print(i)
        ret.append(add_pos(sents[i]))
    return ret

In [12]:
train_sents =  add_pos_to_all(train)

100%|██████████| 15301/15301 [02:12<00:00, 115.80it/s]


In [13]:
train_sents[69]

[('আরাকি', 'NP', 'O'),
 ('বলেছিলেন', 'VM', 'O'),
 ('যে', 'CSB', 'O'),
 ('সোনালী', 'JJ', 'B-CW'),
 ('বাতাস', 'NC', 'I-CW'),
 ('এর', 'PPR', 'O'),
 ('মূল', 'JJ', 'O'),
 ('লক্ষ্য', 'NC', 'O'),
 ('ছিল', 'VM', 'O'),
 ('সুন্দর', 'JJ', 'O'),
 ('পুরুষদের', 'NC', 'O'),
 ('আঁকা,', 'NC', 'O'),
 ('যারা', 'PRL', 'O'),
 ('কেবল', 'AMN', 'O'),
 ('এমন', 'DAB', 'O'),
 ('একটি', 'JQ', 'O'),
 ('পৃথিবীতে', 'NC', 'O'),
 ('থাকতে', 'VM', 'O'),
 ('পারে', 'VAUX', 'O'),
 ('যেখানে', 'ALC', 'O'),
 ('একজনের', 'NC', 'O'),
 ('শাস্তি', 'NC', 'O'),
 ('পূরণের', 'NC', 'O'),
 ('সৌন্দর্য', 'NC', 'O'),
 ('আছে।', 'PU', 'O')]

In [14]:
dev_sents = add_pos_to_all(dev)

100%|██████████| 801/801 [00:07<00:00, 114.21it/s]


In [15]:
dev_sents[69]

[('২০০৯', 'RDF', 'O'),
 ('সালে,', 'NC', 'O'),
 ('রেকর্ডিং', 'NC', 'O'),
 ('লাইব্রেরি', 'NC', 'B-GRP'),
 ('অফ', 'NC', 'I-GRP'),
 ('কংগ্রেস', 'NP', 'I-GRP'),
 ('সংরক্ষণের', 'NC', 'O'),
 ('জন্য', 'PP', 'O'),
 ('নির্বাচিত', 'JJ', 'O'),
 ('হয়েছিল।', 'PU', 'O')]

In [24]:
test_sents = add_pos_to_all(test)

100%|██████████| 133120/133120 [19:20<00:00, 114.68it/s]


In [25]:
test_sents[2]

[('পেটিকোট', 'NC', 'B-PROD'),
 ('এর', 'PPR', 'O'),
 ('জন্য', 'PP', 'O'),
 ('সঠিক', 'JJ', 'O'),
 ('তাপমাত্রা', 'NC', 'O')]

In [5]:
# with open('saved_pickles/train_sents.pickle', 'wb') as handle:
#     pickle.dump(train_sents, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('saved_pickles/dev_sents.pickle', 'wb') as handle:
#     pickle.dump(dev_sents, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('saved_pickles/test_sents.pickle', 'wb') as handle:
#     pickle.dump(test_sents, handle, protocol=pickle.HIGHEST_PROTOCOL)

NameError: name 'train_sents' is not defined

In [16]:
# word frequency information

def get_all_words(sents):
    words = []
    for i in range(len(sents)):
        for w in sents[i]:
            words += [w[0]]
    return words

all_train_words = get_all_words(train_sents)
len(all_train_words)

word_freq = {}
for w in all_train_words:
    if w in word_freq:
        word_freq[w] += 1
    else:
        word_freq[w] = 1

In [17]:
# Gazzeter Implementation

GRP_bn = {}
LOC_bn = {}
PER_bn = {}
PROD_bn = {}
CORP_bn = {}
CW_bn = {}


with open('gazz_main/CORP.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        CORP_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                CORP_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/CW.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        CW_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                CW_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/GRP.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        GRP_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                GRP_bn[" ".join(words[i: i+size])] = 1


with open('gazz_main/LOC.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        LOC_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                LOC_bn[" ".join(words[i: i+size])] = 1

with open('gazz_main/PER.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        PER_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                PER_bn[" ".join(words[i: i+size])] = 1


with open('gazz_main/PROD.txt') as file:
    lines = [x.strip() for x in file.readlines()]
    for l in lines:
        PROD_bn[l] = 1
        words = l.split()
        for size in range(2, 4, 1):
            for i in range(len(words)):
                if (i + size) >= len(words):
                    break
                PROD_bn[" ".join(words[i: i+size])] = 1

In [27]:
# feature extraction for Conditional Random Field - Bangla NER

def wordToFeatures(sent, idx):
    word = sent[idx][0]
    postag = sent[idx][1]
    
    # cluster_id = kmeans.predict([w2v.get_vector(word)])[0]
    features = {
        'bias': 1.0,
        'word': word,
        # 'cluster_id': cluster_id,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word.isdigit': word.isdigit(),
        'index': idx,
        'length': len(word),
        'postag': postag,
        'freq': word_freq[word] if word in word_freq else 0,
        'is_corp': int(word in CORP_bn), 
        'is_cw': int(word in CW_bn), 
        'is_grp': int(word in GRP_bn), 
        'is_loc': int(word in LOC_bn), 
        'is_per': int(word in PER_bn), 
        'is_prod': int(word in PROD_bn),
    }
    
    if idx > 0:
        sub = sent[idx-1][0] + " "+ sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx < len(sent) - 1:
        sub = sent[idx][0] + " "+ sent[idx+1][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx > 1:
        sub = sent[idx-2][0] + " "+ sent[idx-1][0] + " " + sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    if idx < len(sent) - 2:
        sub = sent[idx][0] + " "+ sent[idx+1][0] + " " + sent[idx+2][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
        
    
    for i in range(1, 3):
        if idx < i:
            break
        wordi = sent[idx-i][0]
        postagi = sent[idx-i][1]
        # cluster_id_i = kmeans.predict([w2v.get_vector(wordi)])[0]
        features.update({
            '-{}:word'.format(i): wordi,
            # '-{}:cluster_id'.format(i): cluster_id_i,
            '-{}:word[-3:]'.format(i): wordi[-3:],
            '-{}:word[-2:]'.format(i): wordi[-2:],
            '-{}:word[:3]'.format(i): wordi[:3],
            '-{}:word[:2]'.format(i): wordi[:2],
            '-{}:word.isdigit'.format(i): wordi.isdigit(),
            '-{}:postag'.format(i): postagi,
            '-{}:is_corp'.format(i): int(word in CORP_bn), 
            '-{}:is_cw'.format(i): int(word in CW_bn), 
            '-{}:is_grp'.format(i): int(word in GRP_bn), 
            '-{}:is_loc'.format(i): int(word in LOC_bn), 
            '-{}:is_per'.format(i): int(word in PER_bn), 
            '-{}:is_prod'.format(i): int(word in PROD_bn),
        })
    
    for i in range(1, 3):
        if (idx+i) >= len(sent):
            break
        wordi = sent[idx+i][0]
        postagi = sent[idx+i][1]
        # cluster_id_i = kmeans.predict([w2v.get_vector(wordi)])[0]
        features.update({
            '{}:word'.format(i): wordi,
            # '{}:cluster_id'.format(i): cluster_id_i,
            '{}:word[-3:]'.format(i): wordi[-3:],
            '{}:word[-2:]'.format(i): wordi[-2:],
            '{}:word[:3]'.format(i): wordi[:3],
            '{}:word[:2]'.format(i): wordi[:2],
            '{}:word.isdigit'.format(i): wordi.isdigit(),
            '{}:postag'.format(i): postagi,
            '-{}:is_corp'.format(i): int(word in CORP_bn), 
            '-{}:is_cw'.format(i): int(word in CW_bn), 
            '-{}:is_grp'.format(i): int(word in GRP_bn), 
            '-{}:is_loc'.format(i): int(word in LOC_bn), 
            '-{}:is_per'.format(i): int(word in PER_bn), 
            '-{}:is_prod'.format(i): int(word in PROD_bn),
        })
        
    if idx == 0:
        features['BOS'] = True
    if idx == len(sent) - 1:
        features['EOS'] = True
    
    return features

def sentTofeatures(sent):
    return [wordToFeatures(sent, i) for i in range(len(sent))]

def sentTolabels(sent):
    return [label for token, postag, label in sent]

In [18]:
# newwwwwwwwwwwwwwwwwwwwwwwww

def wordToFeatures(sent, idx):
    word = sent[idx][0]
    postag = sent[idx][1]
    
    # cluster_id = kmeans.predict([w2v.get_vector(word)])[0]
    features = {
        'bias': 1.0,
        'word': word,
        # 'cluster_id': cluster_id,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word.isdigit': word.isdigit(),
        'index': idx,
        'length': len(word),
        'postag': postag,
        'freq': word_freq[word] if word in word_freq else 0,
        'is_corp': int(word in CORP_bn), 
        'is_cw': int(word in CW_bn), 
        'is_grp': int(word in GRP_bn), 
        'is_loc': int(word in LOC_bn), 
        'is_per': int(word in PER_bn), 
        'is_prod': int(word in PROD_bn),
    }
    
    if idx > 0:
        sub = sent[idx-1][0] + " "+ sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx < len(sent) - 1:
        sub = sent[idx][0] + " "+ sent[idx+1][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    
    if idx > 1:
        sub = sent[idx-2][0] + " "+ sent[idx-1][0] + " " + sent[idx][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
    if idx < len(sent) - 2:
        sub = sent[idx][0] + " "+ sent[idx+1][0] + " " + sent[idx+2][0]
        if sub in CORP_bn:
            features['is_corp'] = 2
        if sub in CW_bn:
            features['is_cw'] = 2
        if sub in GRP_bn:
            features['is_grp'] = 2
        if sub in LOC_bn:
            features['is_loc'] = 2
        if sub in PER_bn:
            features['is_per'] = 2
        if sub in PROD_bn:
            features['is_prod'] = 2
        
    
    for i in range(1, 3):
        if idx < i:
            break
        wordi = sent[idx-i][0]
        postagi = sent[idx-i][1]
        # cluster_id_i = kmeans.predict([w2v.get_vector(wordi)])[0]
        features.update({
            '-{}:word'.format(i): wordi,
            # '-{}:cluster_id'.format(i): cluster_id_i,
            '-{}:word[-3:]'.format(i): wordi[-3:],
            '-{}:word[-2:]'.format(i): wordi[-2:],
            '-{}:word[:3]'.format(i): wordi[:3],
            '-{}:word[:2]'.format(i): wordi[:2],
            '-{}:word.isdigit'.format(i): wordi.isdigit(),
            '-{}:postag'.format(i): postagi,
            '-{}:is_corp'.format(i): int(wordi in CORP_bn), 
            '-{}:is_cw'.format(i): int(wordi in CW_bn), 
            '-{}:is_grp'.format(i): int(wordi in GRP_bn), 
            '-{}:is_loc'.format(i): int(wordi in LOC_bn), 
            '-{}:is_per'.format(i): int(wordi in PER_bn), 
            '-{}:is_prod'.format(i): int(wordi in PROD_bn),
        })
    
    for i in range(1, 3):
        if (idx+i) >= len(sent):
            break
        wordi = sent[idx+i][0]
        postagi = sent[idx+i][1]
        # cluster_id_i = kmeans.predict([w2v.get_vector(wordi)])[0]
        features.update({
            '{}:word'.format(i): wordi,
            # '{}:cluster_id'.format(i): cluster_id_i,
            '{}:word[-3:]'.format(i): wordi[-3:],
            '{}:word[-2:]'.format(i): wordi[-2:],
            '{}:word[:3]'.format(i): wordi[:3],
            '{}:word[:2]'.format(i): wordi[:2],
            '{}:word.isdigit'.format(i): wordi.isdigit(),
            '{}:postag'.format(i): postagi,
            '-{}:is_corp'.format(i): int(wordi in CORP_bn), 
            '-{}:is_cw'.format(i): int(wordi in CW_bn), 
            '-{}:is_grp'.format(i): int(wordi in GRP_bn), 
            '-{}:is_loc'.format(i): int(wordi in LOC_bn), 
            '-{}:is_per'.format(i): int(wordi in PER_bn), 
            '-{}:is_prod'.format(i): int(wordi in PROD_bn),
        })
        
    if idx == 0:
        features['BOS'] = True
    if idx == len(sent) - 1:
        features['EOS'] = True
    
    return features

def sentTofeatures(sent):
    return [wordToFeatures(sent, i) for i in range(len(sent))]

def sentTolabels(sent):
    return [label for token, postag, label in sent]

In [19]:
%%time
X_train = [sentTofeatures(s) for s in train_sents]
y_train = [sentTolabels(s) for s in train_sents]

CPU times: total: 4.33 s
Wall time: 4.33 s


In [20]:
%%time
X_dev = [sentTofeatures(s) for s in dev_sents]
y_dev = [sentTolabels(s) for s in dev_sents]

CPU times: total: 234 ms
Wall time: 240 ms


In [26]:
%%time
X_test = [sentTofeatures(s) for s in test_sents]
y_test = [sentTolabels(s) for s in test_sents]

CPU times: total: 14.2 s
Wall time: 14.2 s


In [21]:
crf = crfsuite.CRF(
    verbose='true',
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=150,
    all_possible_transitions=True
)

try:
    crf.fit(X_train, y_train)
except:
    pass

loading training data to CRFsuite: 100%|██████████| 15301/15301 [00:05<00:00, 3051.73it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 396086
Seconds required: 1.626

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 150
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=3.24  loss=382243.43 active=389773 feature_norm=0.12
Iter 2   time=0.65  loss=379054.96 active=381541 feature_norm=0.12
Iter 3   time=0.65  loss=180566.12 active=330603 feature_norm=0.37
Iter 4   time=2.59  loss=179367.26 active=370275 feature_norm=0.42
Iter 5   time=1.28  loss=169263.38 active=341757 feature_norm=0.29
Iter 6   time=0.65  loss=164569.06 active=355790 feature_norm=0.33
Iter 7   time=1.32  loss=162782.85 active=367587 feature_norm=0.35
Iter 8   time=0.67  loss=162209.30 active=369662 feature_norm=0.35
Iter 9   time=1.28  loss=161617.59 active=369933 feature_norm=

In [22]:
labels = list(crf.classes_)
labels

['O',
 'B-CORP',
 'B-GRP',
 'I-GRP',
 'B-PER',
 'I-PER',
 'I-CORP',
 'B-CW',
 'I-CW',
 'B-PROD',
 'I-PROD',
 'B-LOC',
 'I-LOC']

In [23]:
y_pred = crf.predict(X_dev)
metrics.flat_f1_score(y_dev, y_pred,
                      average='macro', labels=labels)

# 0.814325281001763

0.8075812824435331

In [27]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='macro', labels=labels)

0.8009467990614277

In [49]:
y_test[69]

['B-CORP', 'O', 'O']

In [50]:
y_pred[69]

['B-CORP', 'I-CORP', 'O']