In [48]:
import torch
from collections import OrderedDict
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [75]:
def bert_parser(entity_lst):
    orgs = []
    n_entities = len(entity_lst)
    org_dict = OrderedDict()
    for word_dict in entity_lst:
        
        cur_word = word_dict['word'].replace("#", "")
        print(cur_word)
        start_pos = word_dict['start']
        end_pos = word_dict['end']
        entity_type = word_dict['entity']
        
        print(entity_type)
        if entity_type == 'B-ORG':    
            org_dict[cur_word] = {'start':start_pos, 'end':end_pos}
            print("Item added:", cur_word)
        
        elif entity_type == 'I-ORG':
            
            prev_entity = list(org_dict.items())[-1]
            prev_word = prev_entity[0]
            new_word = prev_word + cur_word
            new_endpos = end_pos
            
            del org_dict[prev_word]
            
            prev_start = prev_entity[1]['start']
            org_dict[new_word] = {'start':prev_start, 'end':new_endpos}

            
    return org_dict

In [26]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_classifier = pipeline("ner", model=model, tokenizer=tokenizer)

Downloading: 100%|██████████| 59.0/59.0 [00:00<00:00, 72.2kB/s]
Downloading: 100%|██████████| 829/829 [00:00<00:00, 1.02MB/s]
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 516kB/s] 
Downloading: 100%|██████████| 2.00/2.00 [00:00<00:00, 2.40kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 136kB/s]
Downloading: 100%|██████████| 433M/433M [01:31<00:00, 4.75MB/s] 


In [34]:
ner_classifier("Conflict of interest Name is an employee and stockholder of Novo Nordisk A/S.")

[{'entity': 'B-ORG',
  'score': 0.9925085,
  'index': 12,
  'word': 'Nov',
  'start': 60,
  'end': 63},
 {'entity': 'I-ORG',
  'score': 0.99516684,
  'index': 13,
  'word': '##o',
  'start': 63,
  'end': 64},
 {'entity': 'I-ORG',
  'score': 0.9968747,
  'index': 14,
  'word': 'Nord',
  'start': 65,
  'end': 69},
 {'entity': 'I-ORG',
  'score': 0.9954952,
  'index': 15,
  'word': '##isk',
  'start': 69,
  'end': 72},
 {'entity': 'I-ORG',
  'score': 0.993123,
  'index': 16,
  'word': 'A',
  'start': 73,
  'end': 74},
 {'entity': 'I-ORG',
  'score': 0.9313874,
  'index': 17,
  'word': '/',
  'start': 74,
  'end': 75},
 {'entity': 'I-ORG',
  'score': 0.9950965,
  'index': 18,
  'word': 'S',
  'start': 75,
  'end': 76}]

In [84]:
bert_out = ner_classifier("I have conflicting interests with Microsoft, Google and Myntra")

In [86]:
bert_parser(bert_out)

Microsoft
B-ORG
Item added: Microsoft
Google
B-ORG
Item added: Google
My
B-ORG
Item added: My
nt
I-ORG
ra
I-ORG


OrderedDict([('Microsoft', {'start': 34, 'end': 43}),
             ('Google', {'start': 45, 'end': 51}),
             ('Myntra', {'start': 56, 'end': 62})])

In [36]:
ner_classifier("I have conflicting interests with Microsoft, Google and Novartis Pharmaceuticals")

[{'entity': 'B-ORG',
  'score': 0.9990582,
  'index': 6,
  'word': 'Microsoft',
  'start': 34,
  'end': 43},
 {'entity': 'B-ORG',
  'score': 0.9991136,
  'index': 8,
  'word': 'Google',
  'start': 45,
  'end': 51},
 {'entity': 'B-ORG',
  'score': 0.9992143,
  'index': 10,
  'word': 'Nova',
  'start': 56,
  'end': 60},
 {'entity': 'I-ORG',
  'score': 0.99821794,
  'index': 11,
  'word': '##rt',
  'start': 60,
  'end': 62},
 {'entity': 'I-ORG',
  'score': 0.9986222,
  'index': 12,
  'word': '##is',
  'start': 62,
  'end': 64},
 {'entity': 'I-ORG',
  'score': 0.99932516,
  'index': 13,
  'word': 'Ph',
  'start': 65,
  'end': 67},
 {'entity': 'I-ORG',
  'score': 0.99864125,
  'index': 14,
  'word': '##arma',
  'start': 67,
  'end': 71},
 {'entity': 'I-ORG',
  'score': 0.9989389,
  'index': 15,
  'word': '##ce',
  'start': 71,
  'end': 73},
 {'entity': 'I-ORG',
  'score': 0.99923664,
  'index': 16,
  'word': '##utical',
  'start': 73,
  'end': 79},
 {'entity': 'I-ORG',
  'score': 0.91506857

In [39]:
ner_classifier("Novartis Pharma")

[{'entity': 'B-ORG',
  'score': 0.99914557,
  'index': 1,
  'word': 'Nova',
  'start': 0,
  'end': 4},
 {'entity': 'I-ORG',
  'score': 0.9940522,
  'index': 2,
  'word': '##rt',
  'start': 4,
  'end': 6},
 {'entity': 'I-ORG',
  'score': 0.99879855,
  'index': 3,
  'word': '##is',
  'start': 6,
  'end': 8},
 {'entity': 'I-ORG',
  'score': 0.9992811,
  'index': 4,
  'word': 'Ph',
  'start': 9,
  'end': 11},
 {'entity': 'I-ORG',
  'score': 0.9982556,
  'index': 5,
  'word': '##arma',
  'start': 11,
  'end': 15}]

In [45]:
a = ''
s = 'w'
a += 'd' if s is 'r' else ''

In [81]:
bert_out = ner_classifier("I have conflicting interests with Bloomberg, LayerIV and Myntra")

In [83]:
bert_out

[{'entity': 'B-ORG',
  'score': 0.994367,
  'index': 6,
  'word': 'Bloomberg',
  'start': 34,
  'end': 43},
 {'entity': 'B-ORG',
  'score': 0.99899155,
  'index': 8,
  'word': 'Lay',
  'start': 45,
  'end': 48},
 {'entity': 'B-ORG',
  'score': 0.9952924,
  'index': 9,
  'word': '##er',
  'start': 48,
  'end': 50},
 {'entity': 'I-ORG',
  'score': 0.99627745,
  'index': 10,
  'word': '##I',
  'start': 50,
  'end': 51},
 {'entity': 'I-ORG',
  'score': 0.9964791,
  'index': 11,
  'word': '##V',
  'start': 51,
  'end': 52},
 {'entity': 'B-ORG',
  'score': 0.9985757,
  'index': 13,
  'word': 'My',
  'start': 57,
  'end': 59},
 {'entity': 'I-ORG',
  'score': 0.9959242,
  'index': 14,
  'word': '##nt',
  'start': 59,
  'end': 61},
 {'entity': 'I-ORG',
  'score': 0.97845435,
  'index': 15,
  'word': '##ra',
  'start': 61,
  'end': 63}]

In [82]:
bert_parser(bert_out)

Bloomberg
B-ORG
Item added: Bloomberg
Lay
B-ORG
Item added: Lay
er
B-ORG
Item added: er
I
I-ORG
V
I-ORG
My
B-ORG
Item added: My
nt
I-ORG
ra
I-ORG


OrderedDict([('Bloomberg', {'start': 34, 'end': 43}),
             ('Lay', {'start': 45, 'end': 48}),
             ('erIV', {'start': 48, 'end': 52}),
             ('Myntra', {'start': 57, 'end': 63})])

In [49]:
r = OrderedDict()

In [50]:
r[1] = 0
r[2] = 1

In [51]:
r

OrderedDict([(1, 0), (2, 1)])

In [56]:
k = list(r.items())[-1]

In [59]:
k[0]

2

In [61]:
r[2] = {'a':1, 'b':3}

In [62]:
r

OrderedDict([(1, 0), (2, {'a': 1, 'b': 3})])

In [68]:
bert_out = ner_classifier("Conflict of interest Name is an employee and stockholder of Novo Nordisk A/S.")

In [76]:
bert_parser(bert_out)

Nov
B-ORG
Item added: Nov
o
I-ORG
Nord
I-ORG
isk
I-ORG
A
I-ORG
/
I-ORG
S
I-ORG


OrderedDict([('NovoNordiskA/S', {'start': 60, 'end': 76})])

In [77]:
query = "Conflict of interest Name is an employee and stockholder of Novo Nordisk A/S."

In [80]:
query[59]

' '