# Entity extraction using BERT

Full tutorial video: https://www.youtube.com/watch?v=MqQ7rqRllIc

## Import everything important

In [1]:
import joblib
import torch
import torch.nn as nn
import transformers

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection

from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import gc



In [2]:
import datetime
print(datetime.datetime.now())

2023-07-20 14:11:17.063716


In [3]:
ner_data_path = '/kaggle/input/sd-key-columns/SD_key_columns.csv'

In [4]:
ner_file = pd.read_csv(ner_data_path)
ner_file.shape

(74439, 2)

In [5]:
ner_file.head(5)

Unnamed: 0,DBA_ID_f,OWNER_NAME
0,UB934057_0,David Elliott
1,UB934058_1,J and J Enterprises
2,UB934059_2,Subsidiary of Ascend One Corporation
3,UB934060_3,Subsidiary of Ascend One Corporation
4,UB934061_4,Subsidiary of Ascend One Corporation


In [6]:
ner_file = ner_file[['DBA_ID_f','OWNER_NAME']]

In [7]:
#ner_file = ner_file[:10000]
print(ner_file.shape)

(74439, 2)


In [8]:
ner_file.rename(columns = {'DBA_ID_f':'Charter_Num','OWNER_NAME':'Name'},inplace= True)

In [9]:
ner_file.head(5)

Unnamed: 0,Charter_Num,Name
0,UB934057_0,David Elliott
1,UB934058_1,J and J Enterprises
2,UB934059_2,Subsidiary of Ascend One Corporation
3,UB934060_3,Subsidiary of Ascend One Corporation
4,UB934061_4,Subsidiary of Ascend One Corporation


In [10]:
def process_test_data(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    
    df = df[['DBA_ID_f','OWNER_NAME']]
    #df = df[:10000]
    df.rename(columns = {'DBA_ID_f':'Charter_Num','OWNER_NAME':'Name'},inplace= True)
    df['Name'].fillna('unknown',inplace= True)
    print(df.head(5))
    #df['sent_len'] = df['Bus_Name'].str.split().len()
    
    #df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")

    #enc_pos = preprocessing.LabelEncoder()
    #enc_tag = preprocessing.LabelEncoder()

    #df.loc[:, "POS"] = enc_pos.fit_transform(df["POS"])
    #df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
    
    long_data_frame = pd.DataFrame([(row[1].Charter_Num,word) for row in df[['Charter_Num','Name']].iterrows()
                         for word in row[1].Name.split()],columns= ['Charter_Num','word'])
    long_data_frame['pos'] = 0
    long_data_frame['tags'] = 0

    #sentences = long_data_frame.groupby(['Charter_Num'])['Bus_Name'].apply(list).values
    
    #pos=[[0] * len(sentences)], 
    #tags=[[0] * len(sentences)]
    
    sentences = long_data_frame.groupby("Charter_Num")["word"].apply(list).values
    pos = long_data_frame.groupby("Charter_Num")["pos"].apply(list).values
    tag = long_data_frame.groupby("Charter_Num")["tags"].apply(list).values
    Charter_Num_list = long_data_frame.groupby("Charter_Num")["Charter_Num"].apply(list).values
    
    return sentences, pos, tag,Charter_Num_list

In [11]:
#data_path = '../input/oh-formatch-v2-unknown/OH_formatch_v2_unknown.csv'
test_pred_sentences,test_pos,test_tag,Charter_Num_list = process_test_data(ner_data_path)

  Charter_Num                                  Name
0  UB934057_0                         David Elliott
1  UB934058_1                   J and J Enterprises
2  UB934059_2  Subsidiary of Ascend One Corporation
3  UB934060_3  Subsidiary of Ascend One Corporation
4  UB934061_4  Subsidiary of Ascend One Corporation


In [12]:
    df = pd.read_csv(ner_data_path, encoding="latin-1")
    df = df[['DBA_ID_f','OWNER_NAME']]
    #df = df[:10000]
    df.rename(columns = {'DBA_ID_f':'Charter_Num','OWNER_NAME':'Name'},inplace= True)
    df['Name'].fillna('unknown',inplace= True)
    print(df.head(5))
    #df['sent_len'] = df['Bus_Name'].str.split().len()
    
    #df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")

    #enc_pos = preprocessing.LabelEncoder()
    #enc_tag = preprocessing.LabelEncoder()

    #df.loc[:, "POS"] = enc_pos.fit_transform(df["POS"])
    #df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])
    
    long_data_frame = pd.DataFrame([(row[1].Charter_Num,word) for row in df[['Charter_Num','Name']].iterrows()
                         for word in row[1].Name.split()],columns= ['Charter_Num','word'])
    long_data_frame['pos'] = 0
    long_data_frame['tags'] = 0


  Charter_Num                                  Name
0  UB934057_0                         David Elliott
1  UB934058_1                   J and J Enterprises
2  UB934059_2  Subsidiary of Ascend One Corporation
3  UB934060_3  Subsidiary of Ascend One Corporation
4  UB934061_4  Subsidiary of Ascend One Corporation


In [13]:
long_data_frame

Unnamed: 0,Charter_Num,word,pos,tags
0,UB934057_0,David,0,0
1,UB934057_0,Elliott,0,0
2,UB934058_1,J,0,0
3,UB934058_1,and,0,0
4,UB934058_1,J,0,0
...,...,...,...,...
207439,UB233805_74600,Leiferman,0,0
207440,UB233806_74601,Cody,0,0
207441,UB233806_74601,Tyler,0,0
207442,UB233806_74601,Beitzel,0,0


## Some config

In [14]:
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 8
    EPOCHS = 3
    BASE_MODEL_PATH = "../input/bert-base-uncased/"
    MODEL_PATH = '/kaggle/input/entity-classification-model-bert/model.bin'
    TRAINING_FILE = "../input/entity-annotated-corpus/ner_dataset.csv"
    TOKENIZER = transformers.BertTokenizer.from_pretrained(
        BASE_MODEL_PATH,
        do_lower_case=True
    )

## Dataset

In [15]:
class EntityDataset:
    def __init__(self, texts, pos, tags):
        self.texts = texts
        self.pos = pos
        self.tags = tags
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = self.texts[item]
        pos = self.pos[item]
        tags = self.tags[item]

        ids = []
        target_pos = []
        target_tag =[]

        for i, s in enumerate(text):
            inputs = config.TOKENIZER.encode(
                s,
                add_special_tokens=False
            )
            # abhishek: ab ##hi ##sh ##ek
            input_len = len(inputs)
            ids.extend(inputs)
            target_pos.extend([pos[i]] * input_len)
            target_tag.extend([tags[i]] * input_len)

        ids = ids[:config.MAX_LEN - 2]
        target_pos = target_pos[:config.MAX_LEN - 2]
        target_tag = target_tag[:config.MAX_LEN - 2]

        ids = [101] + ids + [102]
        target_pos = [0] + target_pos + [0]
        target_tag = [0] + target_tag + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        padding_len = config.MAX_LEN - len(ids)

        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_pos = target_pos + ([0] * padding_len)
        target_tag = target_tag + ([0] * padding_len)

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_pos": torch.tensor(target_pos, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
        }

## Training and evaluation functions

In [16]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    model.train()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        _, _, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
    return final_loss / len(data_loader)


def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0
    for data in tqdm(data_loader, total=len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        _, _, loss = model(**data)
        final_loss += loss.item()
    return final_loss / len(data_loader)

## Loss function and model

In [17]:
def loss_fn(output, target, mask, num_labels):
    lfn = nn.CrossEntropyLoss()
    active_loss = mask.view(-1) == 1
    active_logits = output.view(-1, num_labels)
    active_labels = torch.where(
        active_loss,
        target.view(-1),
        torch.tensor(lfn.ignore_index).type_as(target)
    )
    loss = lfn(active_logits, active_labels)
    return loss


class EntityModel(nn.Module):
    def __init__(self, num_tag, num_pos):
        super(EntityModel, self).__init__()
        self.num_tag = num_tag
        self.num_pos = num_pos
        self.bert = transformers.BertModel.from_pretrained(
            config.BASE_MODEL_PATH
        )
        self.bert_drop_1 = nn.Dropout(0.3)
        self.bert_drop_2 = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)
        self.out_pos = nn.Linear(768, self.num_pos)
    
    def forward(
        self, 
        ids, 
        mask, 
        token_type_ids, 
        target_pos, 
        target_tag
    ):
        o1, _ = self.bert(
            ids, 
            attention_mask=mask, 
            token_type_ids=token_type_ids
        )

        bo_tag = self.bert_drop_1(o1)
        bo_pos = self.bert_drop_2(o1)

        tag = self.out_tag(bo_tag)
        pos = self.out_pos(bo_pos)

        loss_tag = loss_fn(tag, target_tag, mask, self.num_tag)
        loss_pos = loss_fn(pos, target_pos, mask, self.num_pos)

        loss = (loss_tag + loss_pos) / 2

        return tag, pos, loss

## Data processing

In [18]:
'''
def process_data(data_path):
    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
    df['Name'].fillna('unknown',inplace= True)

    enc_pos = preprocessing.LabelEncoder()
    enc_tag = preprocessing.LabelEncoder()

    df.loc[:, "POS"] = enc_pos.fit_transform(df["POS"])
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])

    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    pos = df.groupby("Sentence #")["POS"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values
    return sentences, pos, tag, enc_pos, enc_tag
'''

'\ndef process_data(data_path):\n    df = pd.read_csv(data_path, encoding="latin-1")\n    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")\n    df[\'Name\'].fillna(\'unknown\',inplace= True)\n\n    enc_pos = preprocessing.LabelEncoder()\n    enc_tag = preprocessing.LabelEncoder()\n\n    df.loc[:, "POS"] = enc_pos.fit_transform(df["POS"])\n    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])\n\n    sentences = df.groupby("Sentence #")["Word"].apply(list).values\n    pos = df.groupby("Sentence #")["POS"].apply(list).values\n    tag = df.groupby("Sentence #")["Tag"].apply(list).values\n    return sentences, pos, tag, enc_pos, enc_tag\n'

In [19]:
df = pd.read_csv( "../input/entity-annotated-corpus/ner_dataset.csv", encoding="latin-1")
df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")
#df = df[:10000]

df.shape

(1048575, 4)

In [20]:
df.head(15)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


## Training

In [21]:
'''
sentences, pos, tag, enc_pos, enc_tag = process_data(config.TRAINING_FILE)

meta_data = {
    "enc_pos": enc_pos,
    "enc_tag": enc_tag
}

joblib.dump(meta_data, "meta.bin")

num_pos = len(list(enc_pos.classes_))
num_tag = len(list(enc_tag.classes_))

(
    train_sentences,
    test_sentences,
    train_pos,
    test_pos,
    train_tag,
    test_tag
) = model_selection.train_test_split(
    sentences, 
    pos, 
    tag, 
    random_state=42, 
    test_size=0.1
)

train_dataset = EntityDataset(
    texts=train_sentences, pos=train_pos, tags=train_tag
)

train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4
)

valid_dataset = EntityDataset(
    texts=test_sentences, pos=test_pos, tags=test_tag
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1
)

device = torch.device("cuda")
model = EntityModel(num_tag=num_tag, num_pos=num_pos)
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.0,
    },
]

num_train_steps = int(
    len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCHS
)
optimizer = AdamW(optimizer_parameters, lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=num_train_steps
)

best_loss = np.inf
for epoch in range(config.EPOCHS):
    train_loss = train_fn(
        train_data_loader, 
        model, 
        optimizer, 
        device, 
        scheduler
    )
    test_loss = eval_fn(
        valid_data_loader,
        model,
        device
    )
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    if test_loss < best_loss:
        torch.save(model.state_dict(), config.MODEL_PATH)
        best_loss = test_loss
        
'''

'\nsentences, pos, tag, enc_pos, enc_tag = process_data(config.TRAINING_FILE)\n\nmeta_data = {\n    "enc_pos": enc_pos,\n    "enc_tag": enc_tag\n}\n\njoblib.dump(meta_data, "meta.bin")\n\nnum_pos = len(list(enc_pos.classes_))\nnum_tag = len(list(enc_tag.classes_))\n\n(\n    train_sentences,\n    test_sentences,\n    train_pos,\n    test_pos,\n    train_tag,\n    test_tag\n) = model_selection.train_test_split(\n    sentences, \n    pos, \n    tag, \n    random_state=42, \n    test_size=0.1\n)\n\ntrain_dataset = EntityDataset(\n    texts=train_sentences, pos=train_pos, tags=train_tag\n)\n\ntrain_data_loader = torch.utils.data.DataLoader(\n    train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4\n)\n\nvalid_dataset = EntityDataset(\n    texts=test_sentences, pos=test_pos, tags=test_tag\n)\n\nvalid_data_loader = torch.utils.data.DataLoader(\n    valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=1\n)\n\ndevice = torch.device("cuda")\nmodel = EntityModel(num_tag=num_

## Inference

In [22]:
'''
Loreto House
Art Weeks
Rob Painter
Jacquelyn Minchew LPC
Soleil Salon
Nancy M Service, PhD
'''

'\nLoreto House\nArt Weeks\nRob Painter\nJacquelyn Minchew LPC\nSoleil Salon\nNancy M Service, PhD\n'

In [23]:


meta_data = joblib.load("/kaggle/input/entity-classification-model-bert/meta.bin")
enc_pos = meta_data["enc_pos"]
enc_tag = meta_data["enc_tag"]

num_pos = len(list(enc_pos.classes_))
num_tag = len(list(enc_tag.classes_))
sentence = """
ROSE M KENNELLY
"""
'''
sentence = """
Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a 
reporter for the network, about protests in Minnesota and elsewhere. 
"""
'''
#sentence = "George Washington went to Washington"

tokenized_sentence = config.TOKENIZER.encode(sentence)
input_ids = torch.tensor([tokenized_sentence]).cuda()
tokens = config.TOKENIZER.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])


sentence = sentence.split()
print(sentence)
print(tokens)
print(tokenized_sentence)

test_dataset = EntityDataset(
    texts=[sentence], 
    pos=[[0] * len(sentence)], 
    tags=[[0] * len(sentence)]
)

device = torch.device("cuda")
model = EntityModel(num_tag=num_tag, num_pos=num_pos)
model.load_state_dict(torch.load(config.MODEL_PATH))
model.to(device)

with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    tag, pos, _ = model(**data)

    print(
        enc_tag.inverse_transform(
            tag.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
    
    '''
    print(
        enc_pos.inverse_transform(
            pos.argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
    '''

['ROSE', 'M', 'KENNELLY']
['[CLS]', 'rose', 'm', 'ken', '##nell', '##y', '[SEP]']
[101, 3123, 1049, 6358, 9091, 2100, 102]
['B-art' 'B-per' 'I-per' 'I-per' 'I-per' 'I-per' 'B-art']


In [24]:
model.load_state_dict(torch.load(config.MODEL_PATH))

<All keys matched successfully>

In [25]:
def ner_output(sentence):

    #sentence = "George Washington went to Washington"

    tokenized_sentence = config.TOKENIZER.encode(sentence)
    input_ids = torch.tensor([tokenized_sentence]).cuda()
    tokens = config.TOKENIZER.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])


    sentence = sentence.split()
    #print(sentence)
    #print(tokens)
    #print(tokenized_sentence[1:-1])

    test_dataset = EntityDataset(
        texts=[sentence], 
        pos=[[0] * len(sentence)], 
        tags=[[0] * len(sentence)]
    )

    #device = torch.device("cuda")
    #model = EntityModel(num_tag=num_tag, num_pos=num_pos)
    #model.load_state_dict(torch.load(config.MODEL_PATH))
    #model.to(device)

    with torch.no_grad():
        data = test_dataset[0]
        for k, v in data.items():
            data[k] = v.to(device).unsqueeze(0)
        tag, pos, _ = model(**data)
        
        '''
        print(
            enc_tag.inverse_transform(
                tag.argmax(2).cpu().numpy().reshape(-1)
            )[:len(tokenized_sentence)]
        )
      '''
    return tokens,enc_tag.inverse_transform( tag.argmax(2).cpu().numpy().reshape(-1))[:len(tokenized_sentence)]

In [26]:
df = pd.read_csv(ner_data_path, encoding="latin-1")
df = df = df[['DBA_ID_f','OWNER_NAME']]
#df = df[:10000]
df.rename(columns = {'DBA_ID_f':'Charter_Num','OWNER_NAME':'Name'},inplace= True)
df['Name'].fillna('unknown',inplace= True)
print(df.head(5))
#df = pd.read_csv(config.TRAINING_FILE, encoding="latin-1",nrows = 1000)
#df['sent_len'] = df['Bus_Name'].str.split().str.len()

  Charter_Num                                  Name
0  UB934057_0                         David Elliott
1  UB934058_1                   J and J Enterprises
2  UB934059_2  Subsidiary of Ascend One Corporation
3  UB934060_3  Subsidiary of Ascend One Corporation
4  UB934061_4  Subsidiary of Ascend One Corporation


In [27]:
df.head(5)

Unnamed: 0,Charter_Num,Name
0,UB934057_0,David Elliott
1,UB934058_1,J and J Enterprises
2,UB934059_2,Subsidiary of Ascend One Corporation
3,UB934060_3,Subsidiary of Ascend One Corporation
4,UB934061_4,Subsidiary of Ascend One Corporation


In [28]:
Entity_df = pd.DataFrame(test_pred_sentences)
Entity_df.columns = ['Name_Split']
Entity_df['Charter_Num_list'] = Charter_Num_list
Entity_df['Charter_Num']      = Entity_df['Charter_Num_list'].apply(lambda row: row[0] )
Entity_df #= pd.DataFrame()

Unnamed: 0,Name_Split,Charter_Num_list,Charter_Num
0,"[THE, RUDE, BAND, LLC]","[UB135909_48408, UB135909_48408, UB135909_4840...",UB135909_48408
1,"[Hello, Mortgage,, Inc.]","[UB135912_48409, UB135912_48409, UB135912_48409]",UB135912_48409
2,"[CONAGRA, FOODS, SALES,, LLC]","[UB135923_48410, UB135923_48410, UB135923_4841...",UB135923_48410
3,"[Megan, Brennan]","[UB135924_48411, UB135924_48411]",UB135924_48411
4,"[JBS, United,, Inc.]","[UB135928_48412, UB135928_48412, UB135928_48412]",UB135928_48412
...,...,...,...
74434,"[Brian, and, Candace, McKay]","[UB973241_48403, UB973241_48403, UB973241_4840...",UB973241_48403
74435,"[Sioux, Falls, Cryo,, LLC]","[UB973242_48404, UB973242_48404, UB973242_4840...",UB973242_48404
74436,"[Singh, BIr, Tamang]","[UB973243_48405, UB973243_48405, UB973243_48405]",UB973243_48405
74437,"[Man, Rai]","[UB973243_48406, UB973243_48406]",UB973243_48406


In [29]:
test_dataset = EntityDataset(
    texts=test_pred_sentences, pos=test_pos, tags=test_tag
)

test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=128, num_workers=1
)

In [30]:
with torch.no_grad():
    tag_list= []
    for data in tqdm(test_data_loader, total=len(test_data_loader)):
            
            for k, v in data.items():
                data[k] = v.to(device)
            tag, pos, _ = model(**data)
            tag_list.append(tag)
            
tag_all = torch.cat(tag_list)

100%|██████████| 582/582 [04:13<00:00,  2.30it/s]


In [31]:
df.head(5)

Unnamed: 0,Charter_Num,Name
0,UB934057_0,David Elliott
1,UB934058_1,J and J Enterprises
2,UB934059_2,Subsidiary of Ascend One Corporation
3,UB934060_3,Subsidiary of Ascend One Corporation
4,UB934061_4,Subsidiary of Ascend One Corporation


In [32]:
df['tokenized_sent'] = df['Name'].apply(lambda row:  config.TOKENIZER.encode(row))

In [33]:
count = 0
enc_tag_list= []
for i in tag_all:
    
    enc_tag_list.append(enc_tag.inverse_transform(
            i[None,:,:].argmax(2).cpu().numpy().reshape(-1)[:len(df.loc[:,'tokenized_sent'][count])]
        ))
    count = count+1

In [34]:
Entity_df['enc_tag'] = enc_tag_list

In [35]:
Entity_df = pd.merge(Entity_df,df,how = 'inner')
Entity_df

Unnamed: 0,Name_Split,Charter_Num_list,Charter_Num,enc_tag,Name,tokenized_sent
0,"[THE, RUDE, BAND, LLC]","[UB135909_48408, UB135909_48408, UB135909_4840...",UB135909_48408,"[B-art, O, O, I-org]",THE RUDE BAND LLC,"[101, 1996, 12726, 2316, 11775, 102]"
1,"[Hello, Mortgage,, Inc.]","[UB135912_48409, UB135912_48409, UB135912_48409]",UB135912_48409,"[B-art, O, O, O, B-org, I-org]","Hello Mortgage, Inc.","[101, 7592, 14344, 1010, 4297, 1012, 102]"
2,"[CONAGRA, FOODS, SALES,, LLC]","[UB135923_48410, UB135923_48410, UB135923_4841...",UB135923_48410,"[B-art, B-org, B-org, B-org, I-org, I-org, I-o...","CONAGRA FOODS SALES, LLC","[101, 9530, 8490, 2527, 9440, 4341, 1010, 1177..."
3,"[Megan, Brennan]","[UB135924_48411, UB135924_48411]",UB135924_48411,"[B-art, B-per, I-per, B-art, B-per, O, B-per, O]",Megan Brennan,"[101, 12756, 13962, 102]"
4,"[JBS, United,, Inc.]","[UB135928_48412, UB135928_48412, UB135928_48412]",UB135928_48412,"[B-art, B-org, B-org, I-org, I-org, I-org, I-o...","JBS United, Inc.","[101, 1046, 5910, 2142, 1010, 4297, 1012, 102]"
...,...,...,...,...,...,...
74434,"[Brian, and, Candace, McKay]","[UB973241_48403, UB973241_48403, UB973241_4840...",UB973241_48403,"[B-art, B-per, O, B-per, I-per, B-art, B-per]",Brian and Candace McKay,"[101, 4422, 1998, 22905, 16225, 102]"
74435,"[Sioux, Falls, Cryo,, LLC]","[UB973242_48404, UB973242_48404, UB973242_4840...",UB973242_48404,"[B-art, B-geo, I-geo, I-geo, I-geo, O, O, B-art]","Sioux Falls Cryo, LLC","[101, 16615, 4212, 5390, 2080, 1010, 11775, 102]"
74436,"[Singh, BIr, Tamang]","[UB973243_48405, UB973243_48405, UB973243_48405]",UB973243_48405,"[B-art, B-per, I-org, I-org, I-org, I-org, B-art]",Singh BIr Tamang,"[101, 5960, 12170, 2099, 17214, 5654, 102]"
74437,"[Man, Rai]","[UB973243_48406, UB973243_48406]",UB973243_48406,"[B-art, B-per, I-per, B-art, B-per, O]",Man Rai,"[101, 2158, 15547, 102]"


In [36]:
Entity_df.to_csv('my3_Entity_BERT_SD_orig.csv',index= False)

In [37]:
import datetime
print(datetime.datetime.now())

2023-07-20 14:17:28.796258
