In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open(r'C:\Users\us_Ma\Desktop\Share\material_modified.txt', "r", encoding='utf-8') as f:
    origin_txt = f.read()
    # 按文章分割
    origin_txt = origin_txt.replace('\n\n\n\n','\n\n\n')
    artical_list = origin_txt.split('\n\n\n')
    # 去除不包含摘要的文章
    artical_list = [artical for artical in artical_list if "O\n\nNo O\nabstract O\navailable. O" not in artical]
    
artical_list[0]

'1 O\nAD1006929 O\n\nThe O\nobjective O\nof O\nthe O\nresearch O\nwas O\nto O\ninvestigate O\nand O\ndetermine O\nthe O\nmechanism O\nthat O\nproduced O\nelectrically-generated B-theory\nelectron M-theory\nspin M-theory\npolarization E-theory\nin O\nnon-magnetic B-structure\nsemiconductor M-structure\nheterostructures E-structure\n. O\nElectrically-generated B-theory\nelectron M-theory\nspin M-theory\npolarization E-theory\nwas O\nshown O\nto O\nbe O\ninversely O\nproportional O\nto O\nthe O\nmeasured O\nmomentum-dependent B-theory\nspin M-theory\nsplitting E-theory\nin O\nstrained O\nindium B-material\ngallium M-material\narsenide E-material\n, O\ncontrary O\nto O\ntheoretical O\nexpectation. O\nThe O\nmeasurements O\nwere O\nconducted O\nby O\nsystematically O\nvarying O\nthe O\ndirection O\nand O\nmagnitude O\nof O\nthe O\nin-plane O\ncurrent O\nand O\nnet O\ndrift O\nmomentum O\nin O\na O\ndevice O\nwith O\na O\ncross-bargeometry. O\nThe O\nrole O\nof O\nelectrically-generated O\ne

In [3]:
len(artical_list)

700

In [4]:
# 将标记信息存储为DataFrame
df = pd.DataFrame()
for artical in artical_list[101:]:
    word_list = artical.split('\n')
    artical_id = word_list[1].split(' ')[0]
    sentence_id = 0
    for word_tagged in word_list[2:]:
        
        # 处理空字符
        if word_tagged == "":
            continue
        word = word_tagged.split(' ')[0]
        tag = word_tagged.split(' ')[1] 
        
        # Turn BIOES to BIO
        if tag[0] == 'E':
            tag = 'I' + tag[1:]
        if tag[0] == 'S':
            tag = 'B' + tag[1:]
        if tag[0] == 'M':
            tag = 'I' + tag[1:]   
        if tag[2:] == 'phrase':
            tag = 'O'
        if tag[2:] == 'structure' or tag[2:] == 'experiment':
            tag = tag[:2] + 'chemical'
        if tag[2:] == 'coponent':
            tag = tag[:2] + 'component'
        
        error_list = ['GaAs', 'Ga', 'InP', 'ZnSe', 'CdTe']
        if word in error_list and tag[0] != 'O':
            tag = tag[:2] + 'material'
        
        if word == '':
            continue
        
        # 处理包含标点的word
        if word == '.':
            new = pd.DataFrame({
              'word': word,
              'tag': tag,
              'artical_id': artical_id,
              'sentence_id': sentence_id
              },index=[0])
            sentence_id += 1   
        elif word == ',':
            new = pd.DataFrame({
              'word': word,
              'tag': tag,
              'artical_id': artical_id,
              'sentence_id': sentence_id
              },index=[0]) 
        elif ',' == word[-1] and not word[-2].isupper():
            new = pd.DataFrame([{
              'word': word[:-1],
              'tag': tag,
              'artical_id': artical_id,
              'sentence_id': sentence_id
              },
              {
              'word': word[-1],
              'tag': 'O',
              'artical_id': artical_id,
              'sentence_id': sentence_id
              }], index=[0,1])
                
        elif '.' == word[-1] and not word[-2].isupper():
            #word = re.sub('[,.]', '', word)
            new = pd.DataFrame([{
              'word': word[:-1],
              'tag': tag,
              'artical_id': artical_id,
              'sentence_id': sentence_id
              },
              {
              'word': word[-1],
              'tag': 'O',
              'artical_id': artical_id,
              'sentence_id': sentence_id
              }], index=[0,1])
            sentence_id += 1
        else:
            new = pd.DataFrame({
                  'word': word,
                  'tag': tag,
                  'artical_id': artical_id,
                  'sentence_id': sentence_id
                  }, index=[0])
        df = df.append(new, ignore_index=True)
df.head()

Unnamed: 0,word,tag,artical_id,sentence_id
0,We,O,ADA154997,0
1,present,O,ADA154997,0
2,an,O,ADA154997,0
3,Ensemble,B-technology,ADA154997,0
4,Monte,I-technology,ADA154997,0


In [5]:
tag_list_sorted = sorted(list(df.tag.unique()),key=lambda s:s[1:])
labels_to_ids = {k: v for v, k in enumerate(tag_list_sorted)}
ids_to_labels = {v: k for v, k in enumerate(tag_list_sorted)}
labels_to_ids

{'O': 0,
 'B-Auxiliary': 1,
 'I-Auxiliary': 2,
 'B-application': 3,
 'I-application': 4,
 'B-component': 5,
 'I-component': 6,
 'B-material': 7,
 'I-material': 8,
 'B-organization': 9,
 'I-organization': 10,
 'B-technology': 11,
 'I-technology': 12,
 'B-theory': 13,
 'I-theory': 14}

In [6]:
# 将单词和标记连接成句
df['sentence'] = df.groupby(['artical_id','sentence_id'])['word'].transform(lambda x : ' '.join(x))
df['word_labels'] = df.groupby(['artical_id','sentence_id'])['tag'].transform(lambda x : ','.join(x))
df.head()

Unnamed: 0,word,tag,artical_id,sentence_id,sentence,word_labels
0,We,O,ADA154997,0,We present an Ensemble Monte Carlo ( EMC ) stu...,"O,O,O,B-technology,I-technology,I-technology,O..."
1,present,O,ADA154997,0,We present an Ensemble Monte Carlo ( EMC ) stu...,"O,O,O,B-technology,I-technology,I-technology,O..."
2,an,O,ADA154997,0,We present an Ensemble Monte Carlo ( EMC ) stu...,"O,O,O,B-technology,I-technology,I-technology,O..."
3,Ensemble,B-technology,ADA154997,0,We present an Ensemble Monte Carlo ( EMC ) stu...,"O,O,O,B-technology,I-technology,I-technology,O..."
4,Monte,I-technology,ADA154997,0,We present an Ensemble Monte Carlo ( EMC ) stu...,"O,O,O,B-technology,I-technology,I-technology,O..."


In [7]:
data = df[["sentence", "word_labels","artical_id"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels,artical_id
0,We present an Ensemble Monte Carlo ( EMC ) stu...,"O,O,O,B-technology,I-technology,I-technology,O...",ADA154997
1,"the plasma oscillations , to the scattering te...","O,B-technology,I-technology,O,O,O,B-technology...",ADA154997
2,The transient dynamic response of electrons un...,"O,O,O,O,O,O,O,O,O,O,O,B-material,O,O,O,O,O,O,O...",ADA154997
3,Studies comparing the Gunn effect operation wi...,"O,O,O,B-theory,I-theory,I-theory,O,O,O,O,B-the...",N7221801
4,To overcome difficulties met during earlier [w...,"O,O,O,O,O,O,O,O,O,O,O,O,B-component,I-componen...",N7221801


In [8]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 2
EPOCHS = 12
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
# import the pretrained tokenizer of Hugging Face
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [9]:
class dataset():
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_split_into_words=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True,        # cut the sentence if the length is greater than the MAX_LEN
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        # convert labels to ids based on label_to_ids
        labels = [labels_to_ids[label] for label in word_labels] 
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        # set only labels whose first offset position is 0 and the second is not 0, which means it's the first section of a whole word
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                # overwrite label
                encoded_labels[idx] = labels[i]
                i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        # Convert the data into a torch.Tensor
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [10]:
# Split dataset into TrainDataset and TestDataset
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200) 
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Get the dataset that have passed through the tokenizer
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
total_set = dataset(data, tokenizer, MAX_LEN)

FULL Dataset: (3284, 3)
TRAIN Dataset: (2627, 3)
TEST Dataset: (657, 3)


In [11]:
training_set[3]
# inputs: The model package all the words into a dict with id, each id represents a word
# token_type_ids: 0 for the first sentence and special symbol and 1 for the sencond sentence
# attention_mask: set 0 if the corresponding position is [PAD]
# offset_mapping: the slice of a subword position based on a whole word
# labels: the IOB tag ids of a sentence represented 

{'input_ids': tensor([  101,  1109,  4884,  4071,  1132, 11267, 18311,  1918, 19975, 20370,
           113,  4493,  2069,   114,   117, 11267,   118,  4272,  2702, 20370,
           113,   142, 16769,  9565,   114,   117,  1134, 14215,  4493,  2069,
          1114,  4272,  8364, 20370,   113,   151, 21148,   114,   117,  1105,
         10312,  1193, 11168,  8364, 20370,   113,   152, 20002,  2069,   114,
           117,  1107,  1134, 11432,  1104,  1103,  4493,  2069,  4365,  2258,
         10645,  2607,  1104,   170,  6307,  7776,  8515, 14797,  4344,   119,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [12]:
i=3
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[i]["input_ids"]), training_set[i]["labels"]):
      print('{0:10}  {1}'.format(token, label))

[CLS]       -100
The         0
techniques  0
employed    0
are         0
electron    11
para        12
##ma        -100
##gnetic    -100
resonance   12
(           0
EP          11
##R         -100
)           0
,           0
electron    11
-           -100
nuclear     -100
double      12
resonance   12
(           0
E           11
##ND        -100
##OR        -100
)           0
,           0
which       0
combines    0
EP          11
##R         -100
with        11
nuclear     12
magnetic    12
resonance   12
(           0
N           11
##MR        -100
)           0
,           0
and         0
optical     11
##ly        -100
detected    12
magnetic    12
resonance   12
(           0
O           11
##DM        -100
##R         -100
)           0
,           0
in          0
which       0
detection   0
of          0
the         0
EP          0
##R         -100
occurs      0
via         0
induced     0
changes     0
of          0
a           0
photo       0
##lum       -100
##ines      

In [13]:
train_dataset.sentence[1]

'The drift mobility of electron holes in single crystal and polycrystalline MnO , CoO and NiO was determined in the temperature range of about 1000 to 1300C by combined electrical conductivity and either thermal emf or thermogravimetric measurements .'

In [14]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [15]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,   # specify how the data loader obtains batches of dataset keys
                'shuffle': True,                  # whether the sampler is random
                'num_workers': 0                  # turn on multi-process data loading with the specified number of loader worker processes.
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# DataLoader represents a Python iterable over a dataset, and the most important argument of DataLoader constructor is dataset
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [16]:
# import the pretrained model of Hugging face
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(labels_to_ids))
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [17]:
# inputs = training_set[2]
# input_ids = inputs["input_ids"].unsqueeze(0)
# attention_mask = inputs["attention_mask"].unsqueeze(0)
# labels = inputs["labels"].unsqueeze(0)

# input_ids = input_ids.to(device)
# attention_mask = attention_mask.to(device)
# labels = labels.to(device)

# outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
# initial_loss = outputs[0]
# initial_loss

In [18]:
# optimizer could operate the parameters in the model （transfer address）
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [19]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    ne_count, ne_recognization, ne_irrelevant = {}, {}, {}
    ne_recognization_total, ne_irrelevant_total, ne_count_total = 0, 0, 0
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)
        output = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = output.loss
        tr_logits = output.logits
        tr_loss += loss.item()
        
        # print(loss.item())
        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels that are not padding labels
        active_accuracy = labels.view(-1) != -100# shape (batch_size, seq_len)
        # active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
    
        # extract the active labels according to active_accuracy
        labels_masked = torch.masked_select(flattened_targets, active_accuracy)
        predictions_masked = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels_masked)
        tr_preds.extend(predictions_masked)
        
        # compute the accuracy of the prediction
        tmp_tr_accuracy = accuracy_score(labels_masked.cpu().numpy(), predictions_masked.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
        
        tem_ne_count, tem_ne_recognization, tem_ne_irrelevant = entity_count(labels_masked.cpu().numpy(), predictions_masked.cpu().numpy())
        for key in tem_ne_count.keys():
                ne_count[key] = ne_count.get(key, 0) + tem_ne_count[key]
                ne_recognization[key] = ne_recognization.get(key, 0) + tem_ne_recognization[key]
                ne_irrelevant[key] = ne_irrelevant.get(key, 0) + tem_ne_irrelevant[key]
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        parameters = list(model.parameters())[:]
        
        # backward pass
        optimizer.zero_grad()  # wipe the gradient
        loss.backward()        # compute the gradient
        optimizer.step()       # update the parameter
        # print(list(model.parameters()) == parameters)

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    for key in ne_count.keys():
        ne_recognization_total += ne_recognization[key]
        ne_irrelevant_total += ne_irrelevant[key]
        ne_count_total += ne_count[key]
       
    recall_ratio_total = ne_recognization_total/ne_count_total
    precision_ratio_total = ne_recognization_total/(ne_irrelevant_total+ne_recognization_total)
    if precision_ratio_total+recall_ratio_total == 0:
        f1_score_total = 0
    else:
        f1_score_total = 2*precision_ratio_total*recall_ratio_total/(precision_ratio_total+recall_ratio_total)
    
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    print(f"Name Entity Count: {ne_count_total}, Name Entity Recognization: {ne_recognization_total}, Name Entity Irrelevant: {ne_irrelevant_total}")
    print(f"Recall Ratio: {recall_ratio_total}  Precision Ratio: {precision_ratio_total}")
    print(f"F1 Score: {f1_score_total}")

In [20]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    ne_count, ne_recognization, ne_irrelevant = {}, {}, {}
    ne_recognization_total, ne_irrelevant_total, ne_count_total = 0, 0, 0
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            output = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = output[0]
            eval_logits = output[1]
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)          
            
            # Step 1: compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            
            # compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
            
            labels_masked = torch.masked_select(flattened_targets, active_accuracy)
            predictions_masked = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels_masked)
            eval_preds.extend(predictions_masked)
            
            # compute the accuracy of the prediction
            tmp_eval_accuracy = accuracy_score(labels_masked.cpu().numpy(), predictions_masked.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
            
            tem_ne_count, tem_ne_recognization, tem_ne_irrelevant = entity_count(labels_masked.cpu().numpy(), predictions_masked.cpu().numpy())
            for key in tem_ne_count.keys():
                ne_count[key] = ne_count.get(key, 0) + tem_ne_count[key]
                ne_recognization[key] = ne_recognization.get(key, 0) + tem_ne_recognization[key]
                ne_irrelevant[key] = ne_irrelevant.get(key, 0) + tem_ne_irrelevant[key]
            

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    
    
    for key in ne_count.keys():
        if ne_count[key] == 0:
            recall_ratio = 0
        else:
            recall_ratio = ne_recognization[key]/ne_count[key]
        if ne_irrelevant[key]+ne_recognization[key] == 0:
            precision_ratio=0
        else:
            precision_ratio = ne_recognization[key]/(ne_irrelevant[key]+ne_recognization[key])
        if precision_ratio+recall_ratio == 0:
            f1_score = 0
        else:
            f1_score = 2*precision_ratio*recall_ratio/(precision_ratio+recall_ratio)
        print(key+'********************')
        print(f"Name Entity Count: {ne_count[key]}, Name Entity Recognization: {ne_recognization[key]}, Name Entity Irrelevant: {ne_irrelevant[key]}")
        print(f"Recall Ratio: {recall_ratio}  Precision Ratio: {precision_ratio}")
        print(f"F1 Score: {f1_score}")
        ne_recognization_total += ne_recognization[key]
        ne_irrelevant_total += ne_irrelevant[key]
        ne_count_total += ne_count[key]
       
    recall_ratio_total = ne_recognization_total/ne_count_total
    precision_ratio_total = ne_recognization_total/(ne_irrelevant_total+ne_recognization_total)
    if precision_ratio_total+recall_ratio_total == 0:
        f1_score_total = 0
    else:
        f1_score_total = 2*precision_ratio_total*recall_ratio_total/(precision_ratio_total+recall_ratio_total)
    
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"Name Entity Count: {ne_count_total}, Name Entity Recognization: {ne_recognization_total}, Name Entity Irrelevant: {ne_irrelevant_total}")
    print(f"Recall Ratio: {recall_ratio_total}  Precision Ratio: {precision_ratio_total}")
    print(f"F1 Score: {f1_score_total}")

    return labels, predictions

In [21]:
def entity_count(labels, prediction):
    label_name_entity = get_name_entity(labels)
    prediction_name_entity = get_name_entity(prediction)
    ne_count, ne_recognization, ne_irrelevant = {}, {}, {} 
    for key in label_name_entity.keys():
        ne_count[key] = len(label_name_entity[key])
        ne_recognization[key] = len(prediction_name_entity[key]&label_name_entity[key])
        ne_irrelevant[key] = len(prediction_name_entity[key]-label_name_entity[key])
    return ne_count, ne_recognization, ne_irrelevant

def get_name_entity(labels):
    name_entity_dict = {}
    for key in ids_to_labels.keys():
        if key%2==1:
            name_entity_dict[ids_to_labels[key]]=set()
    tem_label = None
    for num, label in enumerate(labels):
        if tem_label is not None:
            if num == len(labels)-1 and label == tem_label + 1:
                name_entity_dict[ids_to_labels[tem_label]].add((start, num))
                break
            if label != tem_label + 1:
                name_entity_dict[ids_to_labels[tem_label]].add((start, num - 1))
                tem_label = None 
        if label % 2 != 0:
            tem_label = label
            start = num
            if num == len(labels)-1:
                name_entity_dict[ids_to_labels[tem_label]].add((start, num))
                break
    return name_entity_dict

In [22]:
for epoch in range(EPOCHS):
    print("--------------------------------------------------------")
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
    labels, predictions = valid(model, testing_loader)

--------------------------------------------------------
Training epoch: 1
Training loss epoch: 0.8518817779692737
Training accuracy epoch: 0.7735510874426759
Name Entity Count: 7022, Name Entity Recognization: 933, Name Entity Irrelevant: 3095
Recall Ratio: 0.1328681287382512  Precision Ratio: 0.23162859980139028
F1 Score: 0.16886877828054297
B-Auxiliary********************
Name Entity Count: 139, Name Entity Recognization: 0, Name Entity Irrelevant: 0
Recall Ratio: 0.0  Precision Ratio: 0
F1 Score: 0
B-application********************
Name Entity Count: 23, Name Entity Recognization: 0, Name Entity Irrelevant: 0
Recall Ratio: 0.0  Precision Ratio: 0
F1 Score: 0
B-component********************
Name Entity Count: 160, Name Entity Recognization: 0, Name Entity Irrelevant: 1
Recall Ratio: 0.0  Precision Ratio: 0.0
F1 Score: 0
B-material********************
Name Entity Count: 206, Name Entity Recognization: 71, Name Entity Irrelevant: 147
Recall Ratio: 0.3446601941747573  Precision Ratio: 

In [23]:
def get_name_entity(labels):
    name_entity_dict = {}
    for key in ids_to_labels.keys():
        if key%2==1:
            name_entity_dict[ids_to_labels[key]]=set()
    tem_label = None
    for num, label in enumerate(labels):
        if tem_label is not None:
            if num == len(labels)-1 and label == tem_label + 1:
                name_entity_dict[ids_to_labels[tem_label]].add((start, num+1))
                break
            if label != tem_label + 1:
                name_entity_dict[ids_to_labels[tem_label]].add((start, num))
                tem_label = None 
        if label % 2 != 0:
            tem_label = label
            start = num
            if num == len(labels)-1:
                name_entity_dict[ids_to_labels[tem_label]].add((start, num+1))
                break
    return name_entity_dict

In [24]:
# 获得未标注的文章
artical_df = pd.read_excel(r'C:\Users\us_Ma\Desktop\Share\total_without_tag.xls')
artical_df = artical_df.dropna(subset=['abstract'])
for index, row in artical_df.iterrows():
    artical_id = row[0]
    artical_content = row[1]
    if "No abstract available" in artical_content or "For abstract, see" in artical_content:
        artical_df.drop(index, inplace=True)
artical_df = artical_df.reset_index(drop=True)

In [25]:
name_entity_df = pd.DataFrame(columns=['name_entity','type','artical_id'])
for index in range(len(total_set)):
    labels = total_set[index]['labels']
    ids = total_set[index]['input_ids']
    mappings = total_set[index]['offset_mapping']
    sentence = data.sentence[index]
    artical_id = data.artical_id[index]    
    active_label = []
    for label, mapping in zip(labels.cpu().numpy(), mappings.squeeze().tolist()):
        if mapping[0] == 0 and mapping[1] != 0:
            active_label.append(label)
        else:
            continue
    name_entity_dict = get_name_entity(active_label)
    for key in name_entity_dict.keys():
        for name_entity in name_entity_dict[key]:
            name_entity_single = pd.DataFrame({
                'name_entity' : ' '.join(sentence.split()[name_entity[0]:name_entity[1]]),
                'type' : key,
                'artical_id' : artical_id
            }, index=[0])
            name_entity_df = name_entity_df.append(name_entity_single, ignore_index=True)

In [26]:
def chemical_convert(matched):
    string = matched.group()
    string = string.replace(' sub ', '_')
    string = string.replace(' sup ', '^')
    string = string.replace(' ','.')
    return string

def clear_sub(matched):
    return ' sub '
def clear_blacket(matched):
    string = matched.group()
    string = string.replace('(',' ')
    string = string.replace(')',' ')
    string = string.replace('  ',' ')
    return string
def txt_convent(txt_total):
    txt_total = re.sub(r'\\sub ', clear_sub, txt_total)
    txt_total = re.sub(r'\/sub ', clear_sub, txt_total)
    txt_total = re.sub('\S+\(su[bp] \S+\)', clear_blacket, txt_total)
    txt_total = re.sub('\S+ \(su[bp] \S+\)', clear_blacket, txt_total)
    txt_total = re.sub(r'\S+ sub \S+ \S+ sub \S+ \S+ sub \S+ \S+ sub \S+ \S+ sub \S+', chemical_convert, txt_total)
    txt_total = re.sub(r'\S+ sub \S+ \S+ sub \S+ \S+ sub \S+ \S+ sub \S+', chemical_convert, txt_total)
    txt_total = re.sub(r'\S+ sub \S+ \S+ sub \S+ \S+ sub \S+', chemical_convert, txt_total)
    txt_total = re.sub(r'\S+ sub \S+ \S+ sub \S+', chemical_convert, txt_total)
    txt_total = re.sub(r'\S+ sup \S+', chemical_convert, txt_total)
    txt_total = re.sub(r'\S+ sub \S+', chemical_convert, txt_total)
    txt_total = re.sub(r'\S+ sub \S+', chemical_convert, txt_total)
    return txt_total


In [27]:
for index, row in artical_df.iterrows():
    artical_id = row[0]
    artical_content = txt_convent(row[1])
    if artical_id not in data.artical_id:
        for sentence in nltk.sent_tokenize(artical_content):
            sentence_tokenize = nltk.word_tokenize(sentence)
            inputs = tokenizer(sentence_tokenize,
                        is_split_into_words=True,
                        return_offsets_mapping=True, 
                        padding='max_length', 
                        truncation=True, 
                        max_length=MAX_LEN,
                        return_tensors="pt")
            # move to gpu
            ids = inputs["input_ids"].to(device)
            mask = inputs["attention_mask"].to(device)
            # forward pass
            outputs = model(ids, attention_mask=mask)
            logits = outputs[0]

            # model.num_labels: the numbers of labels 
            active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)

            # torch.argmax returns the indices of the maximum value of all elements
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
            prediction = []
            for pred, mapping in zip(flattened_predictions.cpu().numpy(), inputs["offset_mapping"].squeeze().tolist()):
                if mapping[0] == 0 and mapping[1] != 0:
                    prediction.append(pred)
                else:
                    continue
            name_entity_dict = get_name_entity(prediction)
            for key in name_entity_dict.keys():
                for name_entity in name_entity_dict[key]:
                    name_entity_single = pd.DataFrame({
                        'name_entity' : ' '.join(sentence_tokenize[name_entity[0]:name_entity[1]]),
                        'type' : key,
                        'artical_id' : artical_id
                    }, index=[0])
                    name_entity_df = name_entity_df.append(name_entity_single, ignore_index=True)

In [28]:
pd.set_option('display.max_rows', 20)
name_entity_df.name_entity.value_counts()

GaAs                         1071
silicon                       733
Si                            431
defects                       275
InP                           241
                             ... 
floating gate devices           1
MIOS                            1
retention time                  1
nonvolatile memory device       1
ATOMINDEX                       1
Name: name_entity, Length: 47812, dtype: int64

In [29]:
total_df = pd.read_excel(r'C:\Users\us_Ma\Desktop\Share\total.xls')
total_df = total_df[['paperid','publishdate']]
total_df = total_df.drop_duplicates()

In [30]:
name_entity_total_df = pd.merge(name_entity_df, total_df, left_on='artical_id', right_on='paperid',how='left')[['name_entity','type','artical_id','publishdate']]
name_entity_total_df.type=name_entity_total_df.type.map(lambda x:x[2:])
name_entity_total_df.to_csv(r'C:\Users\us_Ma\Desktop\Share\name_entity.csv')
name_entity_total_df

Unnamed: 0,name_entity,type,artical_id,publishdate
0,electron gas,material,ADA154997,1985.0
1,dielectric function,technology,ADA154997,1985.0
2,EMC,technology,ADA154997,1985.0
3,Ensemble Monte Carlo,technology,ADA154997,1985.0
4,scattering terms,technology,ADA154997,1985.0
...,...,...,...,...
99259,isotopes,theory,ZFK399,1980.0
99260,radioactive decay,theory,ZFK399,1980.0
99261,semiconductor detectors,component,ZFK399,1980.0
99262,detectors,component,ZFK399,1980.0
