In [3]:
import numpy as np
import pandas as pd
import re
from transformers import BertTokenizer, BertModel
import torch
from transformers import AdamW
from datasets import list_metrics,load_metric
from sklearn.metrics import confusion_matrix
import plotly.express as px
from pylab import *
import warnings
warnings.filterwarnings("ignore")

# Dataset reading
We split the dataset line by line into its corresponding features.

In [4]:
def read_data(path):
    """Parse a relation-annotated text file into parallel per-record lists.

    The file is consumed in groups of four lines:

    * line 0: an id, a tab, then the quoted sentence with the two entities
      wrapped in ``<e1>...</e1>`` and ``<e2>...</e2>``;
    * line 1: the label;
    * line 2: a comment line whose first 9 characters are dropped
      (presumably the ``Comment: `` prefix);
    * line 3: ignored (separator).

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(ID_list, sentence_list, e12_list, label_list, comment_list)``
        of equally long lists. IDs are 1-based record positions, sentences have
        the surrounding quotes and entity tags removed, and each ``e12_list``
        item is ``[e1, e2]``.

    Raises:
        ValueError: If a record is incomplete or its first line lacks the
            entity tags / tab separator.
    """
    tag_list = ["<e1>", "</e1>", "<e2>", "</e2>"]
    ID_list = []
    sentence_list = []
    e12_list = []
    label_list = []
    comment_list = []

    with open(path, "r") as f:
        # Strip only the line terminator so that a final line without a
        # trailing newline does not lose its last character.
        lines = [line.rstrip("\n") for line in f]

    for line_index in range(0, len(lines), 4):
        header = lines[line_index]

        if not header.strip():
            # A blank line where a record should start is only acceptable as
            # trailing padding at the end of the file.
            if any(rest.strip() for rest in lines[line_index:]):
                raise ValueError(
                    f"{path}: expected a record at line {line_index + 1}, found a blank line"
                )
            break

        if line_index + 2 >= len(lines):
            raise ValueError(
                f"{path}: incomplete record starting at line {line_index + 1}"
            )

        try:
            e1 = header.split("<e1>")[1].split("</e1>")[0]
            e2 = header.split("<e2>")[1].split("</e2>")[0]
            # Text after the tab, minus the surrounding quote characters.
            sentence = header.split("\t")[1][1:-1]
        except IndexError:
            raise ValueError(
                f"{path}: malformed record at line {line_index + 1}: {header!r}"
            ) from None

        for tag in tag_list:
            sentence = sentence.replace(tag, '')

        # Only record anything once the whole group parsed, so the five lists
        # can never get out of step with each other.
        ID_list.append(line_index // 4 + 1)
        e12_list.append([e1, e2])
        sentence_list.append(sentence)
        label_list.append(lines[line_index + 1])
        comment_list.append(lines[line_index + 2][9:])

    return ID_list, sentence_list, e12_list, label_list, comment_list

In [5]:
# Input files for the two splits.
train_path = "../input/text-classification/TRAIN_FILE.TXT"
test_path = "../input/text-classification/FULL_TEST.txt"

# Each call yields five parallel lists: ids, sentences, entity pairs,
# labels and comments.
(ID_list,
 sentence_list,
 e12_list,
 label_list,
 comment_list) = read_data(train_path)

(ID_list_test,
 sentence_list_test,
 e12_list_test,
 label_list_test,
 comment_list_test) = read_data(test_path)

The following are the features we extracted:

In [6]:
# Preview the first three record ids.
display(ID_list[:3])

In [7]:
# Preview the first three sentences (entity tags already removed by read_data).
display(sentence_list[:3])

In [8]:
# Preview the first three [e1, e2] entity pairs.
display(e12_list[:3])

In [9]:
# Preview the first three labels.
display(label_list[:3])

In [10]:
# Preview the first three comment strings.
display(comment_list[:3])

In [11]:
# Distinct training labels. They are sorted because the iteration order of a
# set of strings changes between interpreter sessions (hash randomisation);
# without sorting, the label <-> index mapping built from `classes` would not
# match a model checkpoint that is reloaded in a later session.
classes = sorted(set(label_list))
len(classes)

In [12]:
# Two lookup tables over the class list:
#   dic     : integer index -> label string
#   dic_rev : label string  -> integer index
label_listing = range(len(classes))
dic = dict(zip(label_listing, classes))
dic_rev = dict(zip(classes, label_listing))
display(dic)
display(dic_rev)

# Data type conversion

In [13]:
# One training sample per record: [(entity text, sentence), label index],
# where the entity text is the two entities joined by a single space.
train_set = [
    [(" ".join(entity_pair), sentence), dic_rev.get(label_name)]
    for entity_pair, sentence, label_name in zip(e12_list, sentence_list, label_list)
]
train_set[0]

In [14]:
# One test sample per record: [(entity text, sentence), label index].
# The label index comes from the mapping built on the training labels, so a
# test label that never occurred in training has no index. Fail here with a
# clear message instead of storing None and crashing later when the batch
# labels are converted to a tensor.
test_set = []
for entity_pair, sentence, label in zip(e12_list_test, sentence_list_test, label_list_test):
    if label not in dic_rev:
        raise ValueError(f"Test label {label!r} does not occur in the training labels")
    test_set.append([(" ".join(entity_pair), sentence), dic_rev[label]])
test_set[0]

In [15]:
# Report how many records were parsed for each split.
train_total = len(ID_list)
test_total = len(ID_list_test)
print("Total number of the training set is", train_total)
print("Total number of the testing set is", test_total)

# Named entity type
In this project, I work on the named entity typing task, which is to return a prediction for the labeled entities.

Example:

"**American Airlines**, a unit of AMR, immediately matched the move, spokesman **Tim Wagner** said" 

1. E1: ORGANIZATION

2. E2: PERSON

3. E1+E2: ORGANIZATION-PERSON

## loading the tokenizer

In [None]:
# Load the tokenizer that matches the 'bert-large-uncased' checkpoint and
# display its configuration as the cell output.
token = BertTokenizer.from_pretrained('bert-large-uncased')
token

Here I treat each sample as a QA-style text pair: the input is [(keywords, sentence), answer], where the keywords are the two entities joined by a space and the answer is the label index.

In [None]:
# collate function
def collate_fn(data):
    """Turn a list of ``[text_pair, label]`` samples into BERT input tensors.

    Args:
        data: List of samples; item 0 of each sample is the text (pair) handed
            to the tokenizer and item 1 is the integer label.

    Returns:
        ``(input_ids, attention_mask, token_type_ids, labels)``. The first
        three are the tokenizer's PyTorch tensors, truncated / padded to a
        fixed length of 150 tokens; ``labels`` is a ``torch.LongTensor``.
    """
    text_pairs = []
    targets = []
    for sample in data:
        text_pairs.append(sample[0])
        targets.append(sample[1])

    # Encode the whole batch at once with the notebook-level tokenizer.
    encoded = token.batch_encode_plus(
        batch_text_or_text_pairs=text_pairs,
        truncation=True,          # cut anything longer than max_length
        padding='max_length',     # pad every sample up to max_length
        max_length=150,
        return_tensors='pt',      # PyTorch tensors
        return_length=True,       # also report the encoded lengths
    )

    return (
        encoded['input_ids'],       # token ids
        encoded['attention_mask'],  # presumably 1 for real tokens, 0 for padding
        encoded['token_type_ids'],  # segment ids produced by the tokenizer
        torch.LongTensor(targets),
    )

# A small batch size is used; small-batch training tends to converge to flat minima.
batch_size = 16

# Training loader: shuffled, batches assembled by collate_fn, and the last
# incomplete batch dropped.
loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    drop_last=True,
)

# Pull a single batch so the tensors can be inspected.
input_ids, attention_mask, token_type_ids, labels = next(iter(loader))

print(len(loader))
input_ids.shape, attention_mask.shape , token_type_ids.shape, labels

In [None]:
# Load the pre-trained BERT encoder.
pretrained = BertModel.from_pretrained('bert-large-uncased')

# Sanity-check forward pass on the sample batch. Only the output shape is
# inspected, so gradients are disabled to avoid building an autograd graph
# for the whole encoder.
with torch.no_grad():
    out = pretrained(input_ids=input_ids,
                     attention_mask=attention_mask,
                     token_type_ids=token_type_ids)

# (batch size, sequence length, hidden size): 16 x 150 x 1024 with
# batch_size=16 and max_length=150.
out.last_hidden_state.shape

In [None]:
# Shape of the first tensor (input_ids) of a freshly drawn batch.
next(iter(loader))[0].shape

In [None]:
# Transfer learning for the downstream task: the pre-trained encoder extracts
# features from the input, and a single fully connected layer on top of those
# features produces the class scores.
class Model(torch.nn.Module):
    """Encoder plus a linear classification head on the first-token features.

    Args:
        encoder: Module called as ``encoder(input_ids=..., attention_mask=...,
            token_type_ids=...)`` whose result exposes ``last_hidden_state``
            and which has ``config.hidden_size``. Defaults to the
            notebook-level ``pretrained`` encoder.
        num_classes: Number of output classes (default 19).
    """

    def __init__(self, encoder=None, num_classes=19):
        super().__init__()
        self.device1 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Kept for backward compatibility only; nothing in this class reads it.
        self.device2 = torch.device("cuda")

        # Register the encoder as a sub-module and move it to the device once,
        # instead of on every forward call.
        self.model = pretrained if encoder is None else encoder
        self.model.to(self.device1)

        # Size the head from the encoder's hidden size rather than a literal.
        self.decoder = torch.nn.Linear(self.model.config.hidden_size, num_classes)
        self.decoder.to(self.device1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores, one row per input sequence."""
        # Follow the parameters' current device so the model keeps working
        # after a later ``model.to(...)``.
        device = self.decoder.weight.device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)

        # Use the encoder owned by this module (not the notebook global), so a
        # model restored from a checkpoint runs with its own weights.
        out = self.model(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids)

        # Only the features at position 0 ([CLS]) are used for classification.
        # No softmax is applied here; the raw scores are returned.
        return self.decoder(out.last_hidden_state[:, 0])

# Build the classifier and run it once on the sample batch to check the
# shape of its output.
model = Model()

sample_scores = model(
    input_ids=input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)
sample_scores.shape

In [None]:
# model.parameters

In [None]:
# Optimiser and loss used for training.
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# History for the plots below: logging step, loss and batch accuracy.
x = []
y_l = []
y_a = []

epochs = 50
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
count = 1
for epoch in range(epochs):
    for i, batch in enumerate(loader):
        input_ids, attention_mask, token_type_ids, labels = batch
        # The model moves its own inputs; only the labels are moved here.
        labels = labels.to(device)

        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
        out = out.to(device)

        # One optimisation step on the cross-entropy loss.
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(i)

        # Everything below runs on every 10th batch only.
        if i % 10 != 0:
            continue

        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        # Checkpoint the whole model object.
        torch.save(model,"./bert_model.pt")

        print(i, loss.item(), accuracy)

        x.append(count)
        y_l.append(loss.item())
        y_a.append(accuracy)
        count += 1

In [None]:
def ploting(name,x,y):
    """Draw ``y`` against ``x`` as a red line labelled ``name`` and show it.

    Args:
        name: Used both as the legend entry and as the y-axis label.
        x: Values for the horizontal axis (labelled 'iter').
        y: Values for the vertical axis.
    """
    # Sets the default figure size globally (it stays in effect afterwards).
    plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
    plt.plot(x, y, 'r-', label=name, alpha=0.8)
    plt.xlabel('iter')
    plt.ylabel(name)
    plt.legend(loc="upper right")
    plt.show()

In [None]:
# Plot titles for the two training curves ("Accuary" was a typo in the
# label shown on the figure).
name_a = "Accuracy"
name_l = "Loss"
ploting(name_a,x,y_a)

In [None]:
# Plot the loss recorded at each logging step of the training loop.
ploting(name_l,x,y_l)

# Evaluation

In [None]:
#print(load_metric("precision").inputs_description)

In [None]:
def performance(y_ture,y_pred):
    """Show per-class metrics and a confusion-matrix heatmap.

    For every class that occurs in either input, the labels are turned into a
    one-vs-rest 0/1 vector and F1, recall and precision are computed with
    ``average="macro"``. The resulting table is displayed, followed by an
    interactive heatmap of the confusion matrix.

    Args:
        y_ture: True class indices (list or array of ints).
        y_pred: Predicted class indices, same length as ``y_ture``.

    Returns:
        The confusion matrix from ``confusion_matrix`` (rows are true classes,
        columns are predicted classes, both in ascending class order).
    """
    # Work on arrays: comparing a plain Python list with an int yields a
    # single bool rather than an element-wise mask.
    true_arr = np.asarray(y_ture)
    pred_arr = np.asarray(y_pred)

    f1_metric = load_metric("f1")
    re_metric = load_metric("recall")
    pre_metric = load_metric("precision")

    # Sorted so the table rows and the heatmap ticks line up with the
    # confusion matrix.
    type_c_int = sorted(set(np.concatenate([true_arr, pred_arr]).tolist()))
    type_c = [str(i) for i in type_c_int]

    f1_m_list = []
    re_m_list = []
    pre_m_list = []

    for i in type_c_int:
        # One-vs-rest indicator vectors for class i.
        bi_ture = (true_arr == i).astype(int).tolist()
        bi_pred = (pred_arr == i).astype(int).tolist()
        f1_m_results = f1_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        re_m_results = re_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")
        pre_m_results = pre_metric.compute(predictions=bi_pred, references=bi_ture, average="macro")

        f1_m_list.append(f1_m_results["f1"])
        re_m_list.append(re_m_results["recall"])
        pre_m_list.append(pre_m_results["precision"])

    data = {'Class_type':type_c_int,'F1-macro':f1_m_list,'Recall-macro':re_m_list,'Precision-macro':pre_m_list}
    df = pd.DataFrame(data)
    display(df)

    # Pin the label order explicitly so it matches the tick labels below.
    z = confusion_matrix(true_arr, pred_arr, labels=type_c_int)

    # Rows of z (heatmap y axis) are true classes and columns (x axis) are
    # predicted classes.
    fig = px.imshow(z,
                    text_auto=True,
                    labels=dict(x="Predicted label", y="True label", color="times"),
                    x=type_c,
                    y=type_c)
    fig.show()

    return z
    
# Quick check of performance() on a small hand-made example.
cf_matrix_test = performance([1,3,1,4,2,1],[2,3,1,3,3,2])

In [None]:
def test(model,if_load, batch_size=32):
    """Run the model over the whole test set and collect its predictions.

    Args:
        model: Model to evaluate; ignored when ``if_load`` is truthy.
        if_load: When truthy, load the saved model from
            '/kaggle/input/model-pt/bert_model.pt' instead of using ``model``.
        batch_size: Evaluation batch size (default 32).

    Returns:
        ``(y_true, y_pred)``: two equally long lists of class indices, one
        entry per test sample.
    """
    if if_load:
        # NOTE(review): torch.load unpickles a complete model object, which can
        # execute arbitrary code; only load checkpoint files you trust.
        model = torch.load('/kaggle/input/model-pt/bert_model.pt')
    # Let any failure surface here: returning a sentinel would only break the
    # caller's two-value unpacking with a less helpful error.
    model.eval()

    # Every test sample must be scored exactly once, so the last incomplete
    # batch is kept and the order is left unshuffled.
    loader_test = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=batch_size,
                                              collate_fn=collate_fn,
                                              shuffle=False,
                                              drop_last=False)

    y_pred = []
    y_true = []
    with torch.no_grad():
        for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
            # Progress indicator.
            if i % 10 == 0:
                print(i)

            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

            # Predicted class = index of the highest score.
            y_pred.extend(out.argmax(dim=1).tolist())
            y_true.extend(labels.tolist())

    return y_true,y_pred

# Evaluate the in-memory model (if_load = 0 means no checkpoint is loaded).
if_load = 0
y_t,y_p = test(model,if_load)

In [None]:
# Per-class metrics and confusion matrix for the test-set predictions.
cf_matrix = performance(y_t,y_p)