In [2]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from typing import Tuple, List
from random import shuffle,randint
from tqdm import tqdm

In [3]:
# Sentinel tags that bracket every tag sequence inside the CRF.
START_TAG = "<START>"
STOP_TAG = "<STOP>"

# Model sizes and the device every tensor / sub-module is placed on.
EMBEDDING_DIM = 5
HIDDEN_DIM = 4
DEVICE = torch.device("cpu")

# Chinese tag inventory: "O", then the B/M/E/S variant of every entity type,
# then the two sentinels. A tag's index is simply its position in this list.
cn_tags = (
    ["O"]
    + [prefix + "-" + entity
       for entity in ("NAME", "CONT", "EDU", "TITLE", "ORG", "RACE", "PRO", "LOC")
       for prefix in ("B", "M", "E", "S")]
    + [START_TAG, STOP_TAG]
)
cn_tag2idx = {tag: idx for idx, tag in enumerate(cn_tags)}

# English tag inventory: "O", then the B/I variant of every entity type,
# then the two sentinels.
en_tags = (
    ["O"]
    + [prefix + "-" + entity
       for entity in ("PER", "ORG", "LOC", "MISC")
       for prefix in ("B", "I")]
    + [START_TAG, STOP_TAG]
)
en_tag2idx = {tag: idx for idx, tag in enumerate(en_tags)}


# NOTE(review): absolute, machine-specific locations of the train/validation
# corpora; they must be adapted before running on another machine.
cn_train = "/Users/jiaruiye/Desktop/FDU/专业课程/必修课程/人工智能/Projects/PJ2/NER/Chinese/train.txt"
cn_test = "/Users/jiaruiye/Desktop/FDU/专业课程/必修课程/人工智能/Projects/PJ2/NER/Chinese/validation.txt"
en_train = "/Users/jiaruiye/Desktop/FDU/专业课程/必修课程/人工智能/Projects/PJ2/NER/English/train.txt"
en_test = "/Users/jiaruiye/Desktop/FDU/专业课程/必修课程/人工智能/Projects/PJ2/NER/English/validation.txt"

def load_file(filename:str)->List[Tuple[List[str],List[str]]]:
    """Read a two-column NER corpus into a list of sentences.

    Each non-blank line must hold exactly two whitespace-separated fields,
    ``word tag``. Blank lines separate sentences. Runs of blank lines (and
    leading/trailing blank lines) never produce empty sentences.

    Args:
        filename: Path of a UTF-8 encoded corpus file.

    Returns:
        One ``(words, tags)`` pair per sentence, in file order; ``words`` and
        ``tags`` are parallel lists of equal length.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    sentence_list: List[Tuple[List[str], List[str]]] = []
    word_list: List[str] = []
    tag_list: List[str] = []
    with open(filename, mode='r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Sentence boundary. Only flush when something was collected,
                # so consecutive blank lines do not yield ([], []) entries.
                if word_list:
                    sentence_list.append((word_list, tag_list))
                    word_list, tag_list = [], []
                continue
            fields = line.split()
            if len(fields) != 2:
                raise ValueError(
                    "{}:{}: expected 'word tag', got {!r}".format(
                        filename, line_no, line))
            word, tag = fields
            word_list.append(word)
            tag_list.append(tag)
    # The file may end without a trailing blank line.
    if word_list:
        sentence_list.append((word_list, tag_list))
    return sentence_list

def create_word2idx(sentence_list:List[Tuple[List[str],List[str]]])->dict:
    """Build a vocabulary that maps every distinct word to a dense integer id.

    Ids are handed out in order of first appearance, starting at 0. No entry
    for unknown words is added.

    Args:
        sentence_list: ``(words, tags)`` pairs; only the words are used.

    Returns:
        Dict from word to its index.
    """
    vocabulary = {}
    for tokens, _tags in sentence_list:
        for token in tokens:
            # len() is evaluated before a possible insertion, so a new token
            # receives the next free id and a known token keeps its old one.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary
def log_sum_exp(vec):
    """Numerically stable ``log(sum(exp(vec)))`` over all entries of ``vec``.

    The global maximum is subtracted before exponentiating and added back
    afterwards, so large scores do not overflow.

    Args:
        vec: 2-D tensor; the callers in this file pass shape
            ``(1, tagset_size)``.

    Returns:
        A 0-dim tensor holding the log-sum-exp of every element.
    """
    peak = vec.max()
    # Spread the scalar maximum across the second dimension of `vec`.
    peak_row = peak.reshape(1, 1).expand(1, vec.shape[1])
    shifted = vec - peak_row
    return peak + shifted.exp().sum().log()


In [4]:
class BiLSTM_CRF(nn.Module):
    """Bidirectional LSTM tagger with a CRF layer on top.

    The LSTM turns a sentence (1-D tensor of word indices) into per-word
    emission scores over the tag set. A learned transition matrix scores
    adjacent tag pairs. ``neg_log_likelihood`` is the training loss and
    calling the module runs Viterbi decoding.

    The class relies on the module-level names ``DEVICE``, ``START_TAG``,
    ``STOP_TAG`` and ``log_sum_exp``.
    """

    def __init__(self, vocab_size, tag2idx, embedding_dim, hidden_dim) -> None:
        """Create the embedding, BiLSTM, projection and transition matrix.

        Args:
            vocab_size: Number of rows of the embedding table.
            tag2idx: Mapping from tag string to index; must contain
                ``START_TAG`` and ``STOP_TAG``.
            embedding_dim: Size of each word embedding.
            hidden_dim: Total BiLSTM output size (both directions together);
                must be even because each direction gets ``hidden_dim // 2``.

        Raises:
            ValueError: If ``hidden_dim`` is odd.
        """
        super(BiLSTM_CRF, self).__init__()
        if hidden_dim % 2 != 0:
            # Each direction gets hidden_dim // 2 units; an odd value would only
            # fail later, inside _get_lstm_features, with an obscure view error.
            raise ValueError(
                "hidden_dim must be even, got {}".format(hidden_dim))
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag2idx = tag2idx
        self.tagset_size = len(tag2idx)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim).to(DEVICE)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True).to(DEVICE)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size).to(DEVICE)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        # The tensor is created on DEVICE *before* wrapping it: calling
        # .to(DEVICE) on an nn.Parameter returns a plain non-leaf tensor when
        # the device differs, which would silently drop it from parameters().
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size, device=DEVICE))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag2idx[START_TAG], :] = -10000
        self.transitions.data[:, tag2idx[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh random ``(h0, c0)`` pair for a batch of one.

        NOTE(review): the initial state is random, so two forward passes over
        the same sentence can give slightly different emission scores, also in
        eval mode. Kept as is to stay consistent with how existing checkpoints
        were trained.
        """
        return (torch.randn(2, 1, self.hidden_dim // 2, device=DEVICE),
                torch.randn(2, 1, self.hidden_dim // 2, device=DEVICE))

    def _get_lstm_features(self, sentence):
        """Compute emission scores of shape ``(len(sentence), tagset_size)``.

        Args:
            sentence: 1-D long tensor of word indices.
        """
        self.hidden = self.init_hidden()
        # The LSTM expects (seq_len, batch, features); the batch size is 1.
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Score one given tag sequence (transitions plus emissions).

        Args:
            feats: Emission scores, one row per word.
            tags: 1-D long tensor with one tag index per word.

        Returns:
            Tensor of shape ``(1,)`` with the path score, including the
            transition from the last tag to ``STOP_TAG``.
        """
        score = torch.zeros(1, device=DEVICE)
        start = torch.tensor([self.tag2idx[START_TAG]],
                             dtype=torch.long, device=DEVICE)
        # Prepend START so tags[i] is the predecessor of the tag of word i.
        tags = torch.cat([start, tags.to(DEVICE)])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag2idx[STOP_TAG], tags[-1]]
        return score

    def neg_log_likelihood(self, sentence, tags):
        """Training loss: log partition function minus the gold path score.

        Args:
            sentence: 1-D long tensor of word indices.
            tags: 1-D long tensor of gold tag indices, same length.
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """Decode the best tag sequence for ``sentence``.

        Returns:
            ``(score, tag_seq)``: the score of the best path and the list of
            predicted tag indices, one per word.
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self.viterbi(lstm_feats)
        return score, tag_seq

    def viterbi(self, feats):
        """Viterbi decoding over emission scores.

        Args:
            feats: Tensor of shape ``(len(sentence), tagset_size)``.

        Returns:
            ``(path_score, path)`` where ``path`` is a list of tag indices.
        """
        # For every word: for every tag, the previous tag on its best path.
        last_tags = []
        # Best score of a path ending in each tag at the current position.
        # The fill value is a float on purpose: an integer fill makes the
        # dtype returned by torch.full depend on the PyTorch version.
        working_var = torch.full((1, self.tagset_size), -10000., device=DEVICE)
        working_var[0][self.tag2idx[START_TAG]] = 0.
        # Each feat has shape (tagset_size,).
        for feat in feats:
            # Best predecessor of every tag at this position.
            temp_tag = []
            # Best score reaching every tag at this position (before emission).
            temp_score = []
            for tag in range(self.tagset_size):
                next_score = working_var + self.transitions[tag]
                _, idx = torch.max(next_score, 1)
                best_tag = idx.item()
                temp_tag.append(best_tag)
                temp_score.append(next_score[0][best_tag].view(1))
            # Emissions do not depend on the predecessor, so add them after
            # the max.
            working_var = torch.cat(temp_score) + feat
            working_var = working_var.view(1, -1)
            last_tags.append(temp_tag)

        # Close the sequence: add the transition into STOP_TAG.
        working_var = working_var + self.transitions[self.tag2idx[STOP_TAG]]
        path_last_tag = torch.argmax(working_var).item()
        path_score = working_var[0][path_last_tag]

        # Follow the back-pointers from the last word to the first.
        path = [path_last_tag]
        for i in range(len(last_tags) - 1, -1, -1):
            path.append(last_tags[i][path[-1]])

        # The final back-pointer must be START_TAG. Pop it outside the assert
        # so the path stays correct when assertions are disabled (python -O).
        start = path.pop()
        assert start == self.tag2idx[START_TAG]
        path.reverse()
        return path_score, path

    def _forward_alg(self, feats):
        """Forward algorithm: log of the summed exp-scores of all tag paths.

        Args:
            feats: Emission scores, one row per word.

        Returns:
            0-dim tensor with the log partition function.
        """
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000., device=DEVICE)
        # START_TAG has all of the score.
        init_alphas[0][self.tag2idx[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag2idx[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha
    

# Main

In [36]:
# Language switch: "en" selects the English corpus/tag set, anything else the
# Chinese one. Corpus, tag mapping and tag list all follow this single value,
# so they cannot get out of sync.
lang="en"
train_set=load_file(en_train if lang=="en" else cn_train)
print(train_set[0][0])
word2idx=create_word2idx(train_set)
tag2idx=en_tag2idx if lang=="en" else cn_tag2idx
idx2tag=en_tags if lang=="en" else cn_tags
model=BiLSTM_CRF(len(word2idx),tag2idx,EMBEDDING_DIM,HIDDEN_DIM).to(DEVICE)
optimizer=optim.Adam(model.parameters(),lr=0.005)
# Sanity check: decode the first training sentence with the untrained model.
with torch.no_grad():
    precheck_sent = torch.tensor([word2idx[word] for word in train_set[0][0]],dtype=torch.long).to(DEVICE)
    # Gold tags of the same sentence; not used by the decode below.
    precheck_tags = torch.tensor([tag2idx[tag] for tag in train_set[0][1]], dtype=torch.long).to(DEVICE)
    print(model(precheck_sent))

['Rangarajan', 'said', 'a', 'current', 'account', 'deficit', 'of', 'two', 'percent', 'brought', 'about', 'by', 'a', '16-17', 'percent', 'annual', 'growth', 'in', 'exports', 'and', 'a', '14-15', 'percent', 'rise', 'in', 'imports', 'along', 'with', 'an', 'increase', 'in', 'non-debt', 'flows', 'could', 'lead', 'to', 'a', 'reduction', 'in', 'the', 'debt-service', 'ratio', 'to', 'below', '20', 'percent', 'over', 'the', 'next', 'five', 'years', '.']
(tensor(80.0615), [1, 1, 6, 0, 1, 6, 0, 1, 1, 1, 6, 0, 1, 1, 6, 0, 1, 6, 0, 1, 1, 6, 0, 1, 6, 0, 1, 1, 1, 6, 0, 1, 6, 0, 1, 6, 0, 1, 6, 8, 7, 0, 1, 6, 0, 1, 6, 0, 1, 1, 6, 0])


# Train

In [37]:
# Restore previously trained weights for the language selected by `lang`
# ("en" -> en_BiLSTM_CRF.pt). map_location keeps loading working when the
# checkpoint was written from a different device than DEVICE.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load('{}_BiLSTM_CRF.pt'.format(lang), map_location=DEVICE))

<All keys matched successfully>

In [31]:
# One pass of per-sentence optimisation over the shuffled training set;
# reports the mean negative log-likelihood per sentence at the end.
model.train()
for epoch in range(1):
    print("Epoch:{}".format(epoch))
    shuffle(train_set)
    tot_loss = 0
    for words, tags in tqdm(train_set):
        # Encode the sentence and its gold tags as index tensors.
        words_int = torch.tensor(
            list(map(word2idx.__getitem__, words)), dtype=torch.long
        ).to(DEVICE)
        tags_int = torch.tensor(
            list(map(tag2idx.__getitem__, tags)), dtype=torch.long
        ).to(DEVICE)

        # Clear gradients left over from the previous sentence, then take one
        # optimiser step on this sentence's loss.
        model.zero_grad()
        loss = model.neg_log_likelihood(words_int, tags_int)
        tot_loss += loss.item()
        loss.backward()
        optimizer.step()
    print("train loss = {}".format(tot_loss / len(train_set)))
        

Epoch:0


 99%|█████████▊| 3765/3820 [12:08<00:10,  5.17it/s]


KeyboardInterrupt: 

In [19]:
# Persist the weights under the language currently selected by `lang`, so an
# English-trained model can never overwrite the Chinese checkpoint (or vice
# versa) and the file name matches the one the load cell reads.
torch.save(model.state_dict(), '{}_BiLSTM_CRF.pt'.format(lang))

# Test

In [7]:
# Spot check on the validation split of the language selected by `lang`.
# Using the split that matches `tag2idx` avoids a KeyError when the gold tags
# of the last sentence are converted to indices below.
test_set=load_file(en_test if lang=="en" else cn_test)
# Print "<length>\t<words...>" for every validation sentence.
for words,tags in test_set:
    print(len(words),end="\t")
    for word in words:
        print(word,end=" ")
    print()
model.eval()
words,tags=test_set[-1]
print(words,[tag2idx[tag] for tag in tags])
# The vocabulary has no unknown-word entry, so an out-of-vocabulary word is
# replaced by a randomly chosen known word index (this makes the prediction
# non-deterministic for such sentences).
word_int=[]
for word in words:
    if word in word2idx:
        word_int.append(word2idx[word])
    else:
        word_int.append(randint(0,len(word2idx)-1))
# Inference only: skip autograd bookkeeping and put the input on DEVICE.
with torch.no_grad():
    score,path=model(torch.tensor(word_int,dtype=torch.long).to(DEVICE))
print(score,path)

4	Bradford 1 Tranmere 0 
39	" It appears that August is showing an economy again reversing course and is not moving onto a significantly slower track at this point , " said economist Lynn Reaser of Barnett Banks Inc. in Jacksonville , Fla . 
4	Result in an international 
8	But I think it 's not that . 
45	More than a thousand women , children and men gathered in a field to the north of the town of Urus-Martan on Saturday to wait for a column of rebels to withdraw from the capital Grozny about 25 km ( 12 miles ) away . 
7	Current 818 million 979 million 882 million 
8	Boatmen 's deal could spark more mergers . 
25	Ready copper fell by 150 rupees at 12,350 rupees per quintal on fresh offerings by the stockists who expect Hindustan Copper to cut prices . 
7	Mexican avocados not expected in U.S . 
33	Equipment problems and mechanical failure forced a recovery expedition to give up efforts to retrieve a giant slab of the RMS Titanic from the ocean floor , a spokeswoman said on Friday . 
14	

In [38]:
# Tag every sentence of the test file, write "word tag" lines to the result
# file and report token-level accuracy against the gold tags.
test_set=load_file("/Users/jiaruiye/Desktop/FDU/专业课程/必修课程/人工智能/Projects/PJ2/english_test.txt")
model.eval()
word_count=0
error_count=0
# The `with` block closes the result file even if an assertion fires midway;
# torch.no_grad() avoids building autograd graphs during inference.
with open("/Users/jiaruiye/Desktop/FDU/专业课程/必修课程/人工智能/Projects/PJ2/en_BiLSTM_CRF_my_result.txt","w",encoding="utf-8") as f, torch.no_grad():
    for sentence_idx in tqdm(range(len(test_set))):
        words,tags=test_set[sentence_idx]
        # The vocabulary has no unknown-word entry: an out-of-vocabulary word
        # is replaced by a randomly chosen known word index.
        word_int=[]
        for word in words:
            if word in word2idx:
                word_int.append(word2idx[word])
            else:
                word_int.append(randint(0,len(word2idx)-1))
        assert len(word_int)==len(words)
        score,path=model(torch.tensor(word_int,dtype=torch.long).to(DEVICE))
        assert len(tags)==len(path)
        assert len(tags)==len(words)
        for i in range(len(words)):
            f.write(words[i]+" "+idx2tag[path[i]]+"\n")
        # Blank line between sentences, but not after the last one.
        if not sentence_idx == len(test_set)-1:
            f.write("\n")
        word_count+=len(words)
        for i in range(len(tags)):
            if not tag2idx[tags[i]]==path[i]:
                error_count+=1
    # The Chinese result file additionally ends with a blank line.
    if lang=='cn':
        f.write('\n')
print("{}/{}".format(error_count,word_count))
if word_count:
    print("Accuracy={:.2%}".format(1-error_count/word_count))
else:
    # An empty test set would otherwise raise ZeroDivisionError.
    print("Accuracy=n/a (no tokens evaluated)")
    

100%|██████████| 3453/3453 [00:17<00:00, 198.15it/s]

4686/46435
Accuracy=89.91%



