In [1]:
import re
import string
from tqdm import tqdm
import json
import pandas as pd
from collections import defaultdict
from gensim.models import KeyedVectors
import gensim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torch
import torch.nn as nn
from collections import Counter
from multiprocessing import Pool
import math
import gc
import gzip
import wordninja

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def read_json(file_path):
    """Parse the UTF-8 encoded JSON document at ``file_path`` and return it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

def write_json(json_file, file_path):
    """Serialize the object ``json_file`` as JSON into ``file_path`` (UTF-8, overwriting)."""
    with open(file_path, mode='w', encoding='utf-8') as handle:
        json.dump(json_file, fp=handle)


def _scan_corpus(data_path, case=True):
    """Make one pass over a JSON-lines corpus and collect vocabulary statistics.

    Every non-blank line must be a JSON object with a 'text' sequence of tokens
    and a parallel 'label' sequence of tags.

    Returns:
        (word_freq, char_freq, tag2id): token counts, character counts and a
        tag -> id mapping numbered in order of first appearance.
    """
    word_freq = defaultdict(int)
    char_freq = defaultdict(int)
    tag2id = {}
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            sentence = json.loads(line)
            for word, tag in zip(sentence['text'], sentence['label']):
                if tag not in tag2id:
                    tag2id[tag] = len(tag2id)
                word_freq[word if case else word.lower()] += 1
                # Characters are lower-cased one by one (not via word.lower()).
                for char in word:
                    char_freq[char if case else char.lower()] += 1
    return word_freq, char_freq, tag2id


def _most_frequent(freq, min_freq, max_size):
    """Return the keys of ``freq`` with count >= ``min_freq``, most frequent first.

    The sort is stable, so ties keep first-seen order. A truthy ``max_size``
    keeps only that many leading entries.
    """
    entries = [(key, count) for key, count in freq.items() if count >= min_freq]
    entries.sort(key=lambda item: item[1], reverse=True)
    if max_size:
        entries = entries[:max_size]
    return [key for key, _ in entries]


def _with_special_tokens(keys):
    """Map '<PAD>' -> 0, '<UNK>' -> 1 and each key in ``keys`` to 2, 3, ... in order."""
    mapping = {'<PAD>': 0, '<UNK>': 1}
    mapping.update({key: idx + 2 for idx, key in enumerate(keys)})
    return mapping


def build_word2id_tag2id(data_path, 
                         case=True, 
                         pretrain_vector_path=None, 
                         max_word2id_size=None, 
                         max_char2id_size=None, 
                         min_word_freq=1, 
                         min_char_freq=1):
    '''
    Build the word, character and tag vocabularies and save them under ./conf.

    data_path: JSON-lines corpus; each line has a 'text' token list and a
        parallel 'label' tag list.
    case: if True (the default) words and characters keep their case; if False
        they are lower-cased before counting. When pretrain_vector_path is
        given this only affects the character vocabulary.
    pretrain_vector_path: path to pretrained vectors in word2vec text format.
        When given, word2id is built from the vector file's vocabulary and
        max_word2id_size / min_word_freq are not used.
    max_word2id_size: maximum number of corpus words kept (most frequent first).
    max_char2id_size: maximum number of characters kept (most frequent first).
    min_word_freq: minimum count for a corpus word to be kept.
    min_char_freq: minimum count for a character to be kept.

    Both word2id and char2id reserve id 0 for '<PAD>' and id 1 for '<UNK>'.
    Writes word2id/char2id/tag2id and their inverse maps as JSON files into
    ./conf (the directory must already exist) and returns
    (word2id, tag2id, char2id).
    '''
    word_freq, char_freq, tag2id = _scan_corpus(data_path, case)

    if not pretrain_vector_path:
        words = _most_frequent(word_freq, min_word_freq, max_word2id_size)
    else:
        model = KeyedVectors.load_word2vec_format(pretrain_vector_path, binary=False)
        # Detect the API by feature instead of parsing gensim.__version__: the
        # old comparison was False for exactly 4.0.0 and crashed on versions
        # such as "4.0.0rc1".
        if hasattr(model, 'key_to_index'):
            vocab = model.key_to_index
        else:
            vocab = model.wv.vocab
        # NOTE(review): iteration order is assumed to follow the vector row
        # order, which the "+2 rows" offset used for the embeddings relies on.
        words = list(vocab)

    word2id = _with_special_tokens(words)
    # Sort characters by frequency before truncating, exactly like the words;
    # previously max_char2id_size kept the first-seen characters instead.
    char2id = _with_special_tokens(_most_frequent(char_freq, min_char_freq, max_char2id_size))

    id2word = {v:k for k,v in word2id.items()}
    id2tag = {v:k for k,v in tag2id.items()}
    id2char = {v:k for k,v in char2id.items()}

    write_json(word2id, './conf/word2id.json')
    write_json(char2id, './conf/char2id.json')
    write_json(tag2id, './conf/tag2id.json')
    write_json(id2word, './conf/id2word.json')
    write_json(id2tag, './conf/id2tag.json')
    write_json(id2char, './conf/id2char.json')
    
    return word2id, tag2id, char2id

In [7]:
# Build the vocabularies from the training split. Because a pretrained vector
# file is supplied, word ids follow that file's vocabulary and min_word_freq
# has no effect on word2id; min_char_freq still filters the characters.
word2id, tag2id, char2id = build_word2id_tag2id(
    './data/process_data/train.json',
    pretrain_vector_path='./conf/filter_un_cbow_20230824',
    min_char_freq=10,
    min_word_freq=10,
)

In [23]:
class MyDataset(Dataset):
    """Token-level tagging dataset read from a JSON-lines file.

    Each non-blank line is a JSON object with a 'text' list of tokens and, when
    ``with_label`` is True, a parallel 'label' list of tags. Every sentence is
    padded with '<PAD>' tokens (labels with 'O', mask with 0) or truncated to
    exactly ``max_sentence_length`` entries when the dataset is built.

    Args:
        data_path: path of the JSON-lines file.
        word2id: token -> id mapping; must contain '<UNK>'.
        tag2id: tag -> id mapping; must contain 'O' when sentences get padded.
            Only used when ``with_label`` is True.
        char2id: character -> id mapping; must contain '<PAD>' and '<UNK>'.
        max_sentence_length: fixed number of tokens per item.
        max_word_length: fixed number of character ids per token.
        with_label: whether labels are read and returned.

    ``dataset[i]`` returns ``(word_ids, char_ids, label_ids, mask)`` when
    ``with_label`` is True, otherwise ``(word_ids, char_ids, mask)``.
    ``char_ids`` is a LongTensor of shape
    (max_sentence_length, max_word_length); ``mask`` is a bool tensor that is
    True on real tokens and False on padding.
    """

    def __init__(self, data_path, word2id, tag2id, char2id, max_sentence_length=64, max_word_length=20, with_label=True):
        self.with_label = with_label
        self.word2id = word2id
        self.char2id = char2id
        self.texts = []
        self.data_path = data_path
        self.max_word_length = max_word_length
        self.mask = []
        if self.with_label:
            self.tag2id = tag2id
            self.labels = []
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines such as a trailing newline.
                    continue
                sentence = json.loads(line)
                tokens = sentence['text']
                self.texts.append(self._fit(tokens, '<PAD>', max_sentence_length))
                self.mask.append(self._fit([1] * len(tokens), 0, max_sentence_length))
                if self.with_label:
                    self.labels.append(self._fit(sentence['label'], 'O', max_sentence_length))

    @staticmethod
    def _fit(items, filler, length):
        """Return ``items`` as a new list truncated or right-padded with ``filler`` to ``length``."""
        fitted = list(items[:length])
        fitted.extend([filler] * (length - len(fitted)))
        return fitted

    def _encode_chars(self, word, is_real_token):
        """Return exactly ``max_word_length`` character ids for one token.

        Padding positions yield only '<PAD>' ids. Previously the literal string
        '<PAD>' was spelled out as the characters '<', 'P', 'A', 'D', '>'.
        """
        pad_id = self.char2id['<PAD>']
        if not is_real_token:
            return [pad_id] * self.max_word_length
        unk_id = self.char2id['<UNK>']
        char_ids = [self.char2id.get(char, unk_id) for char in word[:self.max_word_length]]
        char_ids.extend([pad_id] * (self.max_word_length - len(char_ids)))
        return char_ids

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence = self.texts[idx]
        mask = self.mask[idx]
        unk_word_id = self.word2id['<UNK>']
        # Unknown tokens (and '<PAD>' if it is missing from word2id) map to '<UNK>'.
        sentence_to_id = [self.word2id.get(word, unk_word_id) for word in sentence]
        char_to_id = [self._encode_chars(word, m) for word, m in zip(sentence, mask)]
        words = torch.LongTensor(sentence_to_id)
        chars = torch.LongTensor(char_to_id)
        mask_tensor = torch.tensor(mask).bool()
        if self.with_label:
            label_to_id = [self.tag2id[tag] for tag in self.labels[idx]]
            return words, chars, torch.LongTensor(label_to_id), mask_tensor
        return words, chars, mask_tensor

In [160]:
import torch.nn as nn
import torch
from gensim.models import KeyedVectors
import numpy as np
from torchcrf import CRF

class BilstmCnnCrf(nn.Module):
    """Sequence tagger: word embeddings + character-CNN features -> BiLSTM -> linear -> CRF.

    The word embedding and the character-CNN output are concatenated on the
    last axis, passed through a bidirectional LSTM with ``hidden_dim // 2``
    units per direction, and projected to ``num_classes`` scores per token.
    Those scores are what the CRF layer receives for loss and decoding.
    """

    def __init__(self,
                 word2id,
                 char2id,
                 num_classes,
                 word_embedding_dim=300,
                 char_embedding_dim=20,
                 num_filters=30,
                 hidden_dim=200,
                 num_layers=2,
                 filter_size=3,
                 drop_out=0.5,
                 embedding_pretrained=None,
                 pretrain_vector_path=None):
        super().__init__()
        # Attribute names are kept as-is: they determine the state_dict keys.
        self.WordEmbedding = WordEmbedding(
            word2id=word2id,
            word_embedding_dim=word_embedding_dim,
            embedding_pretrained=embedding_pretrained,
            pretrain_vector_path=pretrain_vector_path,
        )
        self.CharEmbedding = CharEmbedding(
            char2id=char2id,
            char_embedding_dim=char_embedding_dim,
            num_filters=num_filters,
            filter_size=filter_size,
        )
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # The LSTM consumes the word vector concatenated with the char-CNN features.
        self.rnn_input_dim = word_embedding_dim + num_filters
        self.lstm = nn.LSTM(
            self.rnn_input_dim,
            hidden_dim // 2,
            num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=drop_out,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.crf = CRF(num_classes, batch_first=True)

    def forward(self, words_to_id, chars_to_id):
        """Return the per-token class scores produced by the linear layer."""
        word_features = self.WordEmbedding(words_to_id)
        char_features = self.CharEmbedding(chars_to_id)
        lstm_input = torch.cat([word_features, char_features], 2)
        self.lstm.flatten_parameters()
        lstm_out, _ = self.lstm(lstm_input)
        return self.fc(lstm_out)

    def compute_loss(self, words_to_id, chars_to_id, label, mask):
        """Return the negated CRF score of ``label`` with 'mean' reduction.

        NOTE(review): pytorch-crf is understood to return a log-likelihood, so
        this would be a negative log-likelihood; confirm against the installed torchcrf.
        """
        scores = self.forward(words_to_id, chars_to_id)
        return -self.crf(scores, label, mask, reduction='mean')

    def decode(self, words_to_id, chars_to_id, mask):
        """Return the CRF-decoded tag ids for the positions selected by ``mask``."""
        scores = self.forward(words_to_id, chars_to_id)
        return self.crf.decode(scores, mask)
    

class WordEmbedding(nn.Module):
    """Word-id -> vector lookup, optionally initialised from pretrained vectors.

    Args:
        word2id: token -> id mapping; only its size is used here.
        word_embedding_dim: size of each embedding vector.
        embedding_pretrained: if truthy, load vectors from
            ``pretrain_vector_path`` and freeze them; otherwise create a
            trainable embedding with ``padding_idx=0``.
        pretrain_vector_path: word2vec text-format file, read only when
            ``embedding_pretrained`` is truthy.

    Raises:
        ValueError: if the loaded vectors (plus the two special rows) do not
            have shape ``(len(word2id), word_embedding_dim)``.
    """

    def __init__(self, 
                word2id,
                word_embedding_dim=300, 
                embedding_pretrained=None, 
                pretrain_vector_path=None):
        super(WordEmbedding, self).__init__()
        if embedding_pretrained:
            self.word_embedding = nn.Embedding(len(word2id), word_embedding_dim)
            model = KeyedVectors.load_word2vec_format(pretrain_vector_path, binary=False)
            pretrained_embeddings = model.vectors
            # Two zero rows for ids 0 and 1, placed ahead of the pretrained
            # rows. Their width follows the loaded vectors instead of a
            # hard-coded 300, so other dimensions work too.
            special_rows = np.zeros((2, pretrained_embeddings.shape[1]),
                                    dtype=pretrained_embeddings.dtype)
            pretrained_embeddings = np.r_[special_rows, pretrained_embeddings]
            expected_shape = (len(word2id), word_embedding_dim)
            if tuple(pretrained_embeddings.shape) != expected_shape:
                raise ValueError(
                    'pretrained embedding matrix has shape {} but {} is required '
                    '(len(word2id), word_embedding_dim)'.format(
                        tuple(pretrained_embeddings.shape), expected_shape))
            self.word_embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
            # Pretrained vectors are kept frozen.
            self.word_embedding.weight.requires_grad = False
        else:
            self.word_embedding = nn.Embedding(len(word2id), word_embedding_dim, padding_idx=0)
    
    def forward(self, x):
        """Look up embeddings for the id tensor ``x``; an embedding axis is appended."""
        out = self.word_embedding(x) # [batch_size, max_length, embedding_dim]
        return out 

class CharEmbedding(nn.Module):
    """Character-level CNN that turns each token's character ids into a feature vector.

    Args:
        char2id: character -> id mapping; only its size is used here.
        char_embedding_dim: size of each character embedding.
        num_filters: number of convolution filters, i.e. output features per token.
        filter_size: number of consecutive characters each filter covers; the
            per-token character length must be at least this large.
    """

    def __init__(self, 
            char2id,
            char_embedding_dim=20,
            num_filters=30,
            filter_size=3):
        super(CharEmbedding, self).__init__()

        self.embedding_dim = char_embedding_dim
        self.char_embedding = nn.Embedding(len(char2id), char_embedding_dim)
        # Kernel (1, filter_size, char_embedding_dim): each token is convolved
        # on its own, over windows of filter_size characters spanning the full
        # embedding width.
        self.cnn = nn.Conv3d(in_channels=1, out_channels=num_filters, kernel_size=(1, filter_size, char_embedding_dim))
 
    def forward(self, inputs):
        """Map ids of shape (batch, max_len, max_len_char) to (batch, max_len, num_filters)."""
        max_len, max_len_char = inputs.size(1), inputs.size(2)
        inputs = inputs.reshape(-1, max_len * max_len_char)
        input_embed = self.char_embedding(inputs)
        input_embed = input_embed.view(-1, 1, max_len, max_len_char, self.embedding_dim)

        # (batch, num_filters, max_len, max_len_char - filter_size + 1, 1)
        conv_output = self.cnn(input_embed)
        # Max-pool over the character windows, then drop only the trailing
        # size-1 axis. A bare squeeze() also removed the batch axis for batch
        # size 1 (and any other size-1 axis), breaking the later concatenation.
        pool_output = torch.max(conv_output, -2)[0].squeeze(-1)

        out = pool_output.transpose(-2, -1).contiguous() # [batch_size, max_length, num_filters]
        return out

In [161]:
# Instantiate the tagger with randomly initialised word embeddings
# (embedding_pretrained is None, so no vector file is read here).
model = BilstmCnnCrf(
    word2id=word2id,
    char2id=char2id,
    num_classes=len(tag2id),
    word_embedding_dim=300,
    char_embedding_dim=20,
    num_filters=30,
    hidden_dim=200,
    num_layers=2,
    filter_size=3,
    drop_out=0.5,
    embedding_pretrained=None,
    pretrain_vector_path=None,
)

In [162]:
# Labelled training set; sentences are fixed to 64 tokens and 20 characters per token.
data_set = MyDataset(
    data_path='./data/process_data/train.json',
    word2id=word2id,
    tag2id=tag2id,
    char2id=char2id,
    max_sentence_length=64,
    max_word_length=20,
    with_label=True,
)

In [163]:
# Iterate the training set in batches of 128, in file order (no shuffling requested).
dataloader = DataLoader(dataset=data_set, batch_size=128)

In [147]:
import numpy as np
class WordEmbedding(nn.Module):
    """Scratch variant of the word-embedding layer that prints its output shape.

    NOTE(review): this redefines the ``WordEmbedding`` name used earlier in the
    notebook, with the keyword ``embedding_dim`` instead of ``word_embedding_dim``.

    Args:
        word2id: token -> id mapping; only its size is used here.
        embedding_dim: size of each embedding vector.
        embedding_pretrained: if truthy, load vectors from
            ``pretrain_vector_path`` and freeze them; otherwise create a
            trainable embedding with ``padding_idx=0``.
        pretrain_vector_path: word2vec text-format file, read only when
            ``embedding_pretrained`` is truthy.

    Raises:
        ValueError: if the loaded vectors (plus the two special rows) do not
            have shape ``(len(word2id), embedding_dim)``.
    """

    def __init__(self, 
                word2id,
                embedding_dim=300, 
                embedding_pretrained=None, 
                pretrain_vector_path=None):
        super(WordEmbedding, self).__init__()
        if embedding_pretrained:
            self.word_embedding = nn.Embedding(len(word2id), embedding_dim)
            model = KeyedVectors.load_word2vec_format(pretrain_vector_path, binary=False)
            pretrained_embeddings = model.vectors
            # Two zero rows for ids 0 and 1, placed ahead of the pretrained
            # rows. Their width follows the loaded vectors instead of a
            # hard-coded 300, so other dimensions work too.
            special_rows = np.zeros((2, pretrained_embeddings.shape[1]),
                                    dtype=pretrained_embeddings.dtype)
            pretrained_embeddings = np.r_[special_rows, pretrained_embeddings]
            expected_shape = (len(word2id), embedding_dim)
            if tuple(pretrained_embeddings.shape) != expected_shape:
                raise ValueError(
                    'pretrained embedding matrix has shape {} but {} is required '
                    '(len(word2id), embedding_dim)'.format(
                        tuple(pretrained_embeddings.shape), expected_shape))
            self.word_embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
            # Pretrained vectors are kept frozen.
            self.word_embedding.weight.requires_grad = False
        else:
            self.word_embedding = nn.Embedding(len(word2id), embedding_dim, padding_idx=0)
    
    def forward(self, x):
        """Look up embeddings for the id tensor ``x`` and print the result's shape."""
        out = self.word_embedding(x)
        # Debugging aid for this exploratory cell: shows the output shape on every call.
        print(out.shape)
        return out

In [148]:
# Build the embedding layer from the pretrained vector file (frozen weights).
worde = WordEmbedding(
    word2id=word2id,
    embedding_pretrained=True,
    pretrain_vector_path='./conf/filter_un_cbow_20230824',
)

In [149]:
# Smoke test: push only the first batch through the embedding layer, whose
# forward() prints the resulting shape.
for batch in dataloader:
    words_to_id, chars_to_id, label, mask = batch
    worde(words_to_id)
    break

torch.Size([128, 64, 300])
