In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm import tqdm
from numpy import nan
import math
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping


import transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class propress_data():
    """Prepare a token-level NER table for sequence-labelling training.

    The input frame is expected to contain the columns ``'Sentence #'``,
    ``'Word'`` and ``'Tag'``. ``'Sentence #'`` is forward-filled, so only the
    first token of each sentence needs to carry the sentence id.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw token table. It is never modified by this class.
    test_size_ratio : float, optional
        Fraction of sentences held out for testing. Defaults to 0.2.
    max_size : int, optional
        Minimum number of rows every label is brought up to by over-sampling.
        Defaults to 1356 (the value previously hard-coded).
    random_state : int or None, optional
        Seed used for over-sampling and for the train/test split so that
        repeated calls return the same result. Defaults to 42.
    """

    def __init__(self, data, test_size_ratio=None, max_size=1356, random_state=42):
        self.data = data
        self.test_size_ratio = 0.2 if test_size_ratio is None else test_size_ratio
        self.max_size = max_size
        self.random_state = random_state

    def preprocess_data(self):
        """Return a copy with columns ``Sentence``, ``Token`` and ``POS``.

        Works on a copy so the caller's frame is left untouched and no
        SettingWithCopyWarning is raised by modifying a slice in place.
        """
        data = self.data[['Sentence #', 'Word', 'Tag']].copy()
        data['Sentence #'] = data['Sentence #'].ffill()
        return data.rename(
            columns={'Sentence #': 'Sentence', 'Word': 'Token', 'Tag': 'POS'}
        )

    def over_sampling(self):
        """Duplicate rows of rare labels until each label has ``max_size`` rows.

        Returns
        -------
        pandas.DataFrame
            Original rows plus the sampled duplicates, ordered by sentence.

        Notes
        -----
        Sampling is done per token, so duplicated tokens end up appended after
        the original tokens of their sentence.
        NOTE(review): token-level duplication changes sentence contents;
        confirm this is intended for NER training.
        """
        data = self.preprocess_data()
        parts = [data]
        for _, group in data.groupby('POS'):
            shortfall = self.max_size - len(group)
            if shortfall > 0:
                parts.append(
                    group.sample(shortfall, replace=True, random_state=self.random_state)
                )
        train_sample = pd.concat(parts)
        # A stable sort keeps the original token order inside every sentence;
        # the default quicksort gives no such guarantee.
        return train_sample.sort_values(by='Sentence', kind='stable')

    # convert label (string) to int
    # e.g. "O" ---> 16
    def process_data(self):
        """Encode labels as integers and group tokens/labels by sentence.

        Returns
        -------
        sentences : numpy.ndarray of list
            One list of tokens per sentence.
        pos : numpy.ndarray of list
            One list of integer labels per sentence, aligned with ``sentences``.
        enc_pos : sklearn.preprocessing.LabelEncoder
            Fitted encoder, usable to map integers back to label strings.
        """
        data = self.over_sampling()
        enc_pos = preprocessing.LabelEncoder()
        data["POS"] = enc_pos.fit_transform(data["POS"])
        grouped = data.groupby("Sentence")
        sentences = grouped["Token"].apply(list).values
        pos = grouped["POS"].apply(list).values
        return sentences, pos, enc_pos

    def split_train_test(self):
        """Split sentences and labels into train and test parts.

        ``process_data`` is called exactly once so that sentences and labels
        come from the same over-sampled frame and stay aligned.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        sentences, pos, _ = self.process_data()
        X_train, X_test, y_train, y_test = train_test_split(
            sentences,
            pos,
            random_state=self.random_state,
            test_size=self.test_size_ratio,
        )
        return X_train, X_test, y_train, y_test
    

In [3]:
class embedding():
    """Turn word-level sentences and labels into fixed-length BERT inputs."""

    def __init__(self):
        # Every sequence is padded or truncated to this many word-piece tokens.
        self.MAX_LEN = 128

    @staticmethod
    def _drop_null_tokens(words, word_labels):
        """Remove null tokens together with their labels so both stay aligned."""
        kept = [(w, l) for w, l in zip(words, word_labels) if pd.notnull(w)]
        return [w for w, _ in kept], [l for _, l in kept]

    @staticmethod
    def _align_labels(word_ids, word_labels, pad_label=0):
        """Expand word-level labels to one label per word-piece token.

        ``word_ids`` holds, for each token position, the index of the source
        word, or ``None`` for special and padding tokens. Every piece of a
        word receives that word's label; ``None`` positions get ``pad_label``.
        """
        return [pad_label if wid is None else word_labels[wid] for wid in word_ids]

    def tokenize(self, data, labels):
        """Tokenize pre-split sentences with BERT and align their labels.

        Parameters
        ----------
        data : sequence of list of str
            One list of words per sentence. Null entries are skipped; the
            input is not modified.
        labels : sequence of list of int
            One list of integer labels per sentence, parallel to ``data``.

        Returns
        -------
        tuple of tf.Tensor
            ``(input_ids, attention_masks, label_ids)``, each with one row of
            ``MAX_LEN`` entries per sentence.

        Notes
        -----
        Special and padding positions are labelled 0, as before.
        NOTE(review): 0 is also a real class id when labels come from a
        LabelEncoder; confirm the loss masks these positions.
        """
        # Using the BERT fast tokenizer; loaded once per call, not per sentence.
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        input_ids, attention_masks, label_ids = [], [], []

        for i in tqdm(range(len(data))):
            # Drop null tokens and their labels together; dropping only the
            # token would shift every following label by one position.
            words, word_labels = self._drop_null_tokens(data[i], labels[i])

            # is_split_into_words=True makes the tokenizer treat the list as
            # ONE pre-tokenized sentence instead of a batch of one-word
            # sentences (of which only the first used to be kept).
            tokenized = tokenizer(
                words,
                is_split_into_words=True,
                padding="max_length",
                truncation=True,
                max_length=self.MAX_LEN,
                return_tensors="tf",
            )

            input_ids.append(tokenized["input_ids"][0])
            attention_masks.append(tokenized["attention_mask"][0])

            # word_ids() already reflects truncation and padding, so the
            # aligned label row always has exactly MAX_LEN entries.
            label_ids.append(
                self._align_labels(tokenized.word_ids(batch_index=0), word_labels)
            )

        return (
            tf.convert_to_tensor(input_ids),
            tf.convert_to_tensor(attention_masks),
            tf.convert_to_tensor(label_ids),
        )
    
     
    
    

        

In [8]:
# import tensorflow as tf
# from transformers import BertTokenizer, TFBertModel
# from tensorflow_addons.text import crf_log_likelihood, crf_decode
# from tensorflow.keras.layers import LSTM, Dense, Bidirectional
# from tensorflow.keras.optimizers import Adam
# import numpy as np
# # 参数设置
# # MAX_SEQ_LENGTH = 128
# NUM_LABELS = 16  # number of label classes — TODO confirm: must equal len(enc_pos.classes_)
# BATCH_SIZE = 64
# EPOCHS = 2
# PATIENCE = 3  # Early Stopping 的耐心值
# # 加载BERT分词器和模型
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# bert_model = TFBertModel.from_pretrained(r'C:\Users\u1158286\OneDrive - IQVIA\Desktop\NER\model\bert-base-uncased',from_pt = True)


# # 使用 tf.data.Dataset 创建批次数据集
# train_dataset = tf.data.Dataset.from_tensor_slices(
#    (input_ids_train, attention_mask_train, label_ids_train)
# ).shuffle(buffer_size=100).batch(BATCH_SIZE)
# val_dataset = tf.data.Dataset.from_tensor_slices(
#    (input_ids_test, attention_mask_test, label_ids_test)
# ).batch(BATCH_SIZE)




# # 定义NER模型
# class NERModel(tf.keras.Model):
#    def __init__(self, bert_model, num_labels):
#        super(NERModel, self).__init__()
#        self.bert = bert_model
#        self.bilstm = Bidirectional(LSTM(128, return_sequences=True))
#        self.classifier = Dense(num_labels)
#        self.transition_params = tf.Variable(
#            tf.random.uniform(shape=(num_labels, num_labels))
#        )
#    def call(self, inputs, training=False):
#        input_ids, attention_mask = inputs
#        bert_output = self.bert(input_ids, attention_mask=attention_mask)[0]
#        lstm_output = self.bilstm(bert_output)
#        logits = self.classifier(lstm_output)
#        return logits
#    def compute_loss(self, logits, labels, seq_lengths):
#        log_likelihood, self.transition_params = crf_log_likelihood(
#            logits, labels, seq_lengths, self.transition_params
#        )
#        return -tf.reduce_mean(log_likelihood)
# # 创建模型实例
# ner_model = NERModel(bert_model, NUM_LABELS)
# optimizer = Adam(learning_rate=3e-5)
# # 定义 Early Stopping 和 Checkpoint
# best_val_loss = float("inf")
# patience_counter = 0  # Early Stopping 计数器
# checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=ner_model)
# checkpoint_manager = tf.train.CheckpointManager(checkpoint, "./checkpoints", max_to_keep=1)
# @tf.function
# def train_step(input_ids, attention_masks, label_ids):
#    with tf.GradientTape() as tape:
#        logits = ner_model((input_ids, attention_masks), training=True)
#        seq_lengths = tf.reduce_sum(attention_masks, axis=1)
#        loss = ner_model.compute_loss(logits, label_ids, seq_lengths)
#    gradients = tape.gradient(loss, ner_model.trainable_variables)
#    optimizer.apply_gradients(zip(gradients, ner_model.trainable_variables))
#    return loss
# @tf.function
# def val_step(input_ids, attention_masks, label_ids):
#    logits = ner_model((input_ids, attention_masks), training=False)
#    seq_lengths = tf.reduce_sum(attention_masks, axis=1)
#    loss = ner_model.compute_loss(logits, label_ids, seq_lengths)
#    return loss
# # 训练循环
# for epoch in range(EPOCHS):
#    print(f"Epoch {epoch + 1}/{EPOCHS}")
#    train_loss_avg = tf.keras.metrics.Mean()
#    val_loss_avg = tf.keras.metrics.Mean()
#    # 训练步骤
#    for batch_input_ids, batch_attention_masks, batch_label_ids in train_dataset:
#        loss = train_step(batch_input_ids, batch_attention_masks, batch_label_ids)
#        train_loss_avg.update_state(loss)
#    # 验证步骤
#    for val_input_ids_batch, val_attention_masks_batch, val_label_ids_batch in val_dataset:
#        val_loss = val_step(val_input_ids_batch, val_attention_masks_batch, val_label_ids_batch)
#        val_loss_avg.update_state(val_loss)
#    # 获取 epoch 损失
#    train_loss = train_loss_avg.result()
#    val_loss = val_loss_avg.result()
#    print(f"Train Loss: {train_loss.numpy()}, Validation Loss: {val_loss.numpy()}")
#    # Early Stopping 检查
#    if val_loss < best_val_loss:
#        best_val_loss = val_loss
#        patience_counter = 0
#        checkpoint_manager.save()
#        print(f"New best model saved with validation loss: {best_val_loss.numpy()}")
#    else:
#        patience_counter += 1
#        print(f"Patience Counter: {patience_counter}")
#    if patience_counter >= PATIENCE:
#        print("Early stopping triggered.")
#        break
# print("训练完成！")