### BERT-BiLSTM-CRF

In [8]:
import torch
import pandas as pd
import numpy as np
import psutil
from pandarallel import pandarallel
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import trange, tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torchcrf import CRF
from torch.nn import LSTM

# Initialise pandarallel with one worker per physical CPU core.
# psutil.cpu_count(logical=False) may return None when the core count cannot
# be determined, so fall back to a single worker in that case.
pandarallel.initialize(nb_workers=psutil.cpu_count(logical=False) or 1)

# Load the labelled Weibo training data.
# Raw string: '\W' and '\w' are not valid escape sequences, so the plain
# literal only produced the intended path by accident and triggers a warning
# on recent Python versions. The resulting path is unchanged.
df_label = pd.read_csv(r'data\WeiboTrainData\weibo_label.csv')

# Text-cleaning helper
def clean_content(content):
    """Strip boilerplate phrases, punctuation, digits and Latin letters.

    Args:
        content: Raw message text. Non-string values (for example the float
            NaN pandas uses for empty cells, or None) are treated as empty.

    Returns:
        The cleaned string; '' when nothing is left or the input was not text.
    """
    import re

    # Empty CSV cells are not strings; re.sub would raise TypeError on them.
    # Returning '' lets the caller's "drop empty rows" filter remove them.
    if not isinstance(content, str):
        return ''

    # Sharing boilerplate and common punctuation.
    content = re.sub(r'分享图片|分享视频|微博视频|的微博视频|#|，|,|。|；|、|！|\+|=|-|&|:|%|\?', '', content)
    content = re.sub(r'[："\'/|……]', '', content)
    # Digits, spaces and full stops.
    content = re.sub(r'\d+', '', content).replace(' ', '').replace('.', '')
    # Brackets and underscores.
    content = re.sub(r'[_\[\]【】<>《》（）\(\)]', '', content)
    # Latin letters.
    content = re.sub(r'[a-zA-Z]+', '', content)
    return content

# Clean the raw text in parallel.
df_label['message'] = df_label['message'].parallel_apply(clean_content)

# Drop rows whose text became empty, then keep only texts longer than 4 characters.
df_label = df_label[df_label['message'] != '']
df_label = df_label[df_label['message'].str.len() > 4]

# Binarise the target: sentiment 6 becomes class 0, every other value class 1.
df = df_label.copy()
df['sentiment'] = np.where(df['sentiment'] == 6, 0, 1)

# Balance the two classes by randomly dropping the surplus of the larger one.
# The sample is seeded so the balanced set is reproducible (like the split
# below), and the larger class is detected instead of assumed, so a majority
# of class 0 no longer produces a negative sample size.
class_counts = df['sentiment'].value_counts()
if len(class_counts) == 2:
    majority_label = class_counts.idxmax()
    surplus = int(class_counts.max() - class_counts.min())
    surplus_index = df[df['sentiment'] == majority_label].sample(surplus, random_state=42).index
    df = df.drop(surplus_index)

# Stratified 80/20 train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(df['message'], df['sentiment'], test_size=0.2, random_state=42, stratify=df['sentiment'])

# Load the BERT configuration, tokenizer and model from the local checkpoint directory.
model_name = 'bert-base-chinese'
pretrained_dir = 'model/' + model_name
config = BertConfig.from_pretrained(pretrained_dir)
tokenizer = BertTokenizer.from_pretrained(pretrained_dir)
bert_model = BertForSequenceClassification.from_pretrained(pretrained_dir, num_labels=2)

# BERT-BiLSTM-CRF model
class BertBiLSTMCRF(torch.nn.Module):
    """BERT encoder followed by a bidirectional LSTM, a linear layer and a CRF.

    Args:
        bert_model: A Hugging Face BERT model. Both a bare encoder and a model
            with a task head are accepted, because the token representations
            are read from the returned hidden states.
        num_labels: Number of CRF tags.
        hidden_dim: Hidden size of each LSTM direction.
    """

    def __init__(self, bert_model, num_labels, hidden_dim):
        super().__init__()
        self.bert = bert_model
        self.lstm = LSTM(
            input_size=bert_model.config.hidden_size,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward LSTM states are concatenated, hence hidden_dim * 2.
        self.fc = torch.nn.Linear(hidden_dim * 2, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when labels are given, otherwise decoded tags.

        Args:
            input_ids: Token ids, one row per sequence.
            attention_mask: 1 for real tokens and 0 for padding.
            labels: Optional targets. Either one tag per token, or one label
                per sequence, which is then applied to every token.

        Returns:
            The negated CRF log-likelihood when labels are given; otherwise
            the result of CRF.decode (one tag list per sequence).
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        # outputs[0] is only the token-level representation for a bare encoder;
        # for a model with a classification head it is the logits. The last
        # hidden state is the per-token encoder output in both cases.
        sequence_output = outputs.hidden_states[-1]
        lstm_output, _ = self.lstm(sequence_output)
        emissions = self.fc(lstm_output)

        # Boolean mask; byte masks are deprecated for torch.where-style selection.
        mask = attention_mask.bool()
        if labels is None:
            return self.crf.decode(emissions, mask=mask)

        # The CRF needs one tag per token. Broadcast sentence-level labels over
        # the sequence so the model can be trained on sentence targets.
        if labels.dim() == 1:
            labels = labels.unsqueeze(1).expand(-1, emissions.size(1)).contiguous()
        return -self.crf(emissions, labels, mask=mask)

# Earlier configuration, kept for reference:
# model = BertBiLSTMCRF(bert_model, num_labels=2, hidden_dim=256)
# NOTE(review): num_labels=3 although the preprocessing above maps sentiment
# to only 0 and 1 — confirm whether the third tag is intentional.
model = BertBiLSTMCRF(bert_model, num_labels=3, hidden_dim=256)

# Tokenisation helper
def get_tokens(text, tokenizer, max_seq_length, add_special_tokens=True):
    """Encode text to a fixed-length id list plus its attention mask.

    Args:
        text: The string to encode.
        tokenizer: Tokenizer exposing encode() and, optionally, pad_token_id.
        max_seq_length: Exact length of the returned lists; longer inputs are
            truncated and shorter ones padded.
        add_special_tokens: Forwarded to tokenizer.encode.

    Returns:
        (input_ids, attention_mask), both lists of length max_seq_length. The
        mask is 1 for every token that is not the padding token, else 0.

    Raises:
        ValueError: If the tokenizer did not return max_seq_length ids.
    """
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    input_ids = tokenizer.encode(
        text,
        add_special_tokens=add_special_tokens,
        truncation=True,
        max_length=max_seq_length,
        padding='max_length',
    )
    # Compare against the tokenizer's own pad id rather than assuming it is 0.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0
    attention_mask = [int(token_id != pad_id) for token_id in input_ids]
    # Explicit check instead of assert, which is stripped under `python -O`.
    if len(input_ids) != max_seq_length:
        raise ValueError(
            f'expected {max_seq_length} token ids, got {len(input_ids)}'
        )
    return (input_ids, attention_mask)

# Tokenise both splits to a fixed length of 150 ids.
X_train_tokens = X_train.apply(lambda text: get_tokens(text, tokenizer, 150))
X_test_tokens = X_test.apply(lambda text: get_tokens(text, tokenizer, 150))

# Each element is an (input_ids, attention_mask) pair; split the pairs into
# separate long tensors and add the labels.
input_ids_train = torch.tensor([ids for ids, _ in X_train_tokens.values], dtype=torch.long)
input_mask_train = torch.tensor([mask for _, mask in X_train_tokens.values], dtype=torch.long)
label_ids_train = torch.tensor(Y_train.values, dtype=torch.long)

input_ids_test = torch.tensor([ids for ids, _ in X_test_tokens.values], dtype=torch.long)
input_mask_test = torch.tensor([mask for _, mask in X_test_tokens.values], dtype=torch.long)
label_ids_test = torch.tensor(Y_test.values, dtype=torch.long)

# Wrap the tensors as datasets yielding (input_ids, attention_mask, label).
train_dataset = TensorDataset(input_ids_train, input_mask_train, label_ids_train)
test_dataset = TensorDataset(input_ids_test, input_mask_test, label_ids_test)

import os

# Training hyper-parameters
train_batch_size = 64
num_train_epochs = 3
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
# scheduler.step() runs once per batch, so the schedule has to span
# batches-per-epoch * epochs. Dividing instead made the learning rate reach
# zero after a fraction of the first epoch.
t_total = len(train_dataloader) * num_train_epochs

# Optimiser and learning-rate schedule
learning_rate = 5e-5
adam_epsilon = 1e-8
warmup_steps = 0
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# keeps that class's default instead of torch's 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

# Device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device('cpu')
# Move the model once, not on every step.
model.to(device)

# Training loop
model.train()
train_iterator = trange(num_train_epochs, desc="Epoch")
for epoch in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        optimizer.zero_grad()
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        loss = model(**inputs)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

# Save the model. `model` is a plain torch.nn.Module, which has no
# save_pretrained(); store its state dict (and the tokenizer files) instead.
save_dir = 'My_Model/weibo-bert-bilstm-crf-model'
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(save_dir, 'pytorch_model.bin'))
tokenizer.save_pretrained(save_dir)

# Evaluation
test_batch_size = 64
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=test_batch_size)

model.to(device)
model.eval()
preds = []
out_label_ids = []

for batch in tqdm(test_dataloader, desc="评估中"):
    input_ids, attention_mask, labels = (t.to(device) for t in batch)
    with torch.no_grad():
        # Call without labels: with labels the model returns the loss, not
        # predictions. Without them it returns one decoded tag list per sequence.
        tag_paths = model(input_ids=input_ids, attention_mask=attention_mask)
    # Reduce each token-level path to one sentence label by majority vote
    # (ties resolve to the smaller tag id).
    preds.extend(int(np.bincount(path).argmax()) for path in tag_paths)
    out_label_ids.extend(labels.cpu().tolist())

preds = np.array(preds)
out_label_ids = np.array(out_label_ids)

# scikit-learn metrics take (y_true, y_pred).
acc_score = accuracy_score(out_label_ids, preds)
# Stored under a new name so the imported f1_score function is not overwritten.
# NOTE(review): f1_score defaults to binary averaging; if the CRF ever emits a
# tag other than 0/1, an explicit `average=` is required.
f1_value = f1_score(out_label_ids, preds)
print('测试集中的Accuracy分数: ', acc_score)
print('测试集中的F1分数: ', f1_value)


INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at model/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

: 

### 预测

In [21]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from tqdm import tqdm

# Load the data
df_origin = pd.read_csv(r'D:/Code/Python/senetiment/data/上海2019_2023年按月分类文件/20191201_20200101 57356 条.csv')
# Keep only the first 100 rows. Copy them so the column assignments below act
# on an independent frame rather than on a slice of the loaded data.
df_origin = df_origin.head(100).copy()
df_origin['label'] = 0  # placeholder label, initialised to 0 for every row

# Replace line breaks in the raw text with spaces.
df_origin['text'] = df_origin.content.str.replace('\n', ' ')

def clean_content(content, place):
    """Remove the place name, boilerplate, punctuation, digits and Latin letters.

    Args:
        content: Raw post text. Non-string values (for example NaN from an
            empty cell) are treated as empty text.
        place: Location name to delete from the text. Non-string or empty
            values are ignored.

    Returns:
        The cleaned string; '' when the input was not text or nothing is left.
    """
    import re

    # re.sub / str.replace raise TypeError on NaN cells; treat them as empty.
    if not isinstance(content, str):
        return ''
    # Remove the place name. A missing location is a float NaN, not a string.
    if isinstance(place, str) and place:
        content = content.replace(place, '')
    # Sharing boilerplate and common punctuation.
    content = re.sub(r'分享图片|分享视频|微博视频|的微博视频|#|，|,|。|；|、|！|\+|=|-|&|:|%|\?', '', content)
    content = re.sub(r'[："\'/|……]', '', content)
    # Digits, spaces and full stops.
    content = re.sub(r'\d+', '', content).replace(' ', '').replace('.', '')
    # Brackets and underscores.
    content = re.sub(r'[_\[\]【】<>《》（）\(\)]', '', content)
    # Latin letters.
    content = re.sub(r'[a-zA-Z]+', '', content)
    return content

# Clean every row's text, removing that row's own location name from it.
def _clean_row(row):
    return clean_content(row['text'], row['content_location_name'])

df_origin['text'] = df_origin.apply(_clean_row, axis=1)

# Keep rows whose text is non-empty and longer than 4 characters.
keep_rows = (df_origin['text'] != '') & (df_origin['text'].str.len() > 4)
df_origin = df_origin[keep_rows]


print(df_origin)

X_pred = df_origin['text']
Y_pred = df_origin['label']

# Load the BERT model and tokenizer
model_name = 'bert-base-chinese'
# Alternative: load the fine-tuned checkpoint instead of the base weights.
# tokenizer = BertTokenizer.from_pretrained('My_Model/weibo-bert-rubbish-model')
# model = BertForSequenceClassification.from_pretrained('My_Model/weibo-bert-rubbish-model')
# NOTE(review): this loads the base checkpoint; the log recorded for this cell
# reports the classifier weights as newly initialised, so predictions come
# from an untrained head — confirm this is intended.
model = BertForSequenceClassification.from_pretrained('model/' + model_name)
tokenizer = BertTokenizer.from_pretrained('model/' + model_name)

# Tokenisation helper
def get_tokens(text, tokenizer, max_seq_length, add_special_tokens=True):
    """Encode text to a fixed-length id list plus its attention mask.

    Args:
        text: The string to encode.
        tokenizer: Tokenizer exposing encode() and, optionally, pad_token_id.
        max_seq_length: Exact length of the returned lists; longer inputs are
            truncated and shorter ones padded.
        add_special_tokens: Forwarded to tokenizer.encode.

    Returns:
        (input_ids, attention_mask), both lists of length max_seq_length. The
        mask is 1 for every token that is not the padding token, else 0.

    Raises:
        ValueError: If the tokenizer did not return max_seq_length ids.
    """
    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    input_ids = tokenizer.encode(
        text,
        add_special_tokens=add_special_tokens,
        truncation=True,
        max_length=max_seq_length,
        padding='max_length',
    )
    # Compare against the tokenizer's own pad id rather than assuming it is 0.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0
    attention_mask = [int(token_id != pad_id) for token_id in input_ids]
    # Explicit check instead of assert, which is stripped under `python -O`.
    if len(input_ids) != max_seq_length:
        raise ValueError(
            f'expected {max_seq_length} token ids, got {len(input_ids)}'
        )
    return (input_ids, attention_mask)

# Tokenise the texts to a fixed length of 150 ids.
X_pred_tokens = X_pred.apply(lambda text: get_tokens(text, tokenizer, 150))

# Split the (input_ids, attention_mask) pairs into tensors and add the labels.
input_ids_pred = torch.tensor([ids for ids, _ in X_pred_tokens.values], dtype=torch.long)
input_mask_pred = torch.tensor([mask for _, mask in X_pred_tokens.values], dtype=torch.long)
label_pred = torch.tensor(Y_pred.values, dtype=torch.long)
pred_dataset = TensorDataset(input_ids_pred, input_mask_pred, label_pred)

# Iterate in the original row order, 256 samples per batch.
pred_batch_size = 256
pred_sampler = SequentialSampler(pred_dataset)
pred_dataloader = DataLoader(pred_dataset, batch_size=pred_batch_size, sampler=pred_sampler)

# Prediction
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Collect per-batch logits and concatenate once; repeated np.append copies
# the whole array on every batch.
batch_logits = []
for batch in tqdm(pred_dataloader, desc="Predict"):
    input_ids, attention_mask = (t.to(device) for t in batch[:2])
    with torch.no_grad():
        # The placeholder labels are not passed: they were only used to
        # compute a loss that was discarded. Without labels the first output
        # element is the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    batch_logits.append(outputs[0].detach().cpu().numpy())

preds = np.concatenate(batch_logits, axis=0)

prob = torch.nn.functional.softmax(torch.tensor(preds), dim=1)  # class probabilities per sample
preds = np.argmax(preds, axis=1)  # predicted class per sample
df_origin['ad_prob'] = prob[:, 1].tolist()  # probability of class 1
df_origin['pred'] = preds  # final predicted class

# print(df_origin.sample(10))

# print(df_origin.sample(10))


             mid        lat         lng  user_name  \
0   4.444478e+15  31.185350  121.434467      妖洞洞柒柒   
1   4.444478e+15  31.408190  121.252110   牛哄哄-0902   
2   4.444478e+15  31.239872  121.309017      空想家拥有   
3   4.444478e+15  31.274380  121.467290    BeingIt   
4   4.444478e+15  31.210000  121.561800       燕南畿东   
..           ...        ...         ...        ...   
95  4.444488e+15  31.128221  121.447719    隔着机窗看明月   
96  4.444488e+15  31.229090  121.458860        青子呐   
97  4.444488e+15  31.203160  121.474480  混在郊区的围观群众   
98  4.444488e+15  31.317290  121.454300       明渡政七   
99  4.444488e+15  31.189467  121.703424    苏苏的清浅时光   

                         user_link verify_typ  \
0   https://weibo.com/u/1890269977       没有认证   
1   https://weibo.com/u/6589039527       没有认证   
2   https://weibo.com/u/3799023812       没有认证   
3   https://weibo.com/u/1733443685       黄V认证   
4      https://weibo.com/dspxiaoyu       没有认证   
..                             ...        ...   
95  http

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at model/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predict: 100%|██████████| 1/1 [00:16<00:00, 16.95s/it]


In [22]:
# Write the frame, including the prediction columns, to CSV without the index,
# encoded as UTF-8 with a byte-order mark ('utf-8-sig').
df_origin.to_csv('sentiment_analysis_data.csv', index=False,encoding='utf-8-sig')

### Keras下的LSTM

In [None]:
import pickle
import numpy as np
import pandas as pd
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense, Embedding,Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset: column 'evaluation' holds the text, column 'label' the class.
def load_data(filepath,input_shape=20):
    """Read the GBK-encoded CSV and build character-level model inputs.

    Side effects: writes the character->id mapping to 'word_dict.pk' and the
    label->id mapping to 'label_dict.pk' in the current directory.

    Args:
        filepath: Path of the CSV file with 'evaluation' and 'label' columns.
        input_shape: Sequence length handed to pad_sequences (zero-padded at
            the end).

    Returns:
        Tuple (x, y, output_dictionary, vocab_size, label_size,
        inverse_word_dictionary): padded id sequences, one-hot labels,
        id->label mapping, number of distinct characters, number of labels,
        and id->character mapping.
    """
    df=pd.read_csv(filepath,encoding='gbk')

    # Labels in order of first appearance.
    labels=list(df['label'].unique())

    # Character-level vocabulary. Sorted so that character ids are the same
    # on every run; plain set iteration order for strings changes between
    # interpreter runs, which made the saved dictionary and any saved model
    # inconsistent across processes.
    vocabulary=sorted(set(''.join(df['evaluation'].unique())))

    # Character ids start at 1; 0 is reserved for padding.
    word_dictionary={char:idx for idx,char in enumerate(vocabulary,start=1)}
    with open('word_dict.pk','wb') as f:
        pickle.dump(word_dictionary,f)
    inverse_word_dictionary={idx:char for char,idx in word_dictionary.items()}
    label_dictionary={label:idx for idx,label in enumerate(labels)}
    with open('label_dict.pk','wb') as f:
        pickle.dump(label_dictionary,f)
    output_dictionary=dict(enumerate(labels))

    # Vocabulary size and number of label classes.
    vocab_size=len(word_dictionary)
    label_size=len(label_dictionary)

    # Map each text to character ids and pad with 0 up to input_shape.
    x=[[word_dictionary[char] for char in sent] for sent in df['evaluation']]
    x=pad_sequences(maxlen=input_shape,sequences=x,padding='post',value=0)

    # One-hot encode the labels in a single call, giving an array of shape
    # (number of samples, label_size).
    label_ids=[label_dictionary[label] for label in df['label']]
    y=np_utils.to_categorical(label_ids,num_classes=label_size)

    return x,y,output_dictionary,vocab_size,label_size,inverse_word_dictionary

# Build the model: Embedding + LSTM + Softmax
def create_LSTM(n_units,input_shape,output_dim,filepath):
    """Build and compile the Embedding -> LSTM -> Dropout -> Dense classifier.

    Also writes a diagram of the model to './model_lstm.png' and prints the
    model summary.

    Args:
        n_units: Number of LSTM units.
        input_shape: Input sequence length.
        output_dim: Embedding dimension.
        filepath: Dataset CSV, read to obtain the vocabulary and label sizes.

    Returns:
        The compiled Keras model.
    """
    # Pass the requested sequence length through instead of silently using
    # load_data's default of 20.
    _,_,_,vocab_size,label_size,_=load_data(filepath,input_shape)
    model=Sequential()
    # +1 because id 0 is reserved for padding, which mask_zero=True masks out.
    model.add(Embedding(input_dim=vocab_size+1,output_dim=output_dim,
                        input_length=input_shape,mask_zero=True))
    # The LSTM takes its input shape from the Embedding layer; the previous
    # input_shape=(number of samples, sequence length) argument was not a
    # valid per-sample shape for a non-first layer.
    model.add(LSTM(n_units))
    model.add(Dropout(0.2))
    model.add(Dense(label_size,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    plot_model(model,to_file='./model_lstm.png',show_shapes=True)
    # Print the model structure.
    model.summary()

    return model

# Train, save and evaluate the model
def model_train(input_shape,filepath,model_save_path):
    """Train the LSTM classifier, save it, and print its test accuracy.

    Args:
        input_shape: Sequence length used for padding and for the model input.
        filepath: Path of the dataset CSV.
        model_save_path: Where the trained model is saved.
    """
    # Hold out 10% of the samples for testing.
    loaded=load_data(filepath,input_shape)
    x,y,output_dictionary,vocab_size,label_size,inverse_word_dictionary=loaded
    train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.1,random_state=42)

    # Hyper-parameters; adjust as needed.
    n_units,batch_size,epochs,output_dim=100,32,5,20

    # Fit and save the model.
    lstm_model=create_LSTM(n_units,input_shape,output_dim,filepath)
    lstm_model.fit(train_x,train_y,epochs=epochs,batch_size=batch_size,verbose=1)
    lstm_model.save(model_save_path)

    # Predict the test samples one at a time, printing each result.
    N=test_x.shape[0]
    predict,label=[],[]
    for start in range(N):
        end=start+1
        print(f'start:{start}, end:{end}')
        # Rebuild the sentence from its non-padding character ids.
        sentence=''.join(inverse_word_dictionary[i] for i in test_x[start] if i!=0)
        y_predict=lstm_model.predict(test_x[start:end])
        print('y_predict:',y_predict)
        label_predict=output_dictionary[np.argmax(y_predict[0])]
        label_true=output_dictionary[np.argmax(test_y[start:end])]
        print(f'label_predict:{label_predict}, label_true:{label_true}')
        # Print the sentence with its true and predicted labels.
        print(sentence,label_true,label_predict)
        predict.append(label_predict)
        label.append(label_true)

    # Accuracy over the test set.
    acc=accuracy_score(predict,label)
    print('模型在测试集上的准确率:%s'%acc)

if __name__=='__main__':
    # Train on the 60000-sample sentiment corpus with sequences of length 180.
    model_train(
        input_shape=180,
        filepath='C:\\数据集\\情感分析60000\\all.csv',
        model_save_path='C:\\数据集\\情感分析60000\\corpus_model.h5',
    )

