## Distinguish fake & real news 真假新闻判别
- 对数据执行了一些分析
- 进行了一部分特征提取工程，并利用一些传统SML方法尝试进行分类
- 构建一个RNN网络，利用文本内容进行分类
- 载入transformers的预训练Bert，尝试完成分类任务

In [None]:
import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the two source tables: True.csv goes into real_news, Fake.csv into fake_news.
real_news = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake_news = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

## 数据整理
- 去除real news的文本里特有的前缀（报道来源），只针对新闻本身报道的内容文本进行分析

In [None]:
import re 
def removePrefix(text):
    """Strip a leading source prefix from an article body.

    If ``text`` starts with an uppercase ASCII letter, everything from the
    start up to and including the first "-" followed by a whitespace
    character (on the first line) is removed. Any other text is returned
    unchanged.
    """
    return re.sub(r"^([A-Z]).*?-\s", '', text)

# Strip the source prefix from every real-news body, then display one cleaned sample.
real_news.text = real_news.text.apply(removePrefix)
real_news.text[1]

- 为数据添加标记维度
- 去除部分空新闻（内容为空白但不为Null）
- 去除部分重复出现的新闻

In [None]:
# Label the two sources: 1 marks real news, 0 marks fake news.
real_news['valid'] = 1
fake_news['valid'] = 0

# Stack both tables and keep only the first occurrence of each (text, title, date) triple.
news_source = pd.concat([real_news, fake_news], axis=0)
news_source = news_source[~news_source[['text', 'title', 'date']].duplicated()]
# Turn whitespace-only bodies into NaN so that dropna() discards them.
news_source.text = news_source.text.apply(lambda body: body if body.strip() else np.nan)
news_source = news_source.dropna()
news_source = news_source.drop(columns=['subject'])
# Number of rows that are still exact duplicates of an earlier row.
news_source.duplicated().sum()

In [None]:
# Persist the cleaned table. to_csv is called with its defaults, so the index is written as well.
news_source.to_csv('./Source.csv')

## 特征提取
- 利用Rattle与pandas等工具对数据的一些分布进行观察后，尝试提取出一些有用的信息（较主观/直觉）

In [None]:
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re
import nltk
import string
import pandas as pd

# Reload the cleaned table and build the two token filters used by the feature helpers.
news_source = pd.read_csv('./Source.csv')
stop_words = set(stopwords.words('english'))  # NLTK English stop-word list
punctuations = set(string.punctuation)  # characters of string.punctuation

- 获取真假新闻的标题/文本中前十的高频词
- 将任意一条新闻的标题/文本内容中出现此两类高频词的次数当做特征，分别进行统计

In [None]:
def token_freq(df, feature, valid):
    """Return the most frequent tokens of one text column for one class.

    Rows of ``df`` whose ``valid`` column equals ``valid`` are selected. Their
    ``feature`` strings are joined with spaces, lower-cased, stripped of the
    characters in ``punctuations``, split on whitespace and filtered against
    ``stop_words``. The result is a Series with up to ten tokens ordered by
    descending count.
    """
    selected = df[df.valid == valid][feature].values
    corpus = ' '.join(selected).lower()
    stripped = ''.join(ch for ch in corpus if ch not in punctuations)
    kept = [tok for tok in stripped.split() if tok not in stop_words]
    top_ten = nltk.FreqDist(kept).most_common(10)
    return pd.DataFrame(top_ten)[0]

# Top tokens per class (1 = real, 0 = fake), first for titles and then for bodies.
real_title_freq, real_text_freq = (token_freq(news_source, column, 1) for column in ('title', 'text'))
fake_title_freq, fake_text_freq = (token_freq(news_source, column, 0) for column in ('title', 'text'))

In [None]:
def count_freq_token(text, freq_df):
    """Count the tokens of ``text`` that occur among the values of ``freq_df``.

    ``text`` is lower-cased, stripped of the characters in ``punctuations``
    and split on whitespace. Every token occurrence found in
    ``freq_df.values`` adds one to the returned count, so repeated tokens
    are counted repeatedly.
    """
    cleaned = ''.join(ch for ch in text.lower() if ch not in punctuations)
    frequent = freq_df.values
    return sum(1 for tok in cleaned.split() if tok in frequent)

- 即：标题中出现的真/假新闻标题高频词数目，文本中出现的真/假新闻文本高频词数目

In [None]:
# Title features: occurrences of the top fake/real *title* tokens in each title.
news_source['fake_title_token_freq'] = news_source.title.apply(lambda x : count_freq_token(x, fake_title_freq))
news_source['read_title_token_freq'] = news_source.title.apply(lambda x : count_freq_token(x, real_title_freq))
# Text features: occurrences of the top fake/real *text* tokens in each article body.
# These were previously counted over the title column, so they never looked at the body.
# The 'read_' spelling of the column names is kept because later cells select them by name.
news_source['fake_text_token_freq'] = news_source.text.apply(lambda x : count_freq_token(x, fake_text_freq))
news_source['read_text_token_freq'] = news_source.text.apply(lambda x : count_freq_token(x, real_text_freq))

- 查看部分标点的运用，此处选择了'？'与'！'两个通常带有浓烈感情色彩和引导性的符号

In [None]:
def countPunctuation(text):
    """Return ``(question_marks, exclamation_marks)`` counted in ``text``."""
    return text.count("?"), text.count("!")

In [None]:
# Count '?' and '!' per title and per body, then expand each (ques, exclam) tuple into two columns.
title_marks = news_source.title.apply(countPunctuation)
news_source[['title_ques_num', 'title_exclam_num']] = title_marks.apply(pd.Series)
text_marks = news_source.text.apply(countPunctuation)
news_source[['text_ques_num', 'text_exclam_num']] = text_marks.apply(pd.Series)

- 标题长度与标题长度占正文长度比，不过要注意如果新闻文本内容本身不完整，此项会受到较大影响

In [None]:
# Title length in characters, and its ratio to the body length in characters.
news_source['title_len'] = news_source.title.map(len)
news_source['title_ratio'] = news_source['title_len'] / news_source.text.map(len)

In [None]:
# Persist the table with the engineered features; the later model sections reload this file.
news_source.to_csv('./NewsAna.csv')

- 查看提取出的各个特征与其类别的关系，是否存在一定程度的正/负相关

In [None]:
# Correlation of every numeric column with the label. Only numeric columns are
# handed to corr(): the frame still holds the title/text/date strings at this
# point, and pandas 2.x raises on non-numeric columns instead of dropping them.
news_source.select_dtypes(include='number').corr().valid

In [None]:
# Keep only the label and the engineered features for the classical models.
news_source = news_source.drop(columns=['Unnamed: 0', 'title', 'text', 'date'])

In [None]:
# Shuffle the rows and rebuild a clean 0..n-1 index. The earlier bare
# `news_source.reset_index()` returned a new frame that was thrown away,
# so it had no effect.
news_source = news_source.sample(frac=1.).reset_index(drop=True)

## 一些常见SML模型预测
- 随机分为train与test集，利用提取出的特征进行分类
- 采用RandomForest，DecisionTree和LinearSVC三个模型

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Engineered feature columns fed to the classical models.
feature_columns = [
    'fake_title_token_freq', 'read_title_token_freq', 'fake_text_token_freq',
    'read_text_token_freq', 'title_ques_num', 'title_exclam_num', 'text_ques_num',
    'text_exclam_num', 'title_len', 'title_ratio',
]

# 80/20 split with a fixed seed; 'valid' is the target column.
train_set, test_set = train_test_split(news_source, test_size=0.2, random_state=7)
train_x, train_y = train_set[feature_columns], train_set['valid']
test_x, test_y = test_set[feature_columns], test_set['valid']

In [None]:
def model_report(model, tar_x, tar_y):
    """Print F1, accuracy and the confusion matrix of ``model`` on one split.

    ``model`` only needs a ``predict`` method; ``tar_x`` are the features and
    ``tar_y`` the true labels of the split being reported.
    """
    predicted = model.predict(tar_x)
    print("f1-score: ", f1_score(tar_y, predicted))
    print("accuracy: ", accuracy_score(tar_y, predicted))
    print("confusion matrix:\n", confusion_matrix(tar_y, predicted))

In [None]:
# Random forest with a fixed seed; fit() hands back the fitted estimator.
randomForest = RandomForestClassifier(random_state=7).fit(train_x, train_y)
randomForest

In [None]:
# Report the random forest on the training split and then on the testing split.
for header, features, target in (
    ("RF on training set:", train_x, train_y),
    ("\nRF on testing set:", test_x, test_y),
):
    print(header)
    model_report(randomForest, features, target)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded like the random forest and the train/test split, so that repeated
# runs of the notebook grow the same tree and report the same scores.
decisionTree = DecisionTreeClassifier(random_state=7)
decisionTree.fit(train_x, train_y)

In [None]:
# Report the decision tree on the training split and then on the testing split.
for header, features, target in (
    ("tree on training set:", train_x, train_y),
    ("\ntree on testing set:", test_x, test_y),
):
    print(header)
    model_report(decisionTree, features, target)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with an L2 penalty and a raised iteration cap. The seed matches the
# other models so that the solver's internal shuffling is reproducible.
linearSVC = LinearSVC(max_iter=5000, penalty='l2', random_state=7)
linearSVC.fit(train_x, train_y)

In [None]:
# Report the linear SVM on the training split and then on the testing split.
for header, features, target in (
    ("LinearSVC on training set:", train_x, train_y),
    ("\nLinearSVC on testing set:", test_x, test_y),
):
    print(header)
    model_report(linearSVC, features, target)

## 预训练Bert模型
- 采用huggingface的预训练模型bert-base-uncased
- Tokenizer也是他们的

In [None]:
from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler
import torch

# Training hyper-parameters for the BERT classifier.
batch_size = 64
epoch_num = 4
max_seq_length = 128  # token sequences are padded/truncated to this length

# Reload the feature table, shuffle it and rebuild a clean 0..n-1 index. The
# earlier bare `news_source.reset_index()` result was discarded and did nothing.
news_source = pd.read_csv('./NewsAna.csv')
news_source = news_source.sample(frac=1.).reset_index(drop=True)

# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model_path = "bert-base-uncased"

In [None]:
class NewsModel(torch.nn.Module):
    """Binary news classifier: a pre-trained encoder followed by a small MLP head.

    Args:
        bert_model: name or path passed to ``AutoModel.from_pretrained``.
        num_class: width of the final linear layer (1 for a single probability).
    """

    def __init__(self, bert_model=bert_model_path, num_class=1):
        super().__init__()
        # Pre-trained encoder (from huggingface).
        self.bert_layer = AutoModel.from_pretrained(pretrained_model_name_or_path=bert_model)
        # Reuse the configuration the encoder was built with instead of loading
        # it a second time through AutoConfig.
        self.bert_config = self.bert_layer.config
        self.mid_dim = self.bert_config.hidden_size
        # Classification head; extra hand-crafted features could be concatenated
        # here in the future. The final sigmoid maps each score into (0, 1).
        self.output = torch.nn.Sequential(
            torch.nn.Linear(self.mid_dim, self.mid_dim//2),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(self.mid_dim//2, self.mid_dim),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(self.mid_dim, num_class),
            torch.nn.Sigmoid()
        )

    def forward(self, input_ids, attn_mask=None):
        """Return the head's sigmoid output for a batch of token ids.

        ``attn_mask`` is forwarded to the encoder as ``attention_mask``.
        """
        # Element [1] of the encoder output feeds the head
        # (presumably the pooled sentence representation for BERT - confirm).
        bert_out = self.bert_layer(input_ids=input_ids, attention_mask=attn_mask)[1]
        return self.output(bert_out)

- 利用新闻的text文本内容进行分析
- 最大长度限制在128

In [None]:
def covertTokenFormat(df, bert_model_path, max_seq_len):
    """Tokenise the ``text`` column of ``df`` and build the label tensor.

    Returns:
        The tokenizer output (PyTorch tensors, every sequence padded or
        truncated to ``max_seq_len``) and a float tensor built from
        ``df.valid``.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=bert_model_path)
    encoded = tokenizer(
        df.text.tolist(),
        padding='max_length',
        max_length=max_seq_len,
        truncation=True,
        return_tensors="pt",
    )
    targets = torch.tensor(df.valid.values, dtype=torch.float)
    return encoded, targets

In [None]:
def news_bert_report(pred, label):
    """Print and return the F1 score and accuracy of thresholded predictions.

    Each entry of ``pred`` is mapped to class 1 when it is at least 0.5 and to
    class 0 otherwise, then compared with ``label``.
    """
    tar_y = label.squeeze()
    pred_y = [1 if score >= 0.5 else 0 for score in pred.squeeze()]
    f1 = f1_score(tar_y, pred_y)
    print("f1-score: ", f1)
    acc = accuracy_score(tar_y, pred_y)
    print("accuracy: ", acc)
    return f1, acc

In [None]:
# Create the classifier on the selected device, and its optimiser.
news_model = NewsModel(bert_model=bert_model_path).to(device)
optimiser = AdamW(news_model.parameters(), lr=1e-5)
# Prepare the train/test split (80/20, fixed seed).
train_set, test_set = train_test_split(news_source, test_size=0.2, random_state=7)
# Training DataLoader: token ids, attention masks and labels, drawn in random order.
text_tokens, labels = covertTokenFormat(train_set, bert_model_path, max_seq_length)
train_data = TensorDataset(text_tokens.input_ids, text_tokens.attention_mask, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

- 在训练中，将每个epoch后的模型进行保存

In [None]:
from torch.nn import functional as F

def train_bert(bert, optimiser, train_dataloader, save_check_point=True):
    """Fine-tune ``bert`` with binary cross-entropy and report per-epoch metrics.

    The number of epochs (``epoch_num``) and the target ``device`` are read
    from module scope.

    Args:
        bert: model called as ``bert(token_ids, attn_mask)`` that returns one
            probability per sample.
        optimiser: optimiser already bound to the model parameters.
        train_dataloader: yields ``(token_ids, attn_mask, label)`` batches.
        save_check_point: when true, the state dict is saved after every epoch
            to ``./bert_model_<epoch>.weights``.

    Returns:
        The trained model.
    """
    bert = bert.to(device)
    bert.train()
    for epoch in range(epoch_num):
        epoch_loss = 0.0
        pred_batches = []
        label_batches = []
        for batch, (token_ids, attn_mask, label) in enumerate(train_dataloader):
            # Keep every tensor on the same device as the model.
            token_ids = token_ids.to(device)
            attn_mask = attn_mask.to(device)
            label = label.to(device).reshape(-1)
            # Flatten explicitly: squeeze() would collapse a batch of one to a
            # 0-d tensor, which no longer matches the label shape and cannot
            # be concatenated.
            outputs = bert(token_ids, attn_mask).reshape(-1)
            loss = F.binary_cross_entropy(outputs, label)
            # Back-propagate and update the parameters.
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            batch_loss = loss.item()
            epoch_loss += batch_loss
            # Detach before storing so the autograd graph of every batch is
            # not kept alive for the whole epoch.
            pred_batches.append(outputs.detach().cpu())
            label_batches.append(label.detach().cpu())
            if batch % 50 == 0:
                print("Current batch loss :", batch_loss)
        print("Now epoch :", epoch+1, " Total epoch loss is: ", epoch_loss)
        if pred_batches:
            # One concatenation per epoch instead of one per batch.
            news_bert_report(torch.cat(pred_batches).numpy(), torch.cat(label_batches).numpy())
        if save_check_point:
            torch.save({'model_state_dict': bert.state_dict()}, './bert_model_' + str(epoch) + '.weights')
    return bert

In [None]:
# Fine-tune the classifier; checkpoints are written after every epoch by default.
news_model = train_bert(bert=news_model, optimiser=optimiser, train_dataloader=train_dataloader)

- 模型效能评估，注意内存溢出

In [None]:
# Tokenise the held-out split; a small batch size keeps memory use low.
test_tokens, test_labels = covertTokenFormat(test_set, bert_model_path, max_seq_length)
test_data = TensorDataset(test_tokens.input_ids, test_tokens.attention_mask, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=16)

# Inference only: switch to eval mode and disable gradient tracking.
news_model.eval()
pred_batches = []
label_batches = []
with torch.no_grad():
    for token_ids, attn_mask, label in test_dataloader:
        outputs = news_model(token_ids.to(device), attn_mask.to(device))
        # reshape(-1) keeps a final batch of one as a 1-d tensor; squeeze()
        # would turn it into a 0-d tensor that torch.cat rejects.
        pred_batches.append(outputs.cpu().reshape(-1))
        label_batches.append(label.reshape(-1))

pred_lis = torch.cat(pred_batches)
label_lis = torch.cat(label_batches)
news_bert_report(pred_lis.numpy(), label_lis.numpy())

## RNN模型构建（GRU)
- 使用了GloVe的词向量进行Embedding
- 进行变长输入的padding，使其能够以Batch为单位放入GPU处理
- 这里使用最终实际位置的hidden output进行句子描述，注意拼接(bidirectional=True时)
- 标明了batch_first=True，不转置输入矩阵的前两维
- 有利用multi-head attention，注意输入的词向量维度要能整除head num

In [None]:
import torch
import string
import torch.nn.utils.rnn as rnn_utils
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

import pandas as pd
# Reload the feature table, shuffle it and rebuild a clean 0..n-1 index. The
# earlier bare `news_source.reset_index()` result was discarded and did nothing.
news_source = pd.read_csv('./NewsAna.csv')
news_source = news_source.sample(frac=1.).reset_index(drop=True)

# Token filters and the pre-trained 50-dimensional GloVe 6B vectors.
stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)
cache_dir = './glove'
glove = GloVe(name='6B', dim=50, cache=cache_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyper-parameters of the GRU model.
epoch_num = 40
batch_size = 128
lr = .001
input_dim = 50  # equals the GloVe vector size above
num_heads = 5  # input_dim must be divisible by the number of attention heads
output_dim = 1
gru_num_layers = 2

- 添加一个简单的AttentionBlock层
- 非常简单，没有用Layer Norm什么的

In [None]:
class AttentionBlock(torch.nn.Module):
    """Minimal multi-head self-attention block with residual connections.

    Inputs and outputs are laid out as ``(batch, sequence, feature)``. There
    is no layer normalisation or feed-forward sub-layer.

    Args:
        input_dim: feature size; must be divisible by ``attn_heads``.
        attn_heads: number of attention heads.
    """

    def __init__(self, input_dim, attn_heads):
        super().__init__()
        self.query = torch.nn.Linear(input_dim, input_dim, bias=False)
        self.key = torch.nn.Linear(input_dim, input_dim, bias=False)
        self.value = torch.nn.Linear(input_dim, input_dim, bias=False)
        # batch_first=True matches the (batch, seq, dim) tensors this block is
        # fed. With the default sequence-first layout the batch axis is read as
        # the sequence axis, so samples of one batch attend to each other and a
        # prediction depends on which other samples share its batch.
        self.mul_attn = torch.nn.MultiheadAttention(input_dim, attn_heads, batch_first=True)
        self.output = torch.nn.Linear(input_dim, input_dim, bias=False)

    def forward(self, x):
        """Apply self-attention to ``x`` and return a tensor of the same shape."""
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)
        # First residual around the attention, second around the projection.
        attentions = self.mul_attn(query, key, value)[0] + x
        return self.output(attentions) + x
        

class NewsGRUModel(torch.nn.Module):
    """GRU sentence encoder followed by an attention block and an MLP head.

    The final hidden state of the last GRU layer (both directions concatenated
    when bidirectional) summarises the sequence. It is passed through
    ``AttentionBlock``, the optional hidden linear layers and the output layer.

    Args:
        input_dim: size of each input vector; must be even when
            ``bidirectional`` is true, because each direction gets half of it.
        output_dim: size of the final output.
        vocab_size: when non-zero, an embedding table of this size is created
            for integer-token inputs.
        num_heads: attention heads; ``input_dim`` must be divisible by it.
        gru_num_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout rate between GRU layers and after each hidden layer.
        hidden_layers: widths of the hidden linear layers; an empty or ``None``
            value connects the attention output directly to the output layer.
    """

    def __init__(self, input_dim, output_dim, vocab_size=0, num_heads=8, gru_num_layers=1, bidirectional=True, dropout=.1, hidden_layers=(128, 64, 128)):
        super().__init__()
        if bidirectional and input_dim % 2 != 0:
            # Two directions of input_dim // 2 each would not add up to input_dim.
            raise ValueError("input_dim must be even when bidirectional=True")
        self.input_dim = input_dim
        self.hidden_dim = input_dim // 2 if bidirectional else input_dim
        self.output_dim = output_dim
        self.head_num = num_heads
        self.gru_num_layers = gru_num_layers
        self.vocab_size = vocab_size
        self.bidirectional = bidirectional
        # Embedding (only needed when the inputs are token ids).
        if self.vocab_size != 0:
            self.embed = torch.nn.Embedding(vocab_size, input_dim)
        # GRU stack. Inter-layer dropout only exists with more than one layer;
        # passing 0 otherwise avoids PyTorch's warning without changing results.
        self.gru_layer = torch.nn.GRU(
            input_size=self.input_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.gru_num_layers,
            bidirectional=self.bidirectional,
            batch_first=True,
            dropout=dropout if self.gru_num_layers > 1 else 0.,
        )
        # Attention over the sentence summary.
        self.attention = AttentionBlock(self.input_dim, self.head_num)
        # Feed-forward layers; the GRU summary keeps the width input_dim.
        if hidden_layers:
            widths = [self.input_dim, *hidden_layers]
            self.hidden_layer_list = torch.nn.ModuleList(
                [torch.nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(widths, widths[1:])]
            )
            for layer in self.hidden_layer_list:
                torch.nn.init.kaiming_normal_(layer.weight.data)
            self.hidden_out_dim = hidden_layers[-1]
        else:
            self.hidden_layer_list = torch.nn.ModuleList()
            self.hidden_out_dim = self.input_dim
        # Output layer.
        self.output = torch.nn.Linear(self.hidden_out_dim, self.output_dim)
        # Shared activation and dropout for the feed-forward layers.
        self.activate = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x, x_len, pretrained_embed=False):
        """Return raw scores of shape ``(batch, 1, output_dim)``.

        Args:
            x: padded batch, batch dimension first. Token ids when the
                embedding table is used, otherwise pre-computed vectors.
            x_len: true length of every sequence in the batch.
            pretrained_embed: set to true when ``x`` already holds vectors.
        """
        if not (pretrained_embed or self.vocab_size == 0):
            x = self.embed(x)
        # Pack so that the GRU stops at the true end of every sequence.
        x = rnn_utils.pack_padded_sequence(x, x_len, batch_first=True, enforce_sorted=False)
        _, hidden_info = self.gru_layer(x)
        # hidden_info is (num_layers * num_directions, batch, hidden_dim),
        # ordered layer by layer, so the last layer sits at the end. Indexing
        # [0] and [1] picked the first layer and ignored any stacked layers.
        if self.bidirectional:
            fin_out = torch.cat([hidden_info[-2], hidden_info[-1]], dim=1)
        else:
            fin_out = hidden_info[-1]
        # (batch, dim) -> (batch, 1, dim): a one-step sequence for the attention.
        fin_out = fin_out.unsqueeze(dim=1)
        output = self.attention(fin_out)
        # Feed-forward layers.
        for layer in self.hidden_layer_list:
            output = self.dropout(self.activate(layer(output)))
        # Output layer; no activation is applied here.
        return self.output(output)

- 忽略停用词和符号，这次使用新闻标题进行分类分析

In [None]:
# Convert every title into a sequence of pre-trained GloVe word vectors.
def covertTextToGolveVec(df):
    """Return one tensor of GloVe vectors per title in ``df.title``.

    Titles are lower-cased and tokenised with ``word_tokenize``; tokens found
    in ``stop_words`` or ``punctuations`` are dropped before the lookup.
    """
    golve_vecs = []
    for title in df.title.values:
        tokens = word_tokenize(title.lower())
        # Build a new list rather than calling tokens.remove() inside the loop:
        # removing while iterating skipped the token after every removed one,
        # so consecutive stop words and punctuation were only partly filtered.
        kept = [
            token for token in tokens
            if token not in stop_words and token not in punctuations
        ]
        # Never produce an empty sequence (the vector lookup and the later
        # sequence packing both need at least one step). Fall back to the
        # unfiltered tokens, then to a single placeholder token.
        kept = kept or tokens or ['<unk>']
        golve_vecs.append(glove.get_vecs_by_tokens(kept))
    return golve_vecs

- 定义collate_fn函数，获取各句子长度并对其进行padding，以放入Tensor之中在GPU进行批处理

In [None]:
from torch.utils.data import Dataset, DataLoader

class NewsDataset(Dataset):
    """Map-style dataset pairing each sample with its label by position.

    Args:
        train_x: indexable collection of samples.
        train_y: indexable collection of labels with the same length.
    """

    def __init__(self, train_x, train_y):
        self.train_x = train_x
        self.train_y = train_y

    def __len__(self):
        return len(self.train_y)

    def __getitem__(self, idx):
        # Use the index as given. It used to be decremented first, so item 0
        # returned the last sample and every other item was shifted by one.
        return self.train_x[idx], self.train_y[idx]
    
def collate_fn(train_data):
    """Collate ``(sequence, label)`` pairs into one zero-padded batch.

    Returns:
        The sequences padded with zeros to the longest one (batch dimension
        first), the labels as a float tensor, and the list of original
        sequence lengths.
    """
    sequences, targets = zip(*train_data)
    lengths = [len(seq) for seq in sequences]
    padded = rnn_utils.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.Tensor(targets), lengths

In [None]:
# Hold out 20% for testing, then split the remainder 80/20 into training and validation.
learn_set, test_set = train_test_split(news_source, test_size=0.2, random_state=7)
train_set, val_set = train_test_split(learn_set, test_size=0.2, random_state=77)

# Training data: GloVe sequences built from the titles, labels from the 'valid' column.
news_vecs = covertTextToGolveVec(train_set)
label_vecs = train_set.valid.values
train_dataset = NewsDataset(news_vecs, label_vecs)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Validation data, prepared the same way.
val_news = covertTextToGolveVec(val_set)
val_labels = val_set.valid.values
val_dataset = NewsDataset(val_news, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Model, optimiser and loss. vocab_size is left at its default, so no embedding table is created.
news_gru = NewsGRUModel(input_dim=input_dim, output_dim=output_dim, num_heads=num_heads, gru_num_layers=gru_num_layers, bidirectional=True).to(device)
optimiser = torch.optim.Adam(news_gru.parameters(), lr=lr)
# NOTE(review): MSE is applied to the model's raw outputs (no sigmoid) against 0/1 labels - confirm this is intended.
loss_func = torch.nn.MSELoss()

In [None]:
def news_gru_report(pred, label):
    """Return F1, accuracy and confusion matrix of thresholded predictions.

    Each entry of ``pred`` is mapped to class 1 when it is at least 0.5 and to
    class 0 otherwise, then compared with ``label``.
    """
    tar_y = label.squeeze()
    pred_y = [1 if score >= 0.5 else 0 for score in pred.squeeze()]
    return (
        f1_score(tar_y, pred_y),
        accuracy_score(tar_y, pred_y),
        confusion_matrix(tar_y, pred_y),
    )

- 会将在Validation set上表现最佳的模型保存

In [None]:
def check_save_model(model, acc, his_acc):
    """Save the model's state dict unless ``acc`` is below an entry of ``his_acc``.

    The checkpoint is written to ``./news_gru_best_val.weights`` and replaces
    any earlier file at that path.
    """
    if any(acc < past for past in his_acc):
        return
    torch.save({'model_state_dict': model.state_dict()}, './news_gru_best_val.weights')

def _run_gru_epoch(model, loss_func, dataloader, optimiser=None):
    """Run one pass over ``dataloader``; update weights only if ``optimiser`` is given.

    Returns the summed batch loss and the concatenated (detached, CPU)
    predictions and labels as 1-d tensors.
    """
    total_loss = 0.0
    preds = []
    labels = []
    for data, label, length in dataloader:
        input_vec = data.to(device)
        # reshape(-1) keeps a batch of one as a 1-d tensor; squeeze() would
        # collapse it to 0-d, which torch.cat rejects.
        label = label.to(device).reshape(-1)
        pred = model(input_vec, length, True).reshape(-1)
        loss = loss_func(pred, label)
        if optimiser is not None:
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
        total_loss += loss.item()
        # Detach so the autograd graph of each batch can be freed.
        preds.append(pred.detach().cpu())
        labels.append(label.detach().cpu())
    if not preds:
        return total_loss, torch.Tensor(), torch.Tensor()
    return total_loss, torch.cat(preds), torch.cat(labels)


def train_gru(news_gru, optimiser, loss_func, train_dataloader, val_dataloader, save_best_cp=True):
    """Train the GRU model and track loss, accuracy and F1 on both splits.

    The number of epochs (``epoch_num``) and the target ``device`` are read
    from module scope. The model is in eval mode when this function returns.

    Args:
        news_gru: model called as ``news_gru(vectors, lengths, True)``.
        optimiser: optimiser bound to the model parameters.
        loss_func: loss applied to flattened predictions and labels.
        train_dataloader: yields ``(data, label, length)`` training batches.
        val_dataloader: yields ``(data, label, length)`` validation batches.
        save_best_cp: when true, after the first tenth of the epochs the model
            is handed to ``check_save_model`` with the validation accuracies.

    Returns:
        ``(model, train_loss, val_loss, train_acc, val_acc, train_f1, val_f1)``
        where every list has one entry per epoch. The final weights are also
        written to ``./news_gru_final.weights``.
    """
    news_gru = news_gru.to(device)
    train_loss, val_loss = [], []
    train_acc, val_acc = [], []
    train_f1, val_f1 = [], []
    for epoch in range(epoch_num):
        # Training pass; train() is re-enabled every epoch because the
        # validation pass below switches the model to eval mode.
        news_gru.train()
        epoch_loss, pred_lis, label_lis = _run_gru_epoch(
            news_gru, loss_func, train_dataloader, optimiser
        )
        train_loss.append(epoch_loss)
        f1, acc, cm = news_gru_report(pred_lis.numpy(), label_lis.numpy())
        train_acc.append(acc)
        train_f1.append(f1)
        # Validation pass: dropout off and no gradients, so the metrics used
        # to pick the best checkpoint are deterministic for given weights.
        news_gru.eval()
        with torch.no_grad():
            epoch_val, val_pred, val_label = _run_gru_epoch(
                news_gru, loss_func, val_dataloader
            )
        val_loss.append(epoch_val)
        vf1, vacc, vcm = news_gru_report(val_pred.numpy(), val_label.numpy())
        val_acc.append(vacc)
        val_f1.append(vf1)
        if save_best_cp and epoch > (epoch_num / 10):
            check_save_model(news_gru, vacc, val_acc)
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print("f1-score on Training set:", f1, ", Validation set:", vf1)
            print("accuracy on Training set:", acc, ", Validation set:", vacc)
            print("confusion matrix (Training set):\n",cm, "\nconfusion matrix (Validation set):\n", vcm)
            print("Current epoch:", epoch + 1, " Total loss:", epoch_loss, "\n")
    torch.save({'model_state_dict': news_gru.state_dict()}, './news_gru_final.weights')
    return news_gru, train_loss, val_loss, train_acc, val_acc, train_f1, val_f1

In [None]:
# Train the GRU model and keep the per-epoch histories for the plots below.
gru_history = train_gru(news_gru, optimiser, loss_func, train_dataloader, val_dataloader, save_best_cp=True)
news_gru, train_loss, val_loss, train_acc, val_acc, train_f1, val_f1 = gru_history

- 将训练过程的变化结果绘出

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One figure per metric, each overlaying the training and the validation curve.
for title, train_curve, val_curve in (
    ("The loss curve on training & validation set", train_loss, val_loss),
    ("The accuracy curve on training & validation set", train_acc, val_acc),
    ("The f1 curve on training & validation set", train_f1, val_f1),
):
    plt.figure(figsize=(16,5))
    sns.lineplot(data=train_curve)
    sns.lineplot(data=val_curve)
    plt.title(title)
    plt.show()

- 训练结果测试（跑完所有epochs的最终模型），同样注意内存溢出问题

In [None]:
# Held-out test data; a small batch size keeps memory use low.
test_news = covertTextToGolveVec(test_set)
test_labels = test_set.valid.values
test_dataset = NewsDataset(test_news, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

# Inference only: eval mode and no gradient tracking.
news_gru = news_gru.to(device)
news_gru.eval()
pred_batches = []
label_batches = []
with torch.no_grad():
    for data, label, length in test_dataloader:
        pred = news_gru(data.to(device), length, True)
        # reshape(-1) keeps a final batch of one as a 1-d tensor; squeeze()
        # would turn it into a 0-d tensor that torch.cat rejects.
        pred_batches.append(pred.cpu().reshape(-1))
        label_batches.append(label.reshape(-1))

pred_lis = torch.cat(pred_batches)
label_lis = torch.cat(label_batches)
f1, acc, cm = news_gru_report(pred_lis.numpy(), label_lis.numpy())
print("f1-score on testing set:", f1)
print("accuracy on testing set:", acc)
print("confusion matrix (testing set):\n",cm)

- 训练结果测试（Validation set上的最强模型）

In [None]:
# Restore the checkpoint that scored best on the validation set. map_location
# lets a checkpoint saved on a GPU be loaded on a CPU-only machine as well.
model_path = './news_gru_best_val.weights'
checkpoint = torch.load(model_path, map_location=device)
news_gru.load_state_dict(checkpoint['model_state_dict'])

# Inference only: eval mode and no gradient tracking.
news_gru = news_gru.to(device)
news_gru.eval()
pred_batches = []
label_batches = []
with torch.no_grad():
    for data, label, length in test_dataloader:
        pred = news_gru(data.to(device), length, True)
        # reshape(-1) keeps a final batch of one as a 1-d tensor; squeeze()
        # would turn it into a 0-d tensor that torch.cat rejects.
        pred_batches.append(pred.cpu().reshape(-1))
        label_batches.append(label.reshape(-1))

pred_lis = torch.cat(pred_batches)
label_lis = torch.cat(label_batches)
f1, acc, cm = news_gru_report(pred_lis.numpy(), label_lis.numpy())
print("f1-score on testing set:", f1)
print("accuracy on testing set:", acc)
print("confusion matrix (testing set):\n",cm)