# 载入数据

In [1]:
# 定义两个list分别存放两个板块的帖子数据
academy_titles = []
job_titles = []
with open('../Chapter 05/academy_titles.txt', encoding='utf8') as f:
    for l in f:  # 按行读取文件
        academy_titles.append(l.strip())  # strip 方法用于去掉行尾空格
with open('../Chapter 05/job_titles.txt', encoding='utf8') as f:
    for l in f:  # 按行读取文件
        job_titles.append(l.strip())  # strip 方法用于去掉行尾空格

In [2]:
data_list = []
for title in academy_titles:
    data_list.append([title, 0])

for title in job_titles:
    data_list.append([title, 1])

In [3]:
max_length = 0
for case in data_list:
    max_length = max(max_length, len(case[0])+2)
print(max_length)

77


# 划分数据

In [4]:
from sklearn.model_selection import train_test_split
train_list, dev_list = train_test_split(data_list,test_size=0.3,random_state=15,shuffle=True)

In [5]:
import os
import time
import random
import torch
import torch.nn.functional as F
from torch import nn
from tqdm import tqdm
import random

from transformers import get_linear_schedule_with_warmup, AdamW
from transformers import BertTokenizer, BertForSequenceClassification

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
max_train_epochs = 6
warmup_proportion = 0.05
gradient_accumulation_steps = 2
train_batch_size = 16
valid_batch_size = train_batch_size
test_batch_size = train_batch_size
data_workers= 2

learning_rate=1e-6
weight_decay=0.01
max_grad_norm=1.0
cur_time = time.strftime("%Y-%m-%d_%H:%M:%S")

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
class MyDataSet(torch.utils.data.Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, index):
        example = self.examples[index]
        title = example[0]
        label = example[1]
        r = tokenizer.encode_plus(title, max_length=max_length, padding="max_length")
        return title, label, index#, r['token_type_ids'], label, index

def the_collate_fn(batch):
    r = tokenizer([b[0] for b in batch], padding=True)
    input_ids = torch.LongTensor(r['input_ids'])
    attention_mask = torch.LongTensor(r['attention_mask'])
    label = torch.LongTensor([b[1] for b in batch])
    indexs = [b[2] for b in batch]
    return input_ids, attention_mask, label, indexs #, token_type_ids

train_dataset = MyDataSet(train_list)
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle = True,
    num_workers=data_workers,
    collate_fn=the_collate_fn,
)

dev_dataset = MyDataSet(dev_list)
dev_data_loader = torch.utils.data.DataLoader(
    dev_dataset,
    batch_size=train_batch_size,
    shuffle = False,
    num_workers=data_workers,
    collate_fn=the_collate_fn,
)

# 计算 Accuracy

In [7]:
def get_score():
    y_true = []
    y_pred = []
    for step, batch in enumerate(tqdm(dev_data_loader)):
        model.eval()
        with torch.no_grad():
            input_ids, attention_mask = (b.to(device) for b in batch[:2])
        y_true += batch[2].numpy().tolist()
        logist = model(input_ids, attention_mask)[0]
        result = torch.argmax(logist, 1).cpu().numpy().tolist()
        y_pred += result
    correct = 0
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            correct += 1
    accuracy = correct / len(y_pred)
    
    return accuracy

# 定义模型

In [8]:
model = BertForSequenceClassification.from_pretrained('bert-base-chinese')
model.to(device)

t_total = len(train_data_loader) // gradient_accumulation_steps * max_train_epochs + 1
num_warmup_steps = int(warmup_proportion * t_total)
print('warmup steps : %d' % num_warmup_steps)
no_decay = ['bias', 'LayerNorm.weight'] # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [
    {'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay': weight_decay},
    {'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

warmup steps : 46


In [9]:
def print_test(title):
    r = tokenizer([title])
    input_ids = torch.LongTensor(r['input_ids']).to(device)
    attention_mask = torch.LongTensor(r['attention_mask']).to(device)
    logist = model(input_ids, attention_mask)[0]
    result = torch.argmax(logist, 1).cpu().numpy().tolist()[0]
    result = ['考研考博', '招聘信息'][result]
    print(title, result)
def print_cases():
    print_test('考研心得')
    print_test('北大实验室博士')
    print_test('考外校博士')
    print_test('北大实验室招博士')
    print_test('工作or考研?')
    print_test('急求自然语言处理工程师')
    print_test('校招offer比较')

# 训练模型

In [10]:
for epoch in range(max_train_epochs):
    b_time = time.time()
    model.train()
    for step, batch in enumerate(tqdm(train_data_loader)):
        input_ids, attention_mask, label = (b.to(device) for b in batch[:-1])
        loss = model(input_ids, attention_mask, labels=label)
        loss = loss[0]
        loss.backward()
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
    print('Epoch = %d Epoch Mean Loss %.4f Time %.2f min' % (epoch+1, loss.item(), (time.time() - b_time)/60))
    print(get_score())
    print_cases()

100%|██████████| 311/311 [00:41<00:00,  7.52it/s]
  0%|          | 0/134 [00:00<?, ?it/s]

Epoch = 1 Epoch Mean Loss 0.0391 Time 0.69 min


100%|██████████| 134/134 [00:04<00:00, 26.92it/s]
  0%|          | 0/311 [00:00<?, ?it/s]

0.9901547116736991
考研心得 考研考博
北大实验室博士 考研考博
考外校博士 考研考博
北大实验室招博士 招聘信息
工作or考研? 考研考博
急求自然语言处理工程师 招聘信息
校招offer比较 招聘信息


100%|██████████| 311/311 [00:41<00:00,  7.55it/s]
  0%|          | 0/134 [00:00<?, ?it/s]

Epoch = 2 Epoch Mean Loss 0.0194 Time 0.69 min


100%|██████████| 134/134 [00:05<00:00, 26.79it/s]
  0%|          | 0/311 [00:00<?, ?it/s]

0.9971870604781997
考研心得 考研考博
北大实验室博士 考研考博
考外校博士 考研考博
北大实验室招博士 招聘信息
工作or考研? 考研考博
急求自然语言处理工程师 招聘信息
校招offer比较 招聘信息


100%|██████████| 311/311 [00:41<00:00,  7.55it/s]
  0%|          | 0/134 [00:00<?, ?it/s]

Epoch = 3 Epoch Mean Loss 0.0400 Time 0.69 min


100%|██████████| 134/134 [00:05<00:00, 26.79it/s]
  0%|          | 0/311 [00:00<?, ?it/s]

0.9981247069854665
考研心得 考研考博
北大实验室博士 考研考博
考外校博士 考研考博
北大实验室招博士 招聘信息
工作or考研? 考研考博
急求自然语言处理工程师 招聘信息
校招offer比较 招聘信息


100%|██████████| 311/311 [00:41<00:00,  7.52it/s]
  0%|          | 0/134 [00:00<?, ?it/s]

Epoch = 4 Epoch Mean Loss 0.0089 Time 0.69 min


100%|██████████| 134/134 [00:05<00:00, 26.74it/s]
  0%|          | 0/311 [00:00<?, ?it/s]

0.9985935302390999
考研心得 考研考博
北大实验室博士 考研考博
考外校博士 考研考博
北大实验室招博士 招聘信息
工作or考研? 招聘信息
急求自然语言处理工程师 招聘信息
校招offer比较 招聘信息


100%|██████████| 311/311 [00:41<00:00,  7.55it/s]
  0%|          | 0/134 [00:00<?, ?it/s]

Epoch = 5 Epoch Mean Loss 0.0051 Time 0.69 min


100%|██████████| 134/134 [00:05<00:00, 26.79it/s]
  0%|          | 0/311 [00:00<?, ?it/s]

0.9985935302390999
考研心得 考研考博
北大实验室博士 考研考博
考外校博士 考研考博
北大实验室招博士 招聘信息
工作or考研? 招聘信息
急求自然语言处理工程师 招聘信息
校招offer比较 招聘信息


100%|██████████| 311/311 [00:41<00:00,  7.54it/s]
  0%|          | 0/134 [00:00<?, ?it/s]

Epoch = 6 Epoch Mean Loss 0.0085 Time 0.69 min


100%|██████████| 134/134 [00:05<00:00, 26.73it/s]

0.9985935302390999
考研心得 考研考博
北大实验室博士 考研考博
考外校博士 考研考博
北大实验室招博士 招聘信息
工作or考研? 招聘信息
急求自然语言处理工程师 招聘信息
校招offer比较 招聘信息



