In [None]:
# 定义两个list分别存放两个板块的帖子数据
import jieba
def _load_titles(path):
    """Read one title per line from ``path`` and tokenize each with jieba.

    Args:
        path: Path of a UTF-8 text file containing one title per line.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        tokens produced by ``jieba.cut`` for that line.

    Lines that are empty after stripping surrounding whitespace are skipped:
    they would tokenize to an empty list, and an empty sequence cannot be fed
    through the RNN step loop.
    """
    titles = []
    with open(path, encoding='utf8') as f:
        for line in f:  # iterate over the file line by line
            text = line.strip()  # drop the trailing newline and surrounding whitespace
            if text:
                titles.append(list(jieba.cut(text)))
    return titles


# One list of tokenized titles per board.
academy_titles = _load_titles('academy_titles.txt')
job_titles = _load_titles('job_titles.txt')

In [None]:
# Peek at one loaded entry (a list of jieba tokens) to sanity-check the loading step.
academy_titles[2]

In [None]:
# Vocabulary: every distinct token that occurs in either board's titles.
word_set = {
    token
    for corpus in (academy_titles, job_titles)
    for tokens in corpus
    for token in tokens
}
print(len(word_set))

In [None]:
# char_set = set()
# for title in academy_titles:
#     for ch in title:
#         char_set.add(ch)
# for title in job_titles:
#     for ch in title:
#         char_set.add(ch)
# print(len(char_set))

In [None]:
# import json
# with open('word_list', 'w') as f:
#     json.dump(word_list, f)


In [None]:
import torch
# Sort the vocabulary so that word -> index assignments are the same on every
# kernel run (iteration order of a set of strings is not stable across runs).
word_list = sorted(word_set)
n_chars = len(word_list) + 1  # vocabulary size plus one trailing slot for UNK
# Reverse lookup table: constant-time per token instead of a linear list scan.
word_to_index = {word: idx for idx, word in enumerate(word_list)}


def title_to_tensor(title):
    """Encode a title as a 1-D ``torch.long`` tensor of vocabulary indices.

    Args:
        title: Either a sequence of tokens (as produced by ``jieba.cut``) or a
            raw string. A raw string is tokenized with jieba first, because the
            vocabulary is built from jieba tokens; iterating a string directly
            would look up single characters instead.

    Returns:
        A tensor of length ``len(tokens)``. Tokens that are not in the
        vocabulary map to the UNK index ``n_chars - 1``.
    """
    if isinstance(title, str):
        title = list(jieba.cut(title.strip()))
    unk_index = n_chars - 1
    indices = [word_to_index.get(word, unk_index) for word in title]
    return torch.tensor(indices, dtype=torch.long)

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-step recurrent classifier over word indices.

    Each call consumes one word index plus the previous hidden state. The word
    embedding and the hidden state are concatenated, and two linear layers map
    that vector to the next hidden state and to log-probabilities over the
    output classes.
    """

    def __init__(self, word_count, embedding_size, hidden_size, output_size):
        """Build the embedding table and the two linear projections.

        Args:
            word_count: Number of rows in the embedding table.
            embedding_size: Width of each word vector.
            hidden_size: Width of the hidden state.
            output_size: Number of output classes.
        """
        super().__init__()

        self.hidden_size = hidden_size
        combined_size = embedding_size + hidden_size
        self.embedding = nn.Embedding(word_count, embedding_size)
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden):
        """Run one time step.

        Returns:
            ``(output, hidden)``: log-probabilities for this step and the next
            hidden state, both computed from the same concatenated input.
        """
        step_input = torch.cat([self.embedding(input_tensor), hidden], dim=1)
        next_hidden = self.i2h(step_input)
        log_probs = self.softmax(self.i2o(step_input))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1, self.hidden_size)

def run_rnn(rnn, input_tensor):
    """Feed a whole index sequence through ``rnn`` one element at a time.

    Args:
        rnn: A module providing ``initHidden()`` and accepting
            ``(step_input, hidden)`` when called.
        input_tensor: 1-D tensor of word indices; must not be empty.

    Returns:
        The output produced at the last time step.

    Raises:
        ValueError: If ``input_tensor`` has no elements (there would be no
            last-step output to return).
    """
    seq_len = input_tensor.size()[0]
    if seq_len == 0:
        raise ValueError("run_rnn() needs at least one token; got an empty input_tensor")
    hidden = rnn.initHidden()
    for i in range(seq_len):
        # Call the module itself (not .forward) so registered hooks also run.
        output, hidden = rnn(input_tensor[i].unsqueeze(dim=0), hidden)
    return output

def train(rnn, criterion, input_tensor, category_tensor, lr=None):
    """Run one plain-SGD training step on a single example.

    Args:
        rnn: The model to update in place.
        criterion: Loss callable taking ``(output, category_tensor)``.
        input_tensor: Encoded title passed to ``run_rnn``.
        category_tensor: Target label tensor for the loss.
        lr: Step size. When omitted, the notebook-level ``learning_rate``
            variable is used, as before.

    Returns:
        ``(output, loss_value)``: the model output for this example and the
        loss as a Python float.
    """
    if lr is None:
        lr = learning_rate  # notebook-level default set in the training cell

    rnn.zero_grad()
    output = run_rnn(rnn, input_tensor)
    loss = criterion(output, category_tensor)
    loss.backward()

    # Apply the gradient step to the parameters.
    with torch.no_grad():
        for p in rnn.parameters():
            # A parameter that did not contribute to this loss has no gradient
            # (e.g. the hidden-state projection when the title has a single
            # token, since the final hidden state is never used).
            if p.grad is None:
                continue
            p.add_(p.grad, alpha=-lr)

    return output, loss.item()

In [None]:
# embedding_size = 100
# n_hidden = 128
# n_categories = 2
# rnn = RNN(n_chars, embedding_size, n_hidden, n_categories)
#
# input_tensor = title_to_tensor(academy_titles[0])
# print('input_tensor:\n', input_tensor)
#
# hidden = rnn.initHidden()
# output, hidden = rnn.forward(input_tensor[0].unsqueeze(dim=0), hidden)
# print('output:\n', output)
# print('hidden:\n', hidden)
# print('size of hidden:\n', hidden.size())

In [None]:
import random

all_data = []
categories = ["考研考博", "招聘信息"]

# Label 0 is assigned to academy_titles and label 1 to job_titles, matching
# the positions of the names in `categories`.
for label, titles in enumerate((academy_titles, job_titles)):
    for tokens in titles:
        if not tokens:
            # An empty token list encodes to an empty tensor, which the RNN
            # step loop cannot process.
            continue
        all_data.append((title_to_tensor(tokens), torch.tensor([label], dtype=torch.long)))

# Shuffle with a dedicated, fixed-seed generator so the train/test split is
# the same on every run.
random.Random(42).shuffle(all_data)
data_len = len(all_data)
split_ratio = 0.7
split_index = int(data_len * split_ratio)
train_data = all_data[:split_index]
test_data = all_data[split_index:]
print("Train data size: ", len(train_data))
print("Test data size: ", len(test_data))

In [None]:
def evaluate(rnn, input_tensor):
    """Return the model output for ``input_tensor`` without tracking gradients.

    Args:
        rnn: The model to run.
        input_tensor: Encoded title passed to ``run_rnn``.

    Returns:
        Whatever ``run_rnn`` returns for this input.
    """
    # run_rnn creates its own initial hidden state, so none is prepared here.
    with torch.no_grad():
        return run_rnn(rnn, input_tensor)

In [None]:
from tqdm import tqdm
# Hyper-parameters.
epoch = 1  # number of passes over train_data
embedding_size = 200
n_hidden = 10
n_categories = 2
learning_rate = 0.005  # read by train() as its default step size
plot_every = 100  # a loss point is recorded whenever the sample index is a multiple of this

torch.manual_seed(42)  # make the random weight initialization repeatable
rnn = RNN(n_chars, embedding_size, n_hidden, n_categories)
criterion = nn.NLLLoss()

loss_sum = 0
samples_since_log = 0
all_losses = []
for e in range(epoch):
    for ind, (title_tensor, label) in enumerate(tqdm(train_data)):
        output, loss = train(rnn, criterion, title_tensor, label)
        loss_sum += loss
        samples_since_log += 1
        if ind % plot_every == 0:
            # Divide by the number of samples actually accumulated; the very
            # first point (and the first point of any later epoch) does not
            # cover exactly plot_every samples.
            all_losses.append(loss_sum / samples_since_log)
            loss_sum = 0
            samples_since_log = 0

    # Accuracy on the held-out split after each epoch.
    c = 0
    for title, category in tqdm(test_data):
        output = evaluate(rnn, title)
        topn, topi = output.topk(1)
        if topi.item() == category[0].item():
            c += 1
    print('accuracy', c / len(test_data))

In [None]:
# Re-run the test split, keeping the predicted and true labels side by side.
l1 = []  # predicted class index per test example
l2 = []  # true class index per test example
for title, category in tqdm(test_data):
    output = evaluate(rnn, title)
    topn, topi = output.topk(1)
    l1.append(topi.item())
    l2.append(category[0].item())
c = sum(1 for predicted, actual in zip(l1, l2) if predicted == actual)
print('accuracy', c / len(test_data))

In [None]:
# First 40 predicted labels, then the first 40 true labels, for a quick visual comparison.
for labels in (l1, l2):
    print(labels[:40])

In [None]:
# Sum of the predicted class indices; with two classes (0 and 1) this counts
# how many test examples were predicted as class 1.
sum(l1)

In [None]:
# Sum of the true class indices; labels are 0 or 1, so this counts the test
# examples whose true label is 1.
sum(l2)

In [None]:
# Test-set accuracy: count the examples whose top-scoring class equals the label.
c = sum(
    int(evaluate(rnn, title).topk(1)[1].item() == category[0].item())
    for title, category in tqdm(test_data)
)
print('accuracy', c / len(test_data))

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Loss curve; the first entry is skipped because it is recorded after a
# single training sample.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(all_losses[1:])

In [None]:
import matplotlib.pyplot as plt

# Labeled loss curve so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 7))
ax.set_title('Training loss')
ax.set_xlabel('Logging step')
ax.set_ylabel('Average Loss')
# Skip the first entry: it is recorded after a single training sample.
ax.plot(all_losses[1:]);

In [None]:
# Saves the whole module object rather than only its state_dict.
# NOTE(review): loading a whole-module pickle presumably needs the RNN class
# to be defined under the same name at load time — confirm against the
# PyTorch serialization docs.
# NOTE(review): the word -> index vocabulary (word_list) is not saved with the
# model, and the JSON dump of word_list earlier in the notebook is commented
# out — confirm how the vocabulary is restored before reusing this file in a
# new session.
torch.save(rnn, 'rnn_model.pkl')

In [None]:
def get_category(rnn, title):
    """Predict the board name for a title.

    Args:
        rnn: The model to run.
        title: A raw title string, or an already tokenized sequence of words.
            A string is tokenized with jieba first, because the model is
            trained on jieba tokens rather than single characters.

    Returns:
        The entry of ``categories`` at the index of the highest-scoring class.

    Raises:
        ValueError: If the title yields no tokens.
    """
    if isinstance(title, str):
        tokens = list(jieba.cut(title.strip()))
    else:
        tokens = list(title)
    if not tokens:
        raise ValueError("title contains no tokens to classify")

    input_tensor = title_to_tensor(tokens)
    with torch.no_grad():
        # run_rnn creates its own initial hidden state.
        output = run_rnn(rnn, input_tensor)
    topv, topi = output.topk(1)
    return categories[topi.item()]

In [None]:
# Tokenize the sample title with jieba before encoding it: the vocabulary is
# made of jieba tokens, so a raw string would be looked up character by character.
input_tensor = title_to_tensor(list(jieba.cut("北大实验室招硕博连读保研学生")))

In [None]:
# Show the index tensor produced for the sample title.
input_tensor

In [None]:
# Run the model on the sample tensor without gradient tracking; `o` is the
# LogSoftmax output of the last time step.
o = evaluate(rnn, input_tensor)

In [None]:
# Largest value in the output and the index of the class it belongs to.
o.topk(1)

In [None]:
# Board names; the list position of each name is the class index used as its label.
categories

In [None]:
def get_category(model_or_title=None, title=None):
    """Predict the board name for a title.

    Both call forms used in this notebook are accepted:

    * ``get_category(title)`` - classify with the notebook-level ``rnn``.
    * ``get_category(model, title)`` - classify with an explicit model.

    Args:
        model_or_title: The model when ``title`` is also given; otherwise the
            title itself.
        title: A raw title string or an already tokenized sequence of words.
            A string is tokenized with jieba first, because the model is
            trained on jieba tokens rather than single characters.

    Returns:
        The entry of ``categories`` at the index of the highest-scoring class.

    Raises:
        TypeError: If no title is supplied.
        ValueError: If the title yields no tokens.
    """
    if title is None:
        # One-argument form: the only positional argument is the title.
        model, title = rnn, model_or_title
    elif model_or_title is None:
        model = rnn
    else:
        model = model_or_title
    if title is None:
        raise TypeError("get_category() missing required argument: 'title'")

    if isinstance(title, str):
        tokens = list(jieba.cut(title.strip()))
    else:
        tokens = list(title)
    if not tokens:
        raise ValueError("title contains no tokens to classify")

    output = evaluate(model, title_to_tensor(tokens))
    topn, topi = output.topk(1)
    return categories[topi.item()]

In [None]:
# def get_category(title):
#     title = title_to_tensor(title)
#     output = evaluate(rnn, title)
#     topn, topi = output.topk(1)
#     return categories[topi.item()]
# while True:
#     title = input()
#     if not title:
#         break
#     print(categories)

In [None]:
# Classify one sample title and print it next to the predicted board name.
sample_title = "【校招】今日头条后端开发工程师"
print(sample_title, get_category(rnn, sample_title))

In [None]:
# Classify one sample title and print it next to the predicted board name.
sample_title = "毕业找房子"
print(sample_title, get_category(rnn, sample_title))

In [None]:

# Classify a handful of sample titles; each is printed once next to its
# predicted board name.
sample_titles = [
    "学校附近的公寓",
    "考博经验帖",
    "2021年秋季出国交流",
    "考研学校选择，纠结，求师哥师姐指导",
]
for sample_title in sample_titles:
    print(sample_title, get_category(rnn, sample_title))