In [1]:
import pandas as pd
import numpy as np
# import nltk
# from nltk import word_tokenize, pos_tag
# from nltk.stem import  WordNetLemmatizer
# from nltk.corpus import stopwords
# from gensim.models import Word2Vec

from tqdm import tqdm
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report

import torch
from torch.utils import data
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import jieba.posseg as pseg
import jieba

from gensim.models import word2vec, Word2Vec
from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('dataset_user_job_all_1.csv')

In [3]:
stop_words = [line.strip() for line in open('chinese_stopword.txt',encoding='UTF-8').readlines()]

def pretreatment(comment):

    token_words = jieba.lcut(comment)
    token_words = [w for w in token_words if w not in stop_words]
    token_words =  pseg.cut(' '.join(token_words))
    cleaned_word = []
    for word, tag in token_words:
        if word.isdigit():
            continue
        else:
            cleaned_word.append(word)
    return cleaned_word

In [4]:
segment_job =[]
# job_set=pd.read_csv('job_information.csv')
for content in tqdm(dataset["岗位描述"].values):
#     segment.append(pretreatment(content))
    segment_job.append(list(jieba.cut(content)))
dataset["text_job"] = segment_job
# job_set.to_csv("job_set_segment.csv",index=False)

  0%|          | 0/138238 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.467 seconds.
Prefix dict has been built successfully.
100%|██████████| 138238/138238 [02:20<00:00, 986.82it/s] 


In [5]:
segment_user = []
# job_set=pd.read_csv('job_information.csv')
for content in tqdm(dataset["resume"].values):
#     segment.append(pretreatment(content))
    segment_user.append(list(jieba.cut(content)))
dataset["text_user"] = segment_user
# user_set.to_csv("user_set_segment.csv",index=False)

100%|██████████| 138238/138238 [12:24<00:00, 185.69it/s]


In [25]:
def train_word2vec(x):
    '''
    param: x is a list contain all the words
    return: the trained model
    '''

    model = word2vec.Word2Vec(x, size=200, window=5, min_count=2, workers=8,
                             iter=10, sg=1)
    return model

In [7]:
# w2v_model_1 = train_word2vec(dataset.text_job.values)
# w2v_model_1.save('./word2vec1.model')
w2v_model_1 = Word2Vec.load('./word2vec1.model')

In [8]:
# w2v_model_2 = train_word2vec(dataset.text_user.values)
# w2v_model_2.save('./word2vec2.model')
w2v_model_2 = Word2Vec.load('./word2vec2.model')

In [9]:
class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        '''
        param: sentences: the list of corpus
               sen_len: the max length of each sentence
               w2v_path: the path storing word emnbedding model
        '''

        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def get_w2v_model(self):
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    def add_embedding(self, word):
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def make_embedding(self, load=True):
        print("Get embedding ...")
        if load:
            print("loading word2vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        for i, word in enumerate(self.embedding.wv.vocab):
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding("")
        self.add_embedding("")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    def pad_sentence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx[''])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        '''
        change words in sentences into idx in embedding_matrix
        '''
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx[''])
            sentence_idx = self.pad_sentence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)

    def labels_to_tensor(self, y):
        return torch.LongTensor(y)

In [10]:
# TextCNN
class TextCNN(nn.Module):
    def __init__(self, channels, kernel_size, pool_size, dim, method='max'):
        super(TextCNN, self).__init__()
        self.net1 = nn.Sequential(
            nn.Conv2d(1, channels, kernel_size[0]),
            nn.BatchNorm2d(channels), # 其中的参数是通道数
            nn.ReLU(),
            nn.MaxPool2d(pool_size)
        )
        self.net2 = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size[1]),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.AdaptiveMaxPool2d((1, dim)) # （1，dim）是指输出大小
        )
        if method == 'max':
            self.pool = nn.AdaptiveMaxPool2d((1, dim))
        elif method == 'mean':
            self.pool = nn.AdaptiveAvgPool2d((1, dim))
        else:
            raise ValueError('method {} not exist'.format(method))

    def forward(self, x):
        x = self.net1(x)
        x = self.net2(x).squeeze(2)
        x = self.pool(x).squeeze(1)
        return x

In [11]:
# MLP
class MLP(nn.Module):
    def __init__(self, input_size, output_size, dropout):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_size, input_size),
            nn.ReLU(),
            nn.Linear(input_size, output_size),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.net(x)
        return x

In [12]:
# PJFNN
class PJFNN(nn.Module):
    def __init__(self, embedding1,embedding2, channels=1, dropout=0.5, fix_embedding=True):
        super(PJFNN, self).__init__()
        self.dim1 = embedding1.size(1)
#         self.user_dim = input_dim
        self.dim2 = embedding2.size(1)
        self.channels = channels
        # job  字典中有(行数)个词，词向量维度为(列数)
        self.embedding1 = nn.Embedding(embedding1.size(0), embedding1.size(1))
        self.embedding1.weight = nn.Parameter(embedding1)
        self.embedding1.weight.requires_grad = False if fix_embedding else True
        # user
        self.embedding2 = nn.Embedding(embedding2.size(0), embedding2.size(1))
        self.embedding2.weight = nn.Parameter(embedding2)
        self.embedding2.weight.requires_grad = False if fix_embedding else True

        self.linear_transform = nn.Linear(200, 64)

        self.geek_layer = TextCNN(
            channels=self.channels,
            kernel_size=[(5, 1), (5, 1)],
            pool_size=(2, 1),
            dim=200,
            method='max'
        )

        self.job_layer = TextCNN(
            channels=self.channels,
            kernel_size=[(5, 1), (5, 1)],
            pool_size=(2, 1),
            dim=200,
            method='mean'
        )

        self.mlp = MLP(
            input_size=128,
            output_size=1,
            dropout=dropout
        )


    def forward(self, job, user):
        job = self.embedding1(job.long()) #.long()
        job = job.unsqueeze(1)
        job = self.job_layer(job)

        user = self.embedding2(user.long()) #.long()
        user = user.unsqueeze(1)
        user = self.geek_layer(user)

        # MLP层
#         user = self.user_layer(user)
        # 全连接层,变成64维
        user = self.linear_transform(user)
        job = self.linear_transform(job)
        # tensor进行拼接
        x = torch.cat((user,job),dim=1)
        # mlp层
        x = self.mlp(x).squeeze(1)
        return x

In [13]:
'''
制作dataset
'''
# 建立了dataset所需要的'__init__', '__getitem__', '__len__'
# 好让dataloader能使用
class JobUserDataset(data.Dataset):
    def __init__(self, job, user, label):
        self.job = job
        self.user = user
        self.label = label

    def __getitem__(self, idx):
        if self.label is None:
            return self.job[idx], self.user[idx]
        return self.job[idx], self.user[idx], self.label[idx]

    def __len__(self):
        return len(self.job)

In [15]:
dataset['text_job']

0         [岗位职责, ：, \n, ·,  , 业务学习, &, 任务, 处理, ：,  , 严格,...
1         [在, 海轮, 上, 工作, 的, 人员, 统称, 海员, 。,  , 海员, 分, 两大类...
2         [岗位职责, ：, \n, 1, .,  , 响应, 用户, IT, 相关, 问题, 请求,...
3         [帮助, 客户, 公司, 进行, 信息化, 咨询, 。, 具体, 是, 通过, SAP,  ...
4                                              [负责, 起草, 文书]
                                ...                        
138233    [岗位职责, ：, \n, 1, 、, 设备, 安装, 、, 调试, 和, 量产, ；, \...
138234    [作为, 零售, 经理, 储备, 梯队, ，, 负责, 店铺, 运营, 、, 人员, 、, ...
138235    [岗位职责, ：, \n, \n, 1, 、, 根据, 基金, 经理, 的, 相关, 要求,...
138236    [（, 1, ）, 从事, 水利水电, 、, 抽水, 蓄能, 、, 地质灾害, 、, 市政,...
138237    [岗位职责, ：, \n, 1, 、, 负责, 室内, 舒适性, 研究, \n, 2, 、,...
Name: text_job, Length: 138238, dtype: object

In [14]:
text = []
for i in dataset['text_job']:
#     print(i)
    temp = str(i[1:-1]).split(',')
    # 删除当前字符串的首尾的空格和换行符
    text.append([t.strip()[1:-1] for t in temp])
dataset['text_job'] = text

In [15]:
text = []
for i in dataset['text_user']:
    temp = str(i[1:-1]).split(',')
    # 删除当前字符串的首尾的空格和换行符
    text.append([t.strip()[1:-1] for t in temp])
dataset['text_user'] = text

In [16]:
# train_x_t = train_dataset['text_job']
# train_user_t = train_dataset['text_user']
# train_y_t = train_dataset['label']
#
# val_x_t = test_dataset['text_job']
# val_user_t = test_dataset['text_user']
# val_y_t = test_dataset['label']
#
# test_x_t = val_dataset['text_job']
# test_user_t = val_dataset['text_user']
# test_y_t = val_dataset['label']

x_t = dataset['text_job']
user_t = dataset['text_user']
y_t = dataset['label']

In [17]:
sen_len_user = 50
preprocess_user = Preprocess(x_t, sen_len_user, w2v_path="./word2vec1.model")
embedding1 = preprocess_user.make_embedding(load=True)
x = preprocess_user.sentence_word2idx()

sen_len = 200
preprocess = Preprocess(user_t, sen_len, w2v_path="./word2vec2.model")
embedding2 = preprocess.make_embedding(load=True)
user = preprocess.sentence_word2idx()

y = preprocess_user.labels_to_tensor(y_t)

Get embedding ...
loading word2vec model ...


  self.embedding_matrix.append(self.embedding[word])
  self.embedding_matrix = torch.tensor(self.embedding_matrix)


total words: 28173
Get embedding ...
loading word2vec model ...


  self.embedding_matrix.append(self.embedding[word])


total words: 130209


In [18]:
def train_test_val_split(x1,x2,y, ratio_train, ratio_test, ratio_val):
    x1_train, x1_middle,x2_train, x2_middle,y_train, y_middle = train_test_split(x1,x2,y, test_size=1-ratio_train, random_state=20)
    ratio = ratio_val/(ratio_test + ratio_val)
    x1_test, x1_validation,x2_test, x2_validation,y_test, y_validation = train_test_split(x1_middle,x2_middle,y_middle, test_size=ratio, random_state=20)
    return x1_train, x1_test, x1_validation,x2_train, x2_test, x2_validation,y_train, y_test, y_validation

In [None]:
# train_dataset, test_dataset, val_dataset = train_test_val_split(dataset, 0.6, 0.2, 0.2)

In [19]:
train_x, test_x,val_x,train_user,test_user,val_user,train_y,test_y,val_y=train_test_val_split(x,user,y, 0.6, 0.2, 0.2)

In [None]:
# sen_len_user = 50
# preprocess_user = Preprocess(val_x_t, sen_len_user, w2v_path="./word2vec1.model")
# embedding_user = preprocess_user.make_embedding(load=True)
# val_x = preprocess_user.sentence_word2idx()
#
# sen_len = 200
# preprocess = Preprocess(val_user_t, sen_len, w2v_path="./word2vec2.model")
# embedding2 = preprocess.make_embedding(load=True)
# val_user = preprocess.sentence_word2idx()
#
# val_y = preprocess_user.labels_to_tensor(val_y_t)

In [None]:
# sen_len_user = 50
# preprocess_user = Preprocess(test_x_t, sen_len_user, w2v_path="./word2vec1.model")
# embedding1 = preprocess_user.make_embedding(load=True)
# test_x = preprocess_user.sentence_word2idx()
#
# sen_len = 200
# preprocess = Preprocess(test_user_t, sen_len, w2v_path="./word2vec2.model")
# embedding2 = preprocess.make_embedding(load=True)
# test_user = preprocess.sentence_word2idx()
#
# test_y = preprocess_user.labels_to_tensor(test_y_t)

In [20]:
# dataset构建
train_dataset = JobUserDataset(train_x, train_user, train_y)
val_dataset = JobUserDataset(val_x, val_user, val_y)
test_dataset = JobUserDataset(test_x, test_user, test_y)

In [21]:
batch_size = 32 # 一次训练所选取的样本数
# dataset导入
train_loader = DataLoader(dataset= train_dataset, batch_size = batch_size, shuffle = False)
val_loader = DataLoader(dataset = val_dataset, batch_size = batch_size, shuffle = False)
test_loader =DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle = False)

In [22]:
def training(n_epoch, lr, train, valid, model, device, model_name, model_dir="./"):
    # summary model parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("\nstart training, total parameter:{}, trainable:{}\n".format(total, trainable))
    model.cuda()
    model.train()
    criterion = nn.BCELoss()
    t_batch = len(train)
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr) #, weight_decay=1e-4
    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epoch, eta_min=0, last_epoch=-1)
    # total_loss, total_acc = 0, 0
    best_acc, best_precision, best_recall, best_f1, best_auc = 0, 0, 0, 0, 0

    for epoch in range(n_epoch):
        start_time = time.time()
        total_loss, total_acc = 0, 0
        pred_label = []
        y_label = []
        # training
        for i, (jobs, users, labels) in enumerate(train):

            # 放GPU上运行
            jobs = jobs.to(torch.float32)
            jobs = jobs.to(device)

            users = users.to(torch.float32)
            users = users.to(device)

            # entities = entities.to(torch.float32)
            # entities = entities.to(device)

            labels = labels.to(torch.float32)
            labels = labels.to(device)

            # TODO 是否考虑模型用多个优化器？
            optimizer.zero_grad() # 将所有模型参数的梯度置为0
            # model.zero_grad() # 除所有可训练的torch.Tensor的梯度
            outputs = model(jobs, users)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pred_label.extend([0 if i<0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
            y_label.extend(list(labels.cpu().detach().numpy()))
        train_losses = total_loss/t_batch
        train_acc = accuracy_score(y_label, pred_label)
        train_precision = precision_score(y_label, pred_label)
        train_recall = recall_score(y_label, pred_label)
        train_auc = roc_auc_score(y_label, pred_label)
        train_f1 = f1_score(y_label, pred_label)
        print('[ Epoch{}: {}/{}] '.format(epoch+1, i+1, t_batch))
        print('\nTrain | Loss:{:.5f} ACC:{:.5f} Precision:{:.5f} Recall:{:.5f} AUC:{:.5f} F1:{:.5f} Time:{:.6f}'.format(train_losses,train_acc,train_precision, train_recall,train_auc,train_f1, time.time()-start_time))

        # evaluation
        model.eval()
        with torch.no_grad():
            # pred_score = []
            pred_label = []
            y_label = []
            total_loss, total_acc = 0, 0
            for i, (jobs, users, labels) in enumerate(valid):
                # 放GPU上运行
                jobs = jobs.to(torch.float32)
                jobs = jobs.to(device)

                users = users.to(torch.float32)
                users = users.to(device)

                # entities = entities.to(torch.float32)
                # entities = entities.to(device)

                labels = labels.to(torch.float32)
                labels = labels.to(device)

                outputs = model(jobs, users)

                loss = criterion(outputs, labels)
                total_loss += loss.item()
                '''
                存一下预测score
                '''
                # pred_score.extend([j for j in list(outputs.cpu().detach().numpy())])
                pred_label.extend([0 if i < 0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
                y_label.extend(list(labels.cpu().detach().numpy()))
            # print('\nVal | Loss:{:.5f} Time:{:.6f}'.format(total_loss/v_batch, time.time()-start_time))
            val_losses = total_loss/v_batch
            val_acc = accuracy_score(y_label, pred_label)
            val_precision = precision_score(y_label, pred_label)
            val_recall = recall_score(y_label, pred_label)
            val_auc = roc_auc_score(y_label, pred_label)
            val_f1 = f1_score(y_label, pred_label)
            print('\nVal | Loss:{:.5f} ACC:{:.5f} Precision:{:.5f} Recall:{:.5f} AUC:{:.5f} F1:{:.5f} Time:{:.6f}'.format(val_losses,val_acc,val_precision, val_recall,val_auc,val_f1, time.time()-start_time))
            if val_acc > best_acc:
                best_acc = val_acc
                best_precision = val_precision
                best_recall = val_recall
                best_f1 = val_f1
                best_auc = val_auc
                torch.save(model, "{}/{}.model".format(model_dir, model_name))
                print('save model with acc: {:.3f}, recall: {:.3f}, auc: {:.3f}'.format(best_acc,best_recall,best_auc))
        print('------------------------------------------------------')
        # lr_scheduler.step()
        # 将model的模式设为train，这样optimizer就可以更新model的參數（因為刚刚转为eval模式）
        model.train()
    return best_acc, best_precision, best_recall, best_f1, best_auc

In [23]:
fix_embedding = False
# input_dim = train_dataset[0][1].shape[0]
model = PJFNN(embedding1, embedding2, dropout=0.7, channels=32, fix_embedding=fix_embedding)
epoch = 20
lr = 0.0001
model_dir = './'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'PJFNN'

In [24]:
best_acc, best_precision, best_recall, best_f1, best_auc = training(epoch, lr, train_loader, val_loader, model, device, model_name, model_dir)

# 输出结果（验证集）
print('best_acc',best_acc)
print('best_precision',best_precision)
print('best_recall',best_precision)
print('best_f1',best_f1)
print('best_auc',best_auc)


start training, total parameter:31716849, trainable:31716849

[ Epoch1: 2592/2592] 

Train | Loss:0.66579 ACC:0.57631 Precision:0.58125 Recall:0.55026 AUC:0.57634 F1:0.56533 Time:43.162059

Val | Loss:0.60337 ACC:0.66026 Precision:0.68374 Recall:0.58670 AUC:0.65971 F1:0.63152 Time:46.856344
save model with acc: 0.660, recall: 0.587, auc: 0.660
------------------------------------------------------
[ Epoch2: 2592/2592] 

Train | Loss:0.59969 ACC:0.66753 Precision:0.68806 Recall:0.61470 AUC:0.66760 F1:0.64931 Time:43.385087

Val | Loss:0.56625 ACC:0.69466 Precision:0.72480 Recall:0.62009 AUC:0.69410 F1:0.66837 Time:47.012575
save model with acc: 0.695, recall: 0.620, auc: 0.694
------------------------------------------------------
[ Epoch3: 2592/2592] 

Train | Loss:0.56488 ACC:0.70387 Precision:0.72231 Recall:0.66377 AUC:0.70392 F1:0.69180 Time:43.724545

Val | Loss:0.54505 ACC:0.71908 Precision:0.72749 Recall:0.69371 AUC:0.71888 F1:0.71020 Time:47.402593
save model with acc: 0.719, r

In [28]:
def testing(model, test_loader):
    pred_label = []
    y_label = []
    model.eval()
    with torch.no_grad():
        for i, (jobs, users, labels) in enumerate(test_loader):
            # 放GPU上运行
            jobs = jobs.to(torch.float32)
            jobs = jobs.to(device)

            users = users.to(torch.float32)
            users = users.to(device)

            # entities = entities.to(torch.float32)
            # entities = entities.to(device)

            labels = labels.to(torch.float32)
            labels = labels.to(device)

            outputs = model(jobs, users)

            pred_label.extend([0 if i < 0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
            y_label.extend(list(labels.cpu().detach().numpy()))

        test_acc = accuracy_score(y_label, pred_label)
        test_precision = precision_score(y_label, pred_label)
        test_recall = recall_score(y_label, pred_label)
        test_auc = roc_auc_score(y_label, pred_label)
        test_f1 = f1_score(y_label, pred_label)
    return test_acc, test_auc, test_precision, test_recall, test_f1

In [29]:
# 输出结果(测试集)
test_acc, test_auc, test_precision, test_recall, test_f1 = testing(
    torch.load('/root/PJFNN.model'), test_loader)
print('test_acc', test_acc)
print('test_precision', test_precision)
print('test_recall', test_precision)
print('test_f1', test_f1)
print('test_auc', test_auc)

test_acc 0.7421513310185185
test_precision 0.7405768132495717
test_recall 0.7405768132495717
test_f1 0.7442694694551064
test_auc 0.7421322330311165
