In [1]:
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import numpy as np
from tqdm import tqdm
import time
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report

import torch
from torch.utils import data
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import jieba.posseg as pseg
import jieba

from gensim.models import word2vec, Word2Vec
from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv('dataset_user_job_all_1.csv', dtype = {'UserID': 'str', 'JobID': 'str','label': 'str'})

In [3]:
segment_job =[]
# job_set=pd.read_csv('job_information.csv')
for content in tqdm(dataset["岗位描述"].values):
#     segment.append(pretreatment(content))
    segment_job.append(list(jieba.cut(content)))
dataset["text_job"] = segment_job
# job_set.to_csv("job_set_segment.csv",index=False)

  0%|          | 0/138238 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.808 seconds.
Prefix dict has been built successfully.
100%|██████████| 138238/138238 [02:24<00:00, 956.71it/s] 


In [4]:
segment_user = []
# job_set=pd.read_csv('job_information.csv')
for content in tqdm(dataset["resume"].values):
#     segment.append(pretreatment(content))
    segment_user.append(list(jieba.cut(content)))
dataset["text_user"] = segment_user
# user_set.to_csv("user_set_segment.csv",index=False)

100%|██████████| 138238/138238 [12:47<00:00, 180.22it/s]


In [5]:
# 历史投递数据
dataset_pos = dataset[dataset.label == '1']

In [6]:
# 按照 userid 进行group
memory_list_u = {}
memory_num_u = {}
groups = dataset_pos.groupby("UserID")
for idx, group in tqdm(groups):
    # print(group.text_job)
    seq_len = len(group)
    memory_num_u[idx] = seq_len
    seq_list = []
    for text in group.text_job: #.unique().tolist()
        seq_list.append(text)
    memory_list_u[idx] = seq_list

100%|██████████| 9091/9091 [00:03<00:00, 2331.71it/s] 


In [7]:
# 按照 jobid 进行group
memory_list_j = {}
memory_num_j = {}
groups = dataset_pos.groupby("JobID")
for idx, group in tqdm(groups):
    seq_len = len(group)
    memory_num_j[idx] = seq_len
    seq_list = []
    for text in group.text_user: #.unique().tolist()
        seq_list.append(text)
    memory_list_j[idx] = seq_list

100%|██████████| 18685/18685 [00:03<00:00, 5566.15it/s] 


In [8]:
list1 = []
list2 = []
list3 = []
list4 = []

for i in range(len(dataset)):
    userid = dataset.loc[i,'UserID']
    jobid = dataset.loc[i, 'JobID']
    list1.append(memory_num_u[userid])
    list2.append(memory_list_u[userid])
    try:
        list3.append(memory_num_j[jobid])
        list4.append(memory_list_j[jobid])
    except:
        list3.append(0)
        list4.append([])

dataset['memory_num_u'] = list1
dataset['memory_list_u'] = list2
dataset['memory_num_j'] = list3
dataset['memory_list_j'] =list4

In [2]:
class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        '''
        param: sentences: the list of corpus
               sen_len: the max length of each sentence
               w2v_path: the path storing word emnbedding model
        '''

        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def get_w2v_model(self):
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    def add_embedding(self, word):
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def make_embedding(self, load=True):
        print("Get embedding ...")
        if load:
            print("loading word2vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        for i, word in enumerate(self.embedding.wv.vocab):
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding("")
        self.add_embedding("")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    def pad_sentence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx[''])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        '''
        change words in sentences into idx in embedding_matrix
        '''
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx[''])
            sentence_idx = self.pad_sentence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)

    def labels_to_tensor(self, y):
        return torch.LongTensor(y)

In [3]:
class Preprocess1():
    def __init__(self, sentences_list, sen_len, w2v_path="./w2v.model"):
        '''
        param: sentences: the list of corpus
               sen_len: the max length of each sentence
               w2v_path: the path storing word emnbedding model
        '''

        self.w2v_path = w2v_path
        self.sentences_list = sentences_list
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def get_w2v_model(self):
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    def add_embedding(self, word):
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def make_embedding(self, load=True):
        print("Get embedding ...")
        if load:
            print("loading word2vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        for i, word in enumerate(self.embedding.wv.vocab):
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding("")
        self.add_embedding("")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    def pad_sentence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx[''])
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self, sentences):
        '''
        change words in sentences into idx in embedding_matrix
        '''
        sentence_list = []

        # padding and cutting
        if len(sentences) < 10:
            for i in range(10-len(sentences)):
                sentences.append('')
        elif len(sentences) > 10:
            sentences = sentences[: 10]

        for i, sen in enumerate(sentences):
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx[''])
            sentence_idx = self.pad_sentence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)

    def sentencelist_2idx(self):
        return torch.stack([self.sentence_word2idx(sentence) for sentence in self.sentences_list], dim=0)

    def labels_to_tensor(self, y):
        return torch.LongTensor(y)

In [4]:
class JobUserDataset(data.Dataset):
    def __init__(self, job, job_m, job_len, user, user_m, user_len, label):
        self.job = job
        self.job_m = job_m
        self.job_len = job_len
        self.user = user
        self.user_m = user_m
        self.user_len = user_len
        self.label = label

    def __getitem__(self, idx):
        if self.label is None:
            return self.job[idx], self.job_m[idx], self.job_len[idx],self.user[idx], self.user_m[idx], self.user_len[idx]
        return self.job[idx], self.job_m[idx], self.job_len[idx],self.user[idx], self.user_m[idx], self.user_len[idx], self.label[idx]

    def __len__(self):
        return len(self.job)

In [12]:
x_t1 = dataset['text_job']
x_t2 = dataset['memory_num_u']
x_t3 = dataset['memory_list_u']

user_t1 = dataset['text_user']
user_t2 = dataset['memory_num_j']
user_t3 = dataset['memory_list_j']

y_t = dataset['label']

In [13]:
sen_len_job = 200
preprocess_job1 = Preprocess(x_t1, sen_len_job, w2v_path="word2vec_model/word2vec_shared.model")
embedding1 = preprocess_job1.make_embedding(load=True)
job1 = preprocess_job1.sentence_word2idx()

Get embedding ...
loading word2vec model ...


  self.embedding_matrix.append(self.embedding[word])
  self.embedding_matrix = torch.tensor(self.embedding_matrix)


total words: 137703


In [14]:
torch.save(embedding1,"baseline_dataset/embedding3.pt")

In [5]:
embedding1 = torch.load("baseline_dataset/embedding3.pt")

In [16]:
# preprocess_job2 = Preprocess(x_t2, sen_len_job, w2v_path="word2vec_model/word2vec1.model")
# embedding2 = preprocess_job2.make_embedding(load=True)
job2 = torch.LongTensor(x_t2)

preprocess_job3 = Preprocess1(x_t3, sen_len_job, w2v_path="word2vec_model/word2vec_shared.model")
embedding3 = preprocess_job3.make_embedding(load=True)
job3 = preprocess_job3.sentencelist_2idx()

Get embedding ...
loading word2vec model ...


  self.embedding_matrix.append(self.embedding[word])


total words: 137703


In [17]:
sen_len_user = 200
preprocess_user1 = Preprocess(user_t1, sen_len_user, w2v_path="word2vec_model/word2vec_shared.model")
embedding4 = preprocess_user1.make_embedding(load=True)
user1 = preprocess_user1.sentence_word2idx()

# preprocess_user2 = Preprocess(user_t1, sen_len_user, w2v_path="word2vec_model/word2vec2.model")
# embedding5 = preprocess_user2.make_embedding(load=True)
user2 = torch.LongTensor(user_t2)

Get embedding ...
loading word2vec model ...


  self.embedding_matrix.append(self.embedding[word])


total words: 137703


In [18]:
preprocess_user3 = Preprocess1(user_t1, sen_len_user, w2v_path="word2vec_model/word2vec_shared.model")
embedding6 = preprocess_user3.make_embedding(load=True)
user3 = preprocess_user3.sentencelist_2idx()

Get embedding ...
loading word2vec model ...


  self.embedding_matrix.append(self.embedding[word])


total words: 137703


In [19]:
y_t1 = [int (num) for num in y_t]
y = torch.LongTensor(y_t1)

In [20]:
def train_test_val_split(x1,x2,x3,x4,x5,x6,y, ratio_train, ratio_test, ratio_val):
    x1_train, x1_middle,x2_train, x2_middle,x3_train, x3_middle,x4_train, x4_middle,x5_train, x5_middle,x6_train, x6_middle,y_train, y_middle = train_test_split(x1,x2,x3,x4,x5,x6,y, test_size=1-ratio_train, random_state=20)
    ratio = ratio_val/(ratio_test + ratio_val)
    x1_test, x1_validation,x2_test, x2_validation,x3_test, x3_validation,x4_test, x4_validation,x5_test, x5_validation,x6_test, x6_validation,y_test, y_validation = train_test_split(x1_middle,x2_middle,x3_middle,x4_middle,x5_middle,x6_middle,y_middle, test_size=ratio, random_state=20)
    return x1_train, x1_test, x1_validation,x2_train, x2_test, x2_validation,x3_train, x3_test, x3_validation,x4_train, x4_test, x4_validation,x5_train, x5_test, x5_validation,x6_train, x6_test, x6_validation,y_train, y_test, y_validation

In [21]:
x1_train, x1_test, x1_validation,x2_train, x2_test, x2_validation,x3_train, x3_test, x3_validation,x4_train, x4_test, x4_validation,x5_train, x5_test, x5_validation,x6_train, x6_test, x6_validation,y_train, y_test, y_validation = train_test_val_split(job1,job2,job3,user1,user2,user3,y, 0.6, 0.2, 0.2)

In [22]:
# dataset构建
train_dataset = JobUserDataset(x1_train, x2_train, x3_train,x4_train,x5_train,x6_train,y_train)
val_dataset = JobUserDataset(x1_test, x2_test, x3_test,x4_test,x5_test,x6_test,y_test)
test_dataset = JobUserDataset(x1_validation, x2_validation, x3_validation,x4_validation,x5_validation,x6_validation,y_validation)

In [None]:
# 存储dataset
torch.save(train_dataset, "baseline_dataset/train_1.dataset")
torch.save(val_dataset, "baseline_dataset/val_1.dataset")
torch.save(test_dataset, "baseline_dataset/test_1.dataset")

In [6]:
# 导入dataset
train_dataset = torch.load("baseline_dataset/train_1.dataset")
val_dataset = torch.load("baseline_dataset/val_1.dataset")
test_dataset = torch.load("baseline_dataset/test_1.dataset")

In [7]:
batch_size = 32 # 一次训练所选取的样本数
# dataset导入
train_loader = DataLoader(dataset= train_dataset, batch_size = batch_size, shuffle = False)
val_loader = DataLoader(dataset = val_dataset, batch_size = batch_size, shuffle = False)
test_loader =DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle = False)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
class JRMPM(torch.nn.Module):
    def __init__(self, word_embeddings):
        super(JRMPM, self).__init__()

        # profile: word embeddings for look_up
        # embedding_matrix = [[0...0], [...], ...[]]
        self.word_embeddings = torch.nn.Embedding.from_pretrained(word_embeddings, padding_idx=0)
        self.word_embeddings.weight.requires_grad = False

        # BI-GRU: int(USER_EMBED_DIM/2) * 2 = USER_EMBED_DIM
        self.expect_words_gru = torch.nn.GRU(input_size=WORD_EMBED_DIM, hidden_size=int(USER_EMBED_DIM/2),
                                    num_layers=1, batch_first=True, bidirectional=True)
        self.job_words_gru = torch.nn.GRU(input_size=WORD_EMBED_DIM, hidden_size=int(USER_EMBED_DIM/2),
                                    num_layers=1, batch_first=True, bidirectional=True)

        # GRU: USER_EMBED_DIM
        self.expect_sent_gru = torch.nn.GRU(input_size=USER_EMBED_DIM, hidden_size=USER_EMBED_DIM,
                                    num_layers=1, batch_first=True, bidirectional=False)
        self.job_sent_gru = torch.nn.GRU(input_size=USER_EMBED_DIM, hidden_size=USER_EMBED_DIM,
                                    num_layers=1, batch_first=True, bidirectional=False)

        # memory profiling
        self.expect_momery = torch.nn.Embedding(MAX_PROFILELEN, USER_EMBED_DIM)
        self.expect_momery.weight.requires_grad = True
        self.job_momery = torch.nn.Embedding(MAX_PROFILELEN, USER_EMBED_DIM)
        self.job_momery.weight.requires_grad = True


        # update pi: beta, gamma
        self.expect_update_pi = torch.nn.Sequential(
            torch.nn.Linear(MAX_PROFILELEN, MAX_PROFILELEN, bias=False),
            torch.nn.Tanh(),
            torch.nn.Softmax(dim=-2)
        )
        self.job_update_pi = torch.nn.Sequential(
            torch.nn.Linear( MAX_PROFILELEN,  MAX_PROFILELEN, bias=False),
            torch.nn.Tanh(),
            torch.nn.Softmax(dim=-2)
        )

        # update g:
        self.expect_g_update = torch.nn.Sequential(
            torch.nn.Linear(3 * USER_EMBED_DIM, 1, bias=False),
            torch.nn.Sigmoid()
        )
        self.job_g_update = torch.nn.Sequential(
            torch.nn.Linear(3 * USER_EMBED_DIM, 1, bias=False),
            torch.nn.Sigmoid()
        )

        # read phi: alpha
        self.expect_read_phi = torch.nn.Sequential(
            torch.nn.Linear(MAX_PROFILELEN, MAX_PROFILELEN, bias=False),
            torch.nn.Tanh(),
            torch.nn.Softmax(dim=-2)
        )
        self.job_read_phi = torch.nn.Sequential(
            torch.nn.Linear(MAX_PROFILELEN, MAX_PROFILELEN, bias=False),
            torch.nn.Tanh(),
            torch.nn.Softmax(dim=-2)
        )
        # read g:
        self.expect_g_read = torch.nn.Sequential(
            torch.nn.Linear(3 * USER_EMBED_DIM, 1, bias=False),
            torch.nn.Sigmoid()
        )
        self.job_g_read = torch.nn.Sequential(
            torch.nn.Linear(3 * USER_EMBED_DIM, 1, bias=False),
            torch.nn.Sigmoid()
        )

        # match
        self.MLP = torch.nn.Sequential(
            torch.nn.Linear(2 * MAX_PROFILELEN * USER_EMBED_DIM,  MAX_PROFILELEN * USER_EMBED_DIM),
            torch.nn.Tanh(),
            torch.nn.Linear(MAX_PROFILELEN * USER_EMBED_DIM, 1),
            torch.nn.Sigmoid()
        )

    # profiles: [batch_size, MAX_PROFILELEN, MAX_TERMLEN] = (40, 15, 50), word idx
    def __words_BiGRU__(self, profiles, isexpect=True):
        # word level:
        shape = profiles.shape # [132, 20, 50]
        profiles_ = profiles.contiguous().view([-1, shape[-1]])
        # sort expects_sample_: large to small
        # sorted [batch_size * MAX_PROFILELEN, MAX_TERMLEN](40 * 15, 50)
        lens = (profiles_ > 0).sum(dim=-1)
        lens_sort, ind_sort = lens.sort(dim=0, descending=True)
        profiles_sort = profiles_[ind_sort]
        # embeddings: [batch_size * MAX_PROFILELEN, MAX_TERMLEN, EMBED_DIM]
        profile_embed = self.word_embeddings(profiles_sort).float()
        profile_pack = pack_padded_sequence(profile_embed, lens_sort.cpu(), batch_first=True)
        if isexpect:
            _, sent_hidden = self.expect_words_gru(profile_pack)
        else:
            _, sent_hidden = self.job_words_gru(profile_pack)
        # [2640, 2, 50]
        sent_hidden = sent_hidden.permute(1, 0, 2).contiguous().view([-1, USER_EMBED_DIM])
        sent_hidden = sent_hidden[ind_sort].view([shape[0], shape[1], -1])
        # [132, 20, 100]
        return sent_hidden

    # sents: [batch_size, MAX_PROFILELEN, dim]
    def __sents_GRU__(self, sent_hidden, isexpect=True):
        if isexpect:
            out, _ = self.expect_sent_gru(sent_hidden)
        else:
            out, _ = self.job_sent_gru(sent_hidden)
        return out

    def profile2sent(self, profiles, isexpect):
        return self.__sents_GRU__(self.__words_BiGRU__(profiles, isexpect), isexpect)

    # memory:  [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
    # a_sents: [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
    # b_sents: [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
    # col_mask: [batch]
    def update(self, memory, a_sents, b_sents, col_mask, isexpect=True):
        if isexpect:
            # [batch, n, n*]
            # print(torch.bmm(memory, a_sents.permute(0, 2, 1)).size())
            beta = self.expect_update_pi(torch.bmm(memory, a_sents.permute(0, 2, 1)))
            gamma = self.expect_update_pi(torch.bmm(memory, b_sents.permute(0, 2, 1)))
        else:
            beta = self.job_update_pi(torch.bmm(memory, a_sents.permute(0, 2, 1)))
            gamma = self.job_update_pi(torch.bmm(memory, b_sents.permute(0, 2, 1)))

        # [batch, n, n*] * [batch, n, dim] = [batch, n, dim]
        i_update = torch.bmm(beta, a_sents) + torch.bmm(gamma, b_sents)
        # [batch, n, dim]
        if isexpect:
            g_update = self.expect_g_update(torch.cat([memory, i_update, memory * i_update], dim=-1))
        else:
            g_update = self.job_g_update(torch.cat([memory, i_update, memory * i_update], dim=-1))
        # m_{k+1}
        # [batch, MAX_PROFILELEN, USER_EMBED_DIM]
        memory_update = g_update * memory + (1-g_update) * memory
        # mask
        shape = memory_update.shape
        memory_update_mask = (torch.unsqueeze(col_mask, 1) * memory_update.view([shape[0], -1])).view(shape)
        memory_noupdate_mask = (torch.unsqueeze(1.-col_mask, 1) * memory.contiguous().view([shape[0], -1])).view(shape)
        return memory_update_mask + memory_noupdate_mask

    # memory: [batch, n, dim] [1, 20, 100]
    # hidden_last: [batch, n, dim] [1, 20, 100]
    # a_sents: [batch, n, dim] [1, 20, 100]
    def read(self, memory, hidden_last, a_sents, isexpect=True):
        # [batch, n, n*]
        if isexpect:
            alpha = self.expect_read_phi(torch.bmm(memory, (hidden_last * a_sents).permute(0, 2, 1)))
        else:
            alpha = self.job_read_phi(torch.bmm(memory, (hidden_last * a_sents).permute(0, 2, 1)))

        # [batch, n, n*] * [batch, n, dim] = [batch, n, dim]
        i_read = torch.bmm(alpha, memory)
        # [batch, n, dim],
        if isexpect:
            g_read = self.expect_g_read(torch.cat([a_sents, i_read, a_sents * i_read], dim=-1))
        else:
            g_read = self.job_g_read(torch.cat([a_sents, i_read, a_sents * i_read], dim=-1))

        # [batch, n, dim]
        hidden = g_read * i_read + (1 - g_read) * hidden_last
        return hidden

    # a_profiles: [batch, sent, word] [1, 20, 50], tensor
    # b_profiless: [batch, max_seq_len, sent, word] [1, 3, 20, 50], tensor
    # b_seq_lens: [], list
    def process_seq(self, a_profiles, b_seq_profiless, b_seq_lens, isexpect=True):
        # [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
        batch_a_sents = self.profile2sent(a_profiles, isexpect)
        # [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
        batch_memory = batch_hidden = self.profile2sent(a_profiles, not isexpect)
        # print(max(b_seq_lens))
        for i in range(10):
            # [1,0,... ]
            col_mask = torch.from_numpy((np.array(b_seq_lens.cpu())-i>0)+0.).float().to(device)
            # col_mask = torch.Tensor((np.array(10)-i>0)+0.).float().to(device)
            # [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
            batch_b_sents = self.profile2sent(b_seq_profiless[:, i, :, :], not isexpect)
            # batch_memory:  [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
            # batch_a_sents: [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
            # batch_b_sents: [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
            # batch_memory:  [batch, MAX_PROFILELEN, USER_EMBED_DIM] [1, 20, 100]
            batch_memory = self.update(batch_memory, batch_a_sents,batch_b_sents, col_mask, isexpect)
            batch_hidden = self.read(batch_memory, batch_hidden, batch_a_sents, isexpect)

        return batch_hidden

    # [100, 20, 100] [100, 20, 100]
    def predict(self, expect_hidden, job_hidden):
        expect_hidden_ = expect_hidden.reshape([expect_hidden.shape[0], -1])
        job_hidden_ = job_hidden.reshape([job_hidden.shape[0], -1])
        return self.MLP(torch.cat([expect_hidden_, job_hidden_], -1))

    def forward(self, job1, job2, job3, user1, user2, user3):
        job_hidden = self.process_seq(job1.long().unsqueeze(1), job3.long().unsqueeze(2), job2.long())
        user_hidden = self.process_seq(user1.long().unsqueeze(1), user3.long().unsqueeze(2), user2.long())
        x = self.predict(job_hidden, user_hidden).squeeze(1)
        return x

In [10]:
def training(n_epoch, lr, train, valid, model, device, model_name, model_dir="./"):
    # summary model parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print("\nstart training, total parameter:{}, trainable:{}\n".format(total, trainable))
    model.cuda()
    model.train()
    criterion = nn.BCELoss()
    t_batch = len(train)
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr) #, weight_decay=1e-4
    # lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epoch, eta_min=0, last_epoch=-1)
    # total_loss, total_acc = 0, 0
    best_acc, best_precision, best_recall, best_f1, best_auc = 0, 0, 0, 0, 0

    for epoch in range(n_epoch):
        start_time = time.time()
        total_loss, total_acc = 0, 0
        pred_label = []
        y_label = []
        # training
        for i, (jobs1, jobs2, jobs3, users1, users2, users3,labels) in enumerate(valid):
            # 放GPU上运行
            jobs1 = jobs1.to(torch.float32)
            jobs1 = jobs1.to(device)

            jobs2 = jobs2.to(torch.float32)
            jobs2 = jobs2.to(device)

            jobs3 = jobs3.to(torch.float32)
            jobs3 = jobs3.to(device)

            users1 = users1.to(torch.float32)
            users1 = users1.to(device)

            users2 = users2.to(torch.float32)
            users2 = users2.to(device)

            users3 = users3.to(torch.float32)
            users3 = users3.to(device)

            labels = labels.to(torch.float32)
            labels = labels.to(device)
            
            # TODO 是否考虑模型用多个优化器？
            optimizer.zero_grad() # 将所有模型参数的梯度置为0
            # model.zero_grad() # 除所有可训练的torch.Tensor的梯度
            outputs = model(jobs1, jobs2, jobs3, users1, users2, users3)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pred_label.extend([0 if i<0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
            y_label.extend(list(labels.cpu().detach().numpy()))
        train_losses = total_loss/t_batch
        train_acc = accuracy_score(y_label, pred_label)
        train_precision = precision_score(y_label, pred_label)
        train_recall = recall_score(y_label, pred_label)
        train_auc = roc_auc_score(y_label, pred_label)
        train_f1 = f1_score(y_label, pred_label)
        print('[ Epoch{}: {}/{}] '.format(epoch+1, i+1, t_batch))
        print('\nTrain | Loss:{:.5f} ACC:{:.5f} Precision:{:.5f} Recall:{:.5f} AUC:{:.5f} F1:{:.5f} Time:{:.6f}'.format(train_losses,train_acc,train_precision, train_recall,train_auc,train_f1, time.time()-start_time))

        # evaluation
        model.eval()
        with torch.no_grad():
            # pred_score = []
            pred_label = []
            y_label = []
            total_loss, total_acc = 0, 0
            for i, (jobs1, jobs2, jobs3, users1, users2, users3,labels) in enumerate(valid):
                # 放GPU上运行
                jobs1 = jobs1.to(torch.float32)
                jobs1 = jobs1.to(device)

                jobs2 = jobs2.to(torch.float32)
                jobs2 = jobs2.to(device)

                jobs3 = jobs3.to(torch.float32)
                jobs3 = jobs3.to(device)

                users1 = users1.to(torch.float32)
                users1 = users1.to(device)

                users2 = users2.to(torch.float32)
                users2 = users2.to(device)

                users3 = users3.to(torch.float32)
                users3 = users3.to(device)

                labels = labels.to(torch.float32)
                labels = labels.to(device)

                outputs = model(jobs1, jobs2, jobs3, users1, users2, users3)

                loss = criterion(outputs, labels)
                total_loss += loss.item()
                '''
                存一下预测score
                '''
                # pred_score.extend([j for j in list(outputs.cpu().detach().numpy())])
                pred_label.extend([0 if i < 0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
                y_label.extend(list(labels.cpu().detach().numpy()))

            val_losses = total_loss/v_batch
            val_acc = accuracy_score(y_label, pred_label)
            val_precision = precision_score(y_label, pred_label)
            val_recall = recall_score(y_label, pred_label)
            val_auc = roc_auc_score(y_label, pred_label)
            val_f1 = f1_score(y_label, pred_label)
            print('\nVal | Loss:{:.5f} ACC:{:.5f} Precision:{:.5f} Recall:{:.5f} AUC:{:.5f} F1:{:.5f} Time:{:.6f}'.format(val_losses,val_acc,val_precision, val_recall,val_auc,val_f1, time.time()-start_time))
            if val_acc > best_acc:
                best_acc = val_acc
                best_precision = val_precision
                best_recall = val_recall
                best_f1 = val_f1
                best_auc = val_auc
                torch.save(model, "{}/{}.model".format(model_dir, model_name))
                print('save model with acc: {:.3f}, recall: {:.3f}, auc: {:.3f}'.format(best_acc,best_recall,best_auc))
        print('------------------------------------------------------')
        # lr_scheduler.step()
        # 将model的模式设为train，这样optimizer就可以更新model的參數（因為刚刚转为eval模式）
        model.train()
    return best_acc, best_precision, best_recall, best_f1, best_auc

In [19]:
WORD_EMBED_DIM = 200
USER_EMBED_DIM = 200

MAX_PROFILELEN = 1
MAX_TERMLEN = 30

In [20]:
fix_embedding = False
# input_dim = train_dataset[0][1].shape[0]
model = JRMPM(embedding1)
epoch = 20
lr = 0.001
model_dir = './'
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'JRMPM'

In [21]:
best_acc, best_precision, best_recall, best_f1, best_auc = training(epoch, lr, train_loader, val_loader, model, device, model_name, model_dir)

# 输出结果（验证集）
print('best_acc',best_acc)
print('best_precision',best_precision)
print('best_recall',best_precision)
print('best_f1',best_f1)
print('best_auc',best_auc)


start training, total parameter:28468605, trainable:928005

[ Epoch1: 864/2592] 

Train | Loss:0.23125 ACC:0.49949 Precision:0.50094 Recall:0.59687 AUC:0.49918 F1:0.54471 Time:278.990605

Val | Loss:0.69313 ACC:0.50130 Precision:0.50147 Recall:0.99560 AUC:0.49969 F1:0.66699 Time:357.354973
save model with acc: 0.501, recall: 0.996, auc: 0.500
------------------------------------------------------
[ Epoch2: 864/2592] 

Train | Loss:0.23099 ACC:0.50958 Precision:0.50930 Recall:0.61180 AUC:0.50925 F1:0.55586 Time:271.856415

Val | Loss:0.69305 ACC:0.50539 Precision:0.50360 Recall:0.97866 AUC:0.50384 F1:0.66500 Time:350.289876
save model with acc: 0.505, recall: 0.979, auc: 0.504
------------------------------------------------------
[ Epoch3: 864/2592] 

Train | Loss:0.23051 ACC:0.51450 Precision:0.51317 Recall:0.62665 AUC:0.51414 F1:0.56426 Time:273.124255

Val | Loss:0.69289 ACC:0.51837 Precision:0.51389 Recall:0.73740 AUC:0.51766 F1:0.60569 Time:351.709325
save model with acc: 0.518, 

In [22]:
def testing(model, test_loader):
    pred_label = []
    y_label = []
    model.eval()
    with torch.no_grad():
        for i, (jobs1, jobs2, jobs3, users1, users2, users3,labels) in enumerate(test_loader):
            # 放GPU上运行
            jobs1 = jobs1.to(torch.float32)
            jobs1 = jobs1.to(device)

            jobs2 = jobs2.to(torch.float32)
            jobs2 = jobs2.to(device)

            jobs3 = jobs3.to(torch.float32)
            jobs3 = jobs3.to(device)

            users1 = users1.to(torch.float32)
            users1 = users1.to(device)

            users2 = users2.to(torch.float32)
            users2 = users2.to(device)

            users3 = users3.to(torch.float32)
            users3 = users3.to(device)

            labels = labels.to(torch.float32)
            labels = labels.to(device)

            outputs = model(jobs1, jobs2, jobs3, users1, users2, users3)

            pred_label.extend([0 if i < 0.5 else 1 for i in list(outputs.cpu().detach().numpy())])
            y_label.extend(list(labels.cpu().detach().numpy()))

        test_acc = accuracy_score(y_label, pred_label)
        test_precision = precision_score(y_label, pred_label)
        test_recall = recall_score(y_label, pred_label)
        test_auc = roc_auc_score(y_label, pred_label)
        test_f1 = f1_score(y_label, pred_label)
    return test_acc, test_auc, test_precision, test_recall, test_f1

In [23]:
# 输出结果(测试集)
test_acc, test_auc, test_precision, test_recall, test_f1 = testing(
    torch.load('JRMPM.model'), test_loader)
print('test_acc', test_acc)
print('test_precision', test_precision)
print('test_recall', test_precision)
print('test_f1', test_f1)
print('test_auc', test_auc)

test_acc 0.5002170138888888
test_precision 0.4953673373888629
test_recall 0.4953673373888629
test_f1 0.4337813473201115
test_auc 0.4993546274484371
