In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import jieba
import jieba.posseg as pseg
import re
from gensim.models import word2vec, Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
dataset = pd.read_csv('dataset_user_job_all_1.csv')

In [3]:
stop_words = [line.strip() for line in open('chinese_stopword.txt',encoding='UTF-8').readlines()]

def pretreatment(comment):

    token_words = jieba.lcut(comment)
    token_words = [w for w in token_words if w not in stop_words]
    token_words =  pseg.cut(' '.join(token_words))
    cleaned_word = []
    for word, tag in token_words:
        if word.isdigit():
            continue
        else:
            cleaned_word.append(word)
    return cleaned_word

In [4]:
segment_job =[]
# job_set=pd.read_csv('job_information.csv')
for content in tqdm(dataset["岗位描述"].values):
#     segment.append(pretreatment(content))
    segment_job.append(list(jieba.cut(content)))
dataset["text_job"] = segment_job
# job_set.to_csv("job_set_segment.csv",index=False)

  0%|          | 0/138238 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.823 seconds.
Prefix dict has been built successfully.
100%|██████████| 138238/138238 [02:23<00:00, 964.06it/s] 


In [5]:
segment_user = []
# job_set=pd.read_csv('job_information.csv')
for content in tqdm(dataset["resume"].values):
#     segment.append(pretreatment(content))
    segment_user.append(list(jieba.cut(content)))
dataset["text_user"] = segment_user
# user_set.to_csv("user_set_segment.csv",index=False)

100%|██████████| 138238/138238 [12:42<00:00, 181.24it/s]


In [6]:
# # 训练共享的word2vec模型
# text_array = np.concatenate((dataset.text_job.values,dataset.text_user.values),axis=0)
# w2v_model = word2vec.Word2Vec(text_array, size=200, window=5, min_count=2, workers=8, iter=10, sg=1)
# w2v_model.save('autodl-fs/word2vec_shared.model')

In [8]:
w2v_model = Word2Vec.load('autodl-fs/word2vec_shared.model')

In [9]:
word_present_list = list(w2v_model.wv.index2word)# 获取已转换的词
# word_present_list

In [10]:
dataset['text_user'] = dataset['text_user'].apply(lambda x:[i for i in x if i in word_present_list])# 去掉未转换的词

In [12]:
# 将所有word embedding的列平均值作为文本的表示
resume_vector_array = dataset['text_user'].apply(lambda x: np.mean([np.array(w2v_model.wv[i]).reshape(1,200) for i in x], axis=0))

In [14]:
resume_with_word2vec = np.zeros((1,200)) # 1*100的零矩阵
for idx in tqdm(range(len(resume_vector_array))):
    resume_with_word2vec = np.concatenate((resume_with_word2vec, resume_vector_array.values[idx]), axis=0)# 纵向合并
resume_with_word2vec = pd.DataFrame(resume_with_word2vec)

100%|██████████| 138238/138238 [3:02:38<00:00, 12.61it/s]  


In [15]:
dataset['text_job'] = dataset['text_job'].apply(lambda x:[i for i in x if i in word_present_list])# 去掉未转换的词
# 将所有word embedding的列平均值作为文本的表示
job_text_vector_array = dataset['text_job'].apply(lambda x: np.mean([np.array(w2v_model.wv[i]).reshape(1,200) for i in x], axis=0))

In [16]:
job_text_with_word2vec = np.zeros((1,200))
for idx in tqdm(range(len(job_text_vector_array))):
    try:
        job_text_with_word2vec = np.concatenate((job_text_with_word2vec, job_text_vector_array.values[idx]), axis=0)# 纵向合并
    except:
        job_text_with_word2vec = np.concatenate((job_text_with_word2vec, np.zeros((1,200))), axis=0)# 纵向合并
job_text_with_word2vec = pd.DataFrame(job_text_with_word2vec)

100%|██████████| 138238/138238 [3:06:22<00:00, 12.36it/s]  


In [20]:
dataset['emb_job'] = job_text_vector_array
dataset['emb_user'] = resume_vector_array

dataset.head(5)

Unnamed: 0,UserID,JobID,label,企业ID,企业行业一级类别,企业行业二级类别,企业行业三级类别,企业融资阶段,企业人员规模,企业上班时间,...,性别,学校,专业,学历,创建时间_y,更新时间_y,text_job,text_user,emb_job,emb_user
0,6862870633512964096,6860521459735859200,1,6860272998989959168,互联网/IT/电子/通信,互联网/IT/电子/通信,数据服务,不需要融资,1000~9999人,08:30,...,女,大连海事大学,法学,硕士,2021-11-25 21:02:04,2021-11-25 21:02:39,"[岗位职责, ：, \n, ·, , 业务学习, &, 任务, 处理, ：, , 严格,...","[教育, 背景, , Education, \n, 2015.09, , —, , 2...","[[0.013017808, -0.024077587, 0.11447856, 0.105...","[[0.06018763, -0.0059273904, 0.07602025, 0.037..."
1,6862870633512964096,6958017954889863168,0,6872143749984555008,互联网/IT/电子/通信,交通/物流/贸易/零售,交通/运输,未融资,0~20人,08:30,...,女,大连海事大学,法学,硕士,2021-11-25 21:02:04,2021-11-25 21:02:39,"[在, 海轮, 上, 工作, 的, 人员, 统称, 海员, 。, , 海员, 分, 两大类...","[教育, 背景, , Education, \n, 2015.09, , —, , 2...","[[0.17614882, 0.011191479, 0.0650312, 0.085126...","[[0.06018763, -0.0059273904, 0.07602025, 0.037..."
2,6863065345238765568,6860521458255269888,1,6860272998989959168,互联网/IT/电子/通信,互联网/IT/电子/通信,数据服务,不需要融资,1000~9999人,08:30,...,女,淮阴工学院,计算机应用技术,硕士,2021-11-26 09:55:47,2023-02-03 09:32:22,"[岗位职责, ：, \n, 1, ., , 响应, 用户, IT, 相关, 问题, 请求,...","[基本, 信息, \n, 姓, , 名, , ：, 张, , 发, \n, 出生年月,...","[[0.060279142, -0.067548156, 0.111521624, 0.11...","[[0.021132212, -0.083985, 0.12868516, 0.115934..."
3,6863065345238765568,6860644503477293056,1,6860297756443938816,互联网/IT/电子/通信,互联网/IT/电子/通信,计算机软件,未融资,100~499人,08:30,...,女,淮阴工学院,计算机应用技术,硕士,2021-11-26 09:55:47,2023-02-03 09:32:22,"[帮助, 客户, 公司, 进行, 信息化, 咨询, 。, 具体, 是, 通过, SAP, ...","[基本, 信息, \n, 姓, , 名, , ：, 张, , 发, \n, 出生年月,...","[[0.029140053, -0.0015383105, 0.17354803, 0.06...","[[0.021132212, -0.083985, 0.12868516, 0.115934..."
4,6863065345238765568,7032757799176114176,0,6872870554169249792,互联网/IT/电子/通信,服务业,酒店,不需要融资,100~499人,08:30,...,女,淮阴工学院,计算机应用技术,硕士,2021-11-26 09:55:47,2023-02-03 09:32:22,"[负责, 起草, 文书]","[基本, 信息, \n, 姓, , 名, , ：, 张, , 发, \n, 出生年月,...","[[0.2728341, 0.12886722, 0.00583907, 0.1525331...","[[0.021132212, -0.083985, 0.12868516, 0.115934..."


In [30]:
X_t = []
for i in range(0,len(dataset)):
    X_t.append(np.concatenate((dataset.loc[i,'emb_user'].tolist()[0],dataset.loc[i,'emb_job'].tolist()[0]), axis=0))

# X_t = np.concatenate(resume_vector_array,job_text_vector_array)

Y_t = dataset.label.values.tolist()

In [31]:
print(len(X_t))

138238


In [32]:
X_train, X_test,Y_train,Y_test = train_test_split(X_t,Y_t,test_size=0.2,random_state=20)

In [33]:
def show_result(y_true, y_prediction):
    print('acc:',accuracy_score(y_true, y_prediction))
    print('precision:',precision_score(y_true, y_prediction))
    print('recall:',recall_score(y_true, y_prediction))
    print('f1:',f1_score(y_true, y_prediction))
    print('auc:',roc_auc_score(y_true, y_prediction))

In [34]:
'''
    logistic regression
'''
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)
show_result(Y_test, y_pred)

acc: 0.6709707754629629
precision: 0.6744308759234132
recall: 0.6518761384335154
f1: 0.6629617279834018
auc: 0.670835002348985


In [35]:
'''
    GaussianNB
'''
nb = GaussianNB()
nb.fit(X_train, Y_train)
y_pred = nb.predict(X_test)
show_result(Y_test, y_pred)

acc: 0.5958839699074074
precision: 0.6583126550868487
recall: 0.3865938069216758
f1: 0.48712416800550845
auc: 0.594395804559739


In [36]:
'''
    DecisionTree
'''
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
y_pred = dt.predict(X_test)
show_result(Y_test, y_pred)

acc: 0.7226200810185185
precision: 0.7129994372537986
recall: 0.7385063752276867
f1: 0.7255287928134283
auc: 0.7227330410936968


In [37]:
'''
    RandomForest
'''
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)
show_result(Y_test, y_pred)

acc: 0.8285228587962963
precision: 0.8434250764525993
recall: 0.8037887067395264
f1: 0.8231300130572655
auc: 0.8283469857047484


In [38]:
'''
    AdaBoost
'''
ada = AdaBoostClassifier()
ada.fit(X_train, Y_train)
y_pred = ada.predict(X_test)
show_result(Y_test, y_pred)

acc: 0.6591073495370371
precision: 0.6638719512195121
recall: 0.634608378870674
f1: 0.648910411622276
auc: 0.6589331487113551


In [41]:
'''
    linearRegression
'''
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)
y_pred = [0 if i<0.5 else 1 for i in y_pred]
show_result(Y_test, y_pred)

acc: 0.6695601851851852
precision: 0.6764864241212214
recall: 0.6408014571948998
f1: 0.658160592681284
auc: 0.6693556951994754
