In [1]:
import json
import jieba
import re
import numpy as np
import pandas as pd
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

'''
1.不论是单列文本还是多列文本都可以通过这个函数统一获取tfidf_pca向量
2.给apply函数传入了参数，免去了global申明
'''

def load_csv_data(data_path):
    """Load a CSV file into a DataFrame.

    :param data_path: location of the CSV file, handed unchanged to ``pd.read_csv``
    :returns: the ``pandas.DataFrame`` produced by ``pd.read_csv``
    """
    return pd.read_csv(data_path)

def col_jieba_fun(series, col_name):
    """Tokenize the text stored under ``col_name`` of one row with jieba.

    The raw value is first broken into fragments:

    * a string that starts with ``[`` and ends with ``]`` is parsed as a JSON
      list; if it is not valid JSON (for example a title such as
      ``[urgent]sales[city]``) it falls back to the separator split below
      instead of raising;
    * any other string is split on ``,``, ``，``, ``/`` and the space character.

    The fragments are then joined with single spaces (needed for mixed
    Chinese/English text) and passed to ``jieba.lcut`` in accurate mode.

    :param series: one row; only ``series[col_name]`` is read
    :param col_name: key of the text field inside ``series``
    :returns: list of tokens produced by ``jieba.lcut``
    """
    raw = series[col_name]

    # Missing cells (None / float NaN) count as empty text; any other
    # non-string value is stringified so the string methods below are safe.
    if raw is None or (isinstance(raw, float) and raw != raw):
        raw = ""
    elif not isinstance(raw, str):
        raw = str(raw)

    # String -> list of fragments
    parts = None
    if raw.startswith("[") and raw.endswith("]"):
        try:
            parsed = json.loads(raw)
        except ValueError:
            # json.JSONDecodeError is a ValueError: bracketed but not JSON.
            parsed = None
        if isinstance(parsed, list):
            # JSON items may be numbers or null; keep only real values as text
            parts = [str(item) for item in parsed if item is not None]
    if parts is None:
        parts = re.split(",|，|/| ", raw)

    # List -> string, then cut into words
    joined = " ".join(parts)
    return jieba.lcut(joined, cut_all=False)

def col_jieba_filter_fun(series, col_name_jieba):
    """Normalize and filter the token list stored under ``col_name_jieba``.

    Each token is passed through ``convert(token, "zh-hans")`` and lower-cased,
    then dropped when it is all digits, is at most one character long, or
    contains any of the punctuation marks listed below.

    :param series: one row; only ``series[col_name_jieba]`` is read
    :param col_name_jieba: key holding the list of tokens
    :returns: the surviving tokens joined by single spaces
    """
    pun_masks_english = [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    pun_masks_chinese = ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
    blocked_marks = pun_masks_english + pun_masks_chinese

    kept_tokens = []
    for raw_token in series[col_name_jieba]:
        # Simplified Chinese first, then lower-case for Latin letters
        token = convert(raw_token, "zh-hans").lower()

        # Pure numbers and single characters are discarded
        if token.isdigit() or len(token) <= 1:
            continue

        # Anything that still carries punctuation is discarded as well
        if any(mark in token for mark in blocked_marks):
            continue

        kept_tokens.append(token)

    return " ".join(kept_tokens)

def get_tfidf(df, col_name):
    """Fit a default ``TfidfVectorizer`` on one text column.

    :param df: frame holding the text column
    :param col_name: name of the column whose values are the documents
    :returns: tuple ``(tfidf_frame, vectorizer)`` where ``tfidf_frame`` is a
        DataFrame built from the ``toarray()`` form of the fitted matrix and
        ``vectorizer`` is the fitted ``TfidfVectorizer``
    """
    vectorizer = TfidfVectorizer()
    term_matrix = vectorizer.fit_transform(df[col_name])
    tfidf_frame = pd.DataFrame(term_matrix.toarray())
    return tfidf_frame, vectorizer

def get_tfidf_pca(tfidf, n=20, random_state=0):
    """Reduce a TF-IDF matrix to ``n`` dimensions with PCA.

    :param tfidf: TF-IDF matrix (rows are documents), passed to
        ``PCA.fit_transform``
    :param n: number of principal components to keep
    :param random_state: seed forwarded to ``PCA``. scikit-learn documents
        that some SVD solvers are randomized, so a fixed seed keeps the
        reduced vectors reproducible across runs. Pass ``None`` to get the
        previous unseeded behaviour.
    :returns: DataFrame with one row per input row and ``n`` columns
    """
    pca = PCA(n_components=n, random_state=random_state)
    reduced = pca.fit_transform(tfidf)
    return pd.DataFrame(reduced)

def col_merge_fun(series, col_name_jieba_filter_list):
    """Merge several filtered text columns of one row into a single string.

    Empty columns are skipped so the result never contains consecutive
    spaces; otherwise a later ``split(" ")`` on the merged text would yield
    empty tokens.

    :param series: one row; ``series[col]`` is read for each listed column
    :param col_name_jieba_filter_list: names of the text columns to merge, in order
    :returns: the non-empty column texts joined by single spaces
    """
    pieces = (series[col].strip(' ') for col in col_name_jieba_filter_list)
    return ' '.join(piece for piece in pieces if piece)

def get_tfidf_pca_from_text_cols(data_path, col_name_list, dimension):
    '''
    Compute PCA-reduced TF-IDF vectors from one or more text columns.

    For every column in ``col_name_list`` two helper columns are added to the
    frame: ``<col>_jieba`` (token list) and ``<col>_jieba_filter`` (filtered
    tokens joined by spaces). The filtered columns are merged into
    ``"_".join(col_name_list) + "_jieba_filter"``, which is vectorized with
    TF-IDF and then reduced with PCA. Intermediate results are printed.

    :param data_path: path of the CSV file to load
    :param col_name_list: names of the text columns to use
    :param dimension: number of PCA components kept from the TF-IDF matrix
    :returns: tuple ``(df, tfidf_pca)``: the loaded frame including all
        helper columns, and the reduced vectors as a DataFrame
    :raises ValueError: if ``col_name_list`` is empty
    '''
    if not col_name_list:
        # Without columns there is no text to vectorize; fail early and clearly.
        raise ValueError("col_name_list must contain at least one text column")

    # Load the CSV file
    df = load_csv_data(data_path)

    # Names of the tokenized-and-filtered columns, in input order
    col_name_jieba_filter_list = []

    for col_name in col_name_list:

        col_name_jieba = col_name + '_jieba'
        col_name_jieba_filter = col_name_jieba + '_filter'
        col_name_jieba_filter_list.append(col_name_jieba_filter)

        # step1: fill missing values. Assign the result back instead of
        # calling fillna(inplace=True) on the selected column, which is
        # chained assignment and is not guaranteed to update df.
        df[col_name] = df[col_name].fillna('')

        # step2: jieba tokenization. Only the column the helper reads is
        # passed to the row-wise apply, so rows are not built from every column.
        df[col_name_jieba] = df[[col_name]].apply(col_jieba_fun, axis=1, args=(col_name, ))

        # step3: token filtering
        df[col_name_jieba_filter] = df[[col_name_jieba]].apply(col_jieba_filter_fun, axis=1, args=(col_name_jieba, ))

        print("\n=================================={}==================================".format(col_name))
        print(df[[col_name, col_name_jieba, col_name_jieba_filter]])

    print(col_name_jieba_filter_list)

    # Merge all filtered columns into a single text column
    merge_col_jieba_filter = "_".join(col_name_list) + '_jieba_filter'
    df[merge_col_jieba_filter] = df[col_name_jieba_filter_list].apply(
        col_merge_fun, axis=1, args=(col_name_jieba_filter_list, )
    )

    print("\n=================================={}==================================".format('以上各列分词过滤后合并的新列'))
    print(df[[merge_col_jieba_filter]])

    # step4: TF-IDF (the fitted vectorizer is returned by get_tfidf but not used here)
    tfidf, vectorizer = get_tfidf(df, merge_col_jieba_filter)
    print("\n=================================={}==================================".format('tfidf向量'))
    print(tfidf)

    # step5: PCA reduction of the TF-IDF matrix
    tfidf_pca = get_tfidf_pca(tfidf, dimension)
    print("\n=================================={}==================================".format('tfidf_pca向量'))
    print(tfidf_pca)

    return df, tfidf_pca


if __name__ == "__main__":
    print("running...")

    # Inputs of the run: sample CSV and the text columns to vectorize.
    # `num` is not referenced by any cell visible here; kept because later
    # cells may rely on it.
    data_path = '../data/all_sample_20220821_spark.csv'
    num = 10
    col_name_list = ['title', 'category_name', 'tags']

    print("\n从文本列获取tfidf_pca向量\n")
    # A copy of the column list is handed over, as before.
    all_data, tfidf_pca = get_tfidf_pca_from_text_cols(
        data_path,
        list(col_name_list),
        dimension=10,
    )

    print("all is well")


running...

从文本列获取tfidf_pca向量



  df = pd.read_csv(data_path)
Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/b0/f13r09ys4819g6vy91gl1_zr0000gn/T/jieba.cache
Loading model cost 0.457 seconds.
Prefix dict has been built successfully.



                 title                      title_jieba title_jieba_filter
0              市场营销负责人                      [市场营销, 负责人]           市场营销 负责人
1               会员管理经理                     [会员, 管理, 经理]           会员 管理 经理
2               产品市场经理                     [产品, 市场, 经理]           产品 市场 经理
3        海外区域运营经理/高级经理      [海外, 区域, 运营, 经理,  , 高级, 经理]  海外 区域 运营 经理 高级 经理
4               高级项目经理                       [高级, 项目经理]            高级 项目经理
...                ...                              ...                ...
81291             技术经理                         [技术, 经理]              技术 经理
81292           中央市场总监                       [中央, 市场总监]            中央 市场总监
81293           中央市场总监                       [中央, 市场总监]            中央 市场总监
81294  java开发工程师（不限方向）  [java, 开发, 工程师, （, 不, 限, 方向, ）]     java 开发 工程师 方向
81295               研发                             [研发]                 研发

[81296 rows x 3 columns]

      category_name category_name_jieba category_name_jieba_filter
0    

In [3]:
# Inspect the column labels of the frame returned by get_tfidf_pca_from_text_cols.
all_data.columns

Index(['Unnamed: 0', 'cv_id', 'jd_id', 'jd_code', 'title', 'deadline',
       'category_id', 'category_name', 'company_id', 'company_name',
       'recruit_number', 'work_age', 'min_annual_salary', 'max_annual_salary',
       'min_month_salary', 'max_month_salary', 'pay_months', 'created_by_name',
       'created_by', 'created_at', 'updated_by', 'updated_at', 'tenant_id',
       'guarantee_period', 'status', 'recommended_rcn', 'tags', 'workplaces',
       'description', 'school', 'degree', 'post_level_id', 'requirement', 'ds',
       '_id', 'additionInfo', 'age', 'certificates', 'code', 'competenceScore',
       'completeness', 'countryCode', 'createdAt', 'createdBy',
       'currentCompany', 'currentPosition', 'currentSalary', 'dateOfBirth',
       'degree.1', 'desiredIndustry', 'desiredPosition', 'desiredSalary',
       'dutyTime', 'expectLocation', 'gender', 'importType', 'industry',
       'interviewTime', 'jobIntentStatus', 'languageSkills', 'languages',
       'location', 'mainta

In [4]:
# Display the PCA-reduced TF-IDF frame returned together with all_data.
tfidf_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.042801,-0.158420,-0.004026,0.164216,-0.007686,0.003742,0.000511,-0.021473,-0.032958,-0.030909
1,-0.236844,0.085825,-0.230645,-0.025122,-0.002697,0.011322,0.020699,-0.038072,0.001349,-0.039086
2,-0.190874,0.194127,0.343109,0.055068,0.033935,0.041339,-0.054489,0.004275,-0.048443,0.074524
3,-0.372097,0.191212,-0.314994,-0.012852,-0.015042,0.001714,0.019897,0.001633,-0.029201,0.113210
4,0.005622,-0.057167,0.001178,-0.038613,0.003212,0.000648,-0.039050,-0.022315,-0.067211,-0.072346
...,...,...,...,...,...,...,...,...,...,...
81291,0.034487,-0.046805,0.049996,-0.045017,-0.024679,-0.012978,-0.034838,-0.015600,0.000759,-0.026918
81292,0.015164,-0.080298,0.009687,-0.010142,-0.032656,-0.000418,-0.043686,-0.023858,-0.044795,-0.034738
81293,-0.005333,-0.084557,0.012084,0.034595,-0.044242,0.004560,-0.039013,-0.022379,-0.049655,-0.018145
81294,0.547197,0.404913,-0.158576,0.200387,0.078450,0.038575,0.045299,-0.023784,0.010787,0.048172


In [5]:
# Note: the letter case of these column names differs slightly from the
# names used in the sample pipeline; this is not a big problem.
cv_columns = [
    'cv_id',
    'currentPosition',
    'desiredPosition',
    'industry',
    'desiredIndustry',
    'majorName',
    'skills',
    'eduTracks',
    'jobTracks',
    'projectTracks',
]
jd_columns = [
    'jd_id',
    'title',
    'category_name',
    'tags',
    'description',
    'requirement',
]

In [6]:
# Show the merged text column built from the filtered tokens of
# title, category_name and tags.
all_data[['title_category_name_tags_jieba_filter']]

Unnamed: 0,title_category_name_tags_jieba_filter
0,市场营销 负责人 市场营销 营销 管理 销售 战略规划 管理工作
1,会员 管理 经理 用户 运营 会员 运营 会员 管理 管理 营销 社群 运营
2,产品 市场 经理 产品 经理 excel office ppt ui
3,海外 区域 运营 经理 高级 经理 运营 管理 运营 团队 管理 电商 游戏
4,高级 项目经理 高级 管理 项目管理 推广 归档 风险 评估 交货
...,...
81291,技术 经理 机械设备 工程师 材料 考核 维修 技术标准 不锈钢
81292,中央 市场总监 市场总监 市场推广 医疗器械 语言 材料 营销
81293,中央 市场总监 市场 营销 市场推广 医疗器械 语言 材料 营销
81294,java 开发 工程师 方向 java java 服务 性能 优化 sql 架构 需求 分析


In [21]:
def get_cos_sim(v1, v2):
    '''
    Cosine similarity between two vectors.

    Accepts lists, tuples, numpy arrays or pandas Series. Emptiness is
    checked through the array size, because truth-testing a multi-element
    array raises ``ValueError``.

    :param v1: first vector (sequence of numbers) or ``None``
    :param v2: second vector (sequence of numbers) or ``None``
    :returns: ``dot(v1, v2) / (|v1| * |v2|)``, or ``0`` when either vector is
        ``None``, empty, or has zero norm
    '''
    if v1 is None or v2 is None:
        return 0

    a = np.asarray(v1)
    b = np.asarray(v2)
    if a.size == 0 or b.size == 0:
        return 0

    # A zero-length vector has no direction; report no similarity instead of dividing by zero
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        return 0

    return np.dot(a, b) / norm_product

def get_text_3(df, col, num1, num2, num3):
    '''
    Print the tokens of three rows of a text column.

    Rows are selected by 0-based position (``.iloc``), matching how
    ``get_sim_2`` selects the corresponding vectors, so the printed text and
    the compared vectors refer to the same rows even when ``df`` does not
    have a default integer index. The printed label is the 1-based position.

    :param df: frame holding the text column
    :param col: name of a column whose values are space-separated tokens
    :param num1: 0-based position of the first row
    :param num2: 0-based position of the second row
    :param num3: 0-based position of the third row
    '''
    column = df[col]
    for num in (num1, num2, num3):
        text = column.iloc[num].split(" ")
        print("第{}条数据: {}".format(num+1, text))

def get_sim_2(df, col, tfidf_pca, num1, num2):
    '''
    Print the cosine similarity between two rows of ``tfidf_pca``.

    ``df`` and ``col`` are accepted but not read by this function.

    :param tfidf_pca: frame of vectors; rows are taken by position via ``.iloc``
    :param num1: 0-based position of the first row
    :param num2: 0-based position of the second row
    '''
    row_a, row_b = (list(tfidf_pca.iloc[pos]) for pos in (num1, num2))
    similarity = get_cos_sim(row_a, row_b)

    # Labels in the printed line are 1-based
    label_a, label_b = num1 + 1, num2 + 1
    print("{}_{}_{}:\t{}".format('sim', label_a, label_b, similarity))

def get_sim_3(df, col, tfidf_pca, num1, num2, num3):
    '''
    Print three rows of ``col`` and the similarity of each pair among them.

    Calls ``get_text_3`` once, then ``get_sim_2`` for the pairs
    (num1, num2), (num1, num3) and (num2, num3), in that order.
    '''
    get_text_3(df, col, num1, num2, num3)
    for left, right in ((num1, num2), (num1, num3), (num2, num3)):
        get_sim_2(df, col, tfidf_pca, left, right)

# Merged text column to print, and the frame that holds it
col = 'title_category_name_tags_jieba_filter'
df = all_data

# Compare the rows at 0-based positions 9, 2 and 22 (printed as 10, 3 and 23)
get_sim_3(df, col, tfidf_pca, 9, 2, 22)

第10条数据: ['专卖店', '店长', '门店', '店长', '管理', '调配', '信息', '收集', '销售', '管理', '门店', '运营', '管理']
第3条数据: ['产品', '市场', '经理', '产品', '经理', 'excel', 'office', 'ppt', 'ui']
第23条数据: ['产品', '上海', '产品', '经理', '架构', '市场', '分析', '产品', '研发', '产品', '规划', '产品', '运营']
sim_10_3:	-0.4111724088390137
sim_10_23:	-0.3731937456241913
sim_3_23:	0.9136420349179402
