In [1]:
import re
import json
import numpy as np
import pandas as pd
import jieba
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [2]:
# 读入20220821所有数据
all_data = pd.read_csv('../data/all_sample_20220821_spark.csv').drop(['Unnamed: 0'], axis=1)

  all_data = pd.read_csv('../data/all_sample_20220821_spark.csv').drop(['Unnamed: 0'], axis=1)


In [4]:
all_data.columns

Index(['cv_id', 'jd_id', 'jd_code', 'title', 'deadline', 'category_id',
       'category_name', 'company_id', 'company_name', 'recruit_number',
       'work_age', 'min_annual_salary', 'max_annual_salary',
       'min_month_salary', 'max_month_salary', 'pay_months', 'created_by_name',
       'created_by', 'created_at', 'updated_by', 'updated_at', 'tenant_id',
       'guarantee_period', 'status', 'recommended_rcn', 'tags', 'workplaces',
       'description', 'school', 'degree', 'post_level_id', 'requirement', 'ds',
       '_id', 'additionInfo', 'age', 'certificates', 'code', 'competenceScore',
       'completeness', 'countryCode', 'createdAt', 'createdBy',
       'currentCompany', 'currentPosition', 'currentSalary', 'dateOfBirth',
       'degree.1', 'desiredIndustry', 'desiredPosition', 'desiredSalary',
       'dutyTime', 'expectLocation', 'gender', 'importType', 'industry',
       'interviewTime', 'jobIntentStatus', 'languageSkills', 'languages',
       'location', 'maintainerId', 'majo

In [7]:
all_data[['work_age', 'workYear']]

Unnamed: 0,work_age,workYear
0,5-10年,13.0
1,3-5年,7.0
2,10年以上,14.0
3,3-5年,3.0
4,3-5年,5.0
...,...,...
81291,5-10年,1.0
81292,10年以上,23.0
81293,10年以上,23.0
81294,3-5年,5.0


## 理清需要用的特征


In [9]:
# 这些特征在sample pipeline的表里，但是现在无法下载
ext_columns = ['school_labels_cv', 'school_labels_jd', 'expectlocation_range_cv', 'location_range_jd', 'degree_index_cv', 'degree_index_jd', 'workyear_range_jd', 'gender_index_cv', 'job_intentstatus_cv']

# 这些特征在原始的cv和jd表里
raw_columns = ['age', 'recruit_number', 'workYear', 'min_annual_salary', 'max_annual_salary', 'pay_months', 'currentSalary', 'desiredSalary']

# 这些特征的过程已经有了，需要进一步拼接
new_columns = ['position_tfidf_pca', 'skills_tfidf_pca', 'jobTracks_tfidf_pca', 'title_category_tags_tfidf_pca', 'description_tfidf_pca', 'requirement_tfidf_pca', 'work_duration_mean', 'equal_words', 'equal_job']

# 将以上的特征合理分散在wide和deep侧
wide_columns = []
deep_columns = []

In [11]:
all_data[['cv_id', 'jd_id']+raw_columns]

Unnamed: 0,cv_id,jd_id,age,recruit_number,workYear,min_annual_salary,max_annual_salary,pay_months,currentSalary,desiredSalary
0,2c9207157bf91042017c0d2beef00470,2c9207157bcbb76b017bcde2b46d04cd,36.0,1,13.0,80.0,150.0,,,
1,2c9207157bf91042017c0d979cbb047d,2c9207157bcbb76b017bf7f2bbbb3153,29.0,1,7.0,28.0,35.0,,,
2,2c9207157bf91042017c7232fbab3be1,2c9207157bf94d99017c108f769405e0,,1,14.0,25.0,35.0,,,
3,2c9207157d308592017d4a94cb1113a1,2c9207157d2f0ec5017d5a9265ba33d9,27.0,1,3.0,60.0,500.0,,,
4,2c9207157dabb0a3017dbca16cdf4c26,2c9207157d631abb017d6eed1c79139f,28.0,1,5.0,,,12.0,保密,25 - 30k · 15薪
...,...,...,...,...,...,...,...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,8a69f6f8823a93b901823d891b761d51,41.0,1,1.0,,,12.0,6000,6千-8千/月
81292,8a69d7c282199453018247bac4a36859,8a69c468826ee9b40182831bedfc5e79,47.0,1,23.0,,,12.0,60+期权,100
81293,8a69d7c282199453018247bac4a36859,8a69f6f8823a93b9018247c30aca4e3d,47.0,1,23.0,,,13.0,60+期权,100
81294,8a69d7db826df6c201827b4338cd60c2,2c92071580dd3ab70180f18714e119c0,30.0,30,5.0,,,14.0,25K,28K


## position_tfidf_pca

In [12]:
# 将这个特征所需的列准备好
position_columns = ['cv_id', 'jd_id', 'currentPosition', 'desiredPosition']
all_data[position_columns]

Unnamed: 0,cv_id,jd_id,currentPosition,desiredPosition
0,2c9207157bf91042017c0d2beef00470,2c9207157bcbb76b017bcde2b46d04cd,副总裁/副总经理,
1,2c9207157bf91042017c0d979cbb047d,2c9207157bcbb76b017bf7f2bbbb3153,数字化营销经理,
2,2c9207157bf91042017c7232fbab3be1,2c9207157bf94d99017c108f769405e0,,
3,2c9207157d308592017d4a94cb1113a1,2c9207157d2f0ec5017d5a9265ba33d9,泰国国家经理,
4,2c9207157dabb0a3017dbca16cdf4c26,2c9207157d631abb017d6eed1c79139f,项目经理,项目经理/主管
...,...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,8a69f6f8823a93b901823d891b761d51,生产工、包装工,生产制造/机械设计/制造/机械设备工程师
81292,8a69d7c282199453018247bac4a36859,8a69c468826ee9b40182831bedfc5e79,高级市场总监,市场/市场/营销/市场营销
81293,8a69d7c282199453018247bac4a36859,8a69f6f8823a93b9018247c30aca4e3d,高级市场总监,市场/市场/营销/市场营销
81294,8a69d7db826df6c201827b4338cd60c2,2c92071580dd3ab70180f18714e119c0,java,技术/后端开发/Java


In [13]:
import json
import jieba
import re
import numpy as np
import pandas as pd
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

'''
1.不论是单列文本还是多列文本都可以通过这个函数统一获取tfidf_pca向量
2.给apply函数传入了参数，免去了global申明
'''

def load_csv_data(data_path):
    '''
    读取csv文件
    '''
    df = pd.read_csv(data_path)
    return df

def col_jieba_fun(series, col_name):
    '''
    将文本字符串切词成列表
    '''
    col = series[col_name]
    #print(col)
    # 加入特例判断 *Tracks。'[{},{}]', json无法解析。
    if col_name.endswith("Tracks"):
        col_list = jieba.lcut(col, cut_all=False)
        return col_list

    # 字符串变列表
    if col.startswith("[") and col.endswith("]"):
        col = json.loads(col)
    else:
        col = re.split(",|，|/| ", col)

    # 列表变字符串
    # 对于中文，进入jieba前不需要添加空格；不过，如果是中英文混合，就必须空格了
    col_str = " ".join(col)

    # 切词
    col_list = jieba.lcut(col_str, cut_all=False)
    return col_list

def col_jieba_filter_fun(series, col_name_jieba):
    '''
    对切词后的列表进行过滤
    '''
    col_list_filter = []
    
    # 得到切词后的文本列表
    col_list = series[col_name_jieba]

    pun_masks_english = [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    pun_masks_chinese = ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
    pun_masks = pun_masks_english + pun_masks_chinese

    # 过滤
    for tag in col_list:
        # 转中文简体
        tag = convert(tag, "zh-hans")
        # 转英文小写
        tag = tag.lower()

        # 过滤数字
        if tag.isdigit():
            continue
        
        # 过滤单个字符
        if len(tag) <= 1:
            continue
        
        # 过滤标点
        flag = 1
        for pun in pun_masks:
            if pun in tag:
                flag = 0
                break
        if flag == 1:
            col_list_filter.append(tag)
    return " ".join(col_list_filter)

def get_tfidf(df, col_name):
    '''
    将文本列转成tfidf向量
    '''
    text = df[col_name]
    
    vectorizer = TfidfVectorizer()
    vector = vectorizer.fit_transform(text)
    return pd.DataFrame(vector.toarray()), vectorizer

def get_tfidf_pca(tfidf, n=20):
    '''
    将tfidf向量降维
    '''
    pca = PCA(n_components=n)
    tfidf_pca = pca.fit_transform(tfidf)
    tfidf_pca = pd.DataFrame(tfidf_pca)
    return tfidf_pca

def col_merge_fun(series, col_name_jieba_filter_list):
    '''
    合并多个文本列
    '''
    merge = ''
    for col in col_name_jieba_filter_list:
        merge = merge + series[col] + ' '
    return merge.strip(' ')

def get_tfidf_pca_from_text_cols(data_path, col_name_list, dimension):
    '''
    从多个文本列计算tfidf_pca

    :param data_path csv数据路径
    :param col_name_list 文本列列名列表
    :param dimension tfidf经过pca降维后的维度
    :returns: tfidf_pca向量
    '''
    # 读取csv文件
    df = load_csv_data(data_path)

    # 存储经过分词和过滤后的列名
    col_name_jieba_filter_list = []

    for col_name in col_name_list:

        col_name_jieba = col_name + '_jieba'
        col_name_jieba_filter = col_name_jieba + '_filter'
        col_name_jieba_filter_list.append(col_name_jieba_filter)

        # step1 空值填充
        df[col_name].fillna('', inplace=True)

        # step2 jieba分词
        df[col_name_jieba] = df.apply(col_jieba_fun, axis=1, args=(col_name, ))

        # step3 分词过滤
        df[col_name_jieba_filter] = df.apply(col_jieba_filter_fun, axis=1, args=(col_name_jieba, ))

        print("\n=================================={}==================================".format(col_name))
        print(df[[col_name, col_name_jieba, col_name_jieba_filter]])

    print(col_name_jieba_filter_list)
    
    merge_col_jieba_filter = "_".join(col_name_list) + '_jieba_filter'
    df[merge_col_jieba_filter] = df.apply(col_merge_fun, axis=1, args=(col_name_jieba_filter_list, ))

    print("\n=================================={}==================================".format('以上各列分词过滤后合并的新列'))
    print(df[[merge_col_jieba_filter]])

    # step4 得到tfidf
    tfidf, vectorizer = get_tfidf(df, merge_col_jieba_filter)
    print("\n=================================={}==================================".format('tfidf向量'))
    print(tfidf)

    # step5 得到tfidf_pca
    tfidf_pca = get_tfidf_pca(tfidf, dimension)
    print("\n=================================={}==================================".format('tfidf_pca向量'))
    print(tfidf_pca)

    return tfidf_pca


if __name__ == "__main__":
    print("running...")

    data_path = '../data/all_sample_20220821_spark.csv'
    dimension = 20
    
    print("\n从文本列获取tfidf_pca向量\n")
    col_name_list = ['title', 'category_name', 'tags']
    col_name_list = ['jobTracks']
    col_name_list = ['projectTracks']
    col_name_list = ['tags']
    col_name_list = ['skills']
    col_name_list = ['currentPosition', 'desiredPosition']

    tfidf_pca = get_tfidf_pca_from_text_cols(data_path, col_name_list, dimension=20)

    print("all is well")

'''
jd可以做3个向量
title + category_name + tags
description
requirement

cv可以做4个向量：
currentPosition + desiredPosition
skills
jobTracks
projectTracks
'''

running...

从文本列获取tfidf_pca向量



  df = pd.read_csv(data_path)
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/b0/f13r09ys4819g6vy91gl1_zr0000gn/T/jieba.cache
Loading model cost 0.457 seconds.
Prefix dict has been built successfully.



      currentPosition currentPosition_jieba currentPosition_jieba_filter
0            副总裁/副总经理        [副总裁,  , 副总经理]                     副总裁 副总经理
1             数字化营销经理         [数字化, 营销, 经理]                    数字化 营销 经理
2                                        []                             
3              泰国国家经理          [泰国, 国家, 经理]                     泰国 国家 经理
4                项目经理                [项目经理]                         项目经理
...               ...                   ...                          ...
81291         生产工、包装工       [生产, 工, 、, 包装工]                       生产 包装工
81292          高级市场总监            [高级, 市场总监]                      高级 市场总监
81293          高级市场总监            [高级, 市场总监]                      高级 市场总监
81294            java                [java]                         java
81295        零售营销部负责人        [零售, 营销部, 负责人]                   零售 营销部 负责人

[81296 rows x 3 columns]

            desiredPosition                   desiredPosition_jieba  \
0                        

'\njd可以做3个向量\ntitle + category_name + tags\ndescription\nrequirement\n\ncv可以做4个向量：\ncurrentPosition + desiredPosition\nskills\njobTracks\nprojectTracks\n'

In [33]:
# 20唯
position_tfidf_pca = tfidf_pca
position_tfidf_pca

# 需要将这个向量变成一个列表
def tfidf_pca_merge_fun(series):
    return list(series)

position_tfidf_pca['position_tfidf_pca'] = position_tfidf_pca.apply(tfidf_pca_merge_fun, axis=1)
position_tfidf_pca['position_tfidf_pca']


0        [-0.03932779682863947, -0.11565172900395955, -...
1        [0.10541808365196882, -0.026397129197533745, -...
2        [-0.03681507687696504, -0.08686209108741907, -...
3        [0.0490419828744077, -0.04122137464599996, -0....
4        [-0.02710200317143146, -0.11320120882895608, -...
                               ...                        
81291    [-0.07459616361823553, -0.0529118767065453, -0...
81292    [-0.024877086226121278, -0.08187562326039885, ...
81293    [-0.024877086226121278, -0.08187562326039885, ...
81294    [-0.38493394737698006, 0.4677351612593016, 0.3...
81295    [0.0449566435757599, -0.2254351027228728, 0.46...
Name: position_tfidf_pca, Length: 81296, dtype: object

In [40]:
# 只拼接position_tfidf_pca列
sample = pd.concat([all_data, position_tfidf_pca[['position_tfidf_pca']]], axis=1)
sample[['cv_id', 'jd_id', 'position_tfidf_pca']]

Unnamed: 0,cv_id,jd_id,position_tfidf_pca
0,2c9207157bf91042017c0d2beef00470,2c9207157bcbb76b017bcde2b46d04cd,"[-0.03932779682863947, -0.11565172900395955, -..."
1,2c9207157bf91042017c0d979cbb047d,2c9207157bcbb76b017bf7f2bbbb3153,"[0.10541808365196882, -0.026397129197533745, -..."
2,2c9207157bf91042017c7232fbab3be1,2c9207157bf94d99017c108f769405e0,"[-0.03681507687696504, -0.08686209108741907, -..."
3,2c9207157d308592017d4a94cb1113a1,2c9207157d2f0ec5017d5a9265ba33d9,"[0.0490419828744077, -0.04122137464599996, -0...."
4,2c9207157dabb0a3017dbca16cdf4c26,2c9207157d631abb017d6eed1c79139f,"[-0.02710200317143146, -0.11320120882895608, -..."
...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,8a69f6f8823a93b901823d891b761d51,"[-0.07459616361823553, -0.0529118767065453, -0..."
81292,8a69d7c282199453018247bac4a36859,8a69c468826ee9b40182831bedfc5e79,"[-0.024877086226121278, -0.08187562326039885, ..."
81293,8a69d7c282199453018247bac4a36859,8a69f6f8823a93b9018247c30aca4e3d,"[-0.024877086226121278, -0.08187562326039885, ..."
81294,8a69d7db826df6c201827b4338cd60c2,2c92071580dd3ab70180f18714e119c0,"[-0.38493394737698006, 0.4677351612593016, 0.3..."


## skills_tfidf_pca

In [42]:
import json
import jieba
import re
import numpy as np
import pandas as pd
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

'''
1.不论是单列文本还是多列文本都可以通过这个函数统一获取tfidf_pca向量
2.给apply函数传入了参数，免去了global申明
'''

def load_csv_data(data_path):
    '''
    读取csv文件
    '''
    df = pd.read_csv(data_path)
    return df

def col_jieba_fun(series, col_name):
    '''
    将文本字符串切词成列表
    '''
    col = series[col_name]
    #print(col)
    # 加入特例判断 *Tracks。'[{},{}]', json无法解析。
    if col_name.endswith("Tracks"):
        col_list = jieba.lcut(col, cut_all=False)
        return col_list

    # 字符串变列表
    if col.startswith("[") and col.endswith("]"):
        col = json.loads(col)
    else:
        col = re.split(",|，|/| ", col)

    # 列表变字符串
    # 对于中文，进入jieba前不需要添加空格；不过，如果是中英文混合，就必须空格了
    col_str = " ".join(col)

    # 切词
    col_list = jieba.lcut(col_str, cut_all=False)
    return col_list

def col_jieba_filter_fun(series, col_name_jieba):
    '''
    对切词后的列表进行过滤
    '''
    col_list_filter = []
    
    # 得到切词后的文本列表
    col_list = series[col_name_jieba]

    pun_masks_english = [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    pun_masks_chinese = ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
    pun_masks = pun_masks_english + pun_masks_chinese

    # 过滤
    for tag in col_list:
        # 转中文简体
        tag = convert(tag, "zh-hans")
        # 转英文小写
        tag = tag.lower()

        # 过滤数字
        if tag.isdigit():
            continue
        
        # 过滤单个字符
        if len(tag) <= 1:
            continue
        
        # 过滤标点
        flag = 1
        for pun in pun_masks:
            if pun in tag:
                flag = 0
                break
        if flag == 1:
            col_list_filter.append(tag)
    return " ".join(col_list_filter)

def get_tfidf(df, col_name):
    '''
    将文本列转成tfidf向量
    '''
    text = df[col_name]
    
    vectorizer = TfidfVectorizer()
    vector = vectorizer.fit_transform(text)
    return pd.DataFrame(vector.toarray()), vectorizer

def get_tfidf_pca(tfidf, n=20):
    '''
    将tfidf向量降维
    '''
    pca = PCA(n_components=n)
    tfidf_pca = pca.fit_transform(tfidf)
    tfidf_pca = pd.DataFrame(tfidf_pca)
    return tfidf_pca

def col_merge_fun(series, col_name_jieba_filter_list):
    '''
    合并多个文本列
    '''
    merge = ''
    for col in col_name_jieba_filter_list:
        merge = merge + series[col] + ' '
    return merge.strip(' ')

def get_tfidf_pca_from_text_cols(data_path, col_name_list, dimension):
    '''
    从多个文本列计算tfidf_pca

    :param data_path csv数据路径
    :param col_name_list 文本列列名列表
    :param dimension tfidf经过pca降维后的维度
    :returns: tfidf_pca向量
    '''
    # 读取csv文件
    df = load_csv_data(data_path)

    # 存储经过分词和过滤后的列名
    col_name_jieba_filter_list = []

    for col_name in col_name_list:

        col_name_jieba = col_name + '_jieba'
        col_name_jieba_filter = col_name_jieba + '_filter'
        col_name_jieba_filter_list.append(col_name_jieba_filter)

        # step1 空值填充
        df[col_name].fillna('', inplace=True)

        # step2 jieba分词
        df[col_name_jieba] = df.apply(col_jieba_fun, axis=1, args=(col_name, ))

        # step3 分词过滤
        df[col_name_jieba_filter] = df.apply(col_jieba_filter_fun, axis=1, args=(col_name_jieba, ))

        print("\n=================================={}==================================".format(col_name))
        print(df[[col_name, col_name_jieba, col_name_jieba_filter]])

    print(col_name_jieba_filter_list)
    
    merge_col_jieba_filter = "_".join(col_name_list) + '_jieba_filter'
    df[merge_col_jieba_filter] = df.apply(col_merge_fun, axis=1, args=(col_name_jieba_filter_list, ))

    print("\n=================================={}==================================".format('以上各列分词过滤后合并的新列'))
    print(df[[merge_col_jieba_filter]])

    # step4 得到tfidf
    tfidf, vectorizer = get_tfidf(df, merge_col_jieba_filter)
    print("\n=================================={}==================================".format('tfidf向量'))
    print(tfidf)

    # step5 得到tfidf_pca
    tfidf_pca = get_tfidf_pca(tfidf, dimension)
    print("\n=================================={}==================================".format('tfidf_pca向量'))
    print(tfidf_pca)

    return tfidf_pca


if __name__ == "__main__":
    print("running...")

    data_path = '../data/all_sample_20220821_spark.csv'
    dimension = 20
    
    print("\n从文本列获取tfidf_pca向量\n")
    col_name_list = ['title', 'category_name', 'tags']
    col_name_list = ['jobTracks']
    col_name_list = ['projectTracks']
    col_name_list = ['tags']
    col_name_list = ['skills']
    #col_name_list = ['currentPosition', 'desiredPosition']

    tfidf_pca = get_tfidf_pca_from_text_cols(data_path, col_name_list, dimension=10)

    print("all is well")

'''
jd可以做3个向量
title + category_name + tags
description
requirement

cv可以做4个向量：
currentPosition + desiredPosition
skills
jobTracks
projectTracks
'''

running...

从文本列获取tfidf_pca向量



  df = pd.read_csv(data_path)



                                                  skills  \
0      ["Ipd","App","Kpi","Crm","View","优化","策划","营销"...   
1      ["H5","C1","Seo","Sem","Top","Kpi","Crm","优化",...   
2      ["Based","Control","Sap Crm","Improve","Indivi...   
3      ["App","Resume","运营","策划","营销","搭建","笔译","舞蹈",...   
4                                               ["智能硬件"]   
...                                                  ...   
81291                                                 电气   
81292    搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写   
81293    搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写   
81294  Bi,Cdn,Elk,Git,Yii,App,Php,Ext,Etl,Java,Sina,H...   
81295  电商,优化,策划,运营,客服,搭建,营销,数据分析,电子商务,运营管理,管理工作,团队建设,...   

                                            skills_jieba  \
0      [Ipd,  , App,  , Kpi,  , Crm,  , View,  , 优化, ...   
1      [H5,  , C1,  , Seo,  , Sem,  , Top,  , Kpi,  ,...   
2      [Based,  , Control,  , Sap,  , Crm,  , Improve...   
3      [App,  , Resume,  , 运营,  , 策划, 

'\njd可以做3个向量\ntitle + category_name + tags\ndescription\nrequirement\n\ncv可以做4个向量：\ncurrentPosition + desiredPosition\nskills\njobTracks\nprojectTracks\n'

In [45]:
# 20唯
skills_tfidf_pca = tfidf_pca
skills_tfidf_pca

# 需要将这个向量变成一个列表
def tfidf_pca_merge_fun(series):
    return list(series)

skills_tfidf_pca['skills_tfidf_pca'] = skills_tfidf_pca.apply(tfidf_pca_merge_fun, axis=1)
skills_tfidf_pca['skills_tfidf_pca']

0        [0.3299248970256854, 0.16474194653819885, 0.18...
1        [0.23650593277306853, 0.11367907127578263, -0....
2        [-0.09153947763696771, -0.14099754811546666, -...
3        [0.14298766341069355, 0.021942123977335692, -0...
4        [-0.09929616983161603, -0.14959755158004803, -...
                               ...                        
81291    [-0.09862740280020853, -0.18200217236689012, -...
81292    [0.015651374677652093, 0.023829744008932712, -...
81293    [0.015651374677652093, 0.023829744008932712, -...
81294    [-0.25296499176235515, 0.15891949257963553, 0....
81295    [0.35421402423961174, 0.08579297978361765, -0....
Name: skills_tfidf_pca, Length: 81296, dtype: object

In [46]:
# 只拼接position_tfidf_pca列
sample = pd.concat([all_data, skills_tfidf_pca[['skills_tfidf_pca']]], axis=1)
sample[['cv_id', 'jd_id', 'skills_tfidf_pca']]

Unnamed: 0,cv_id,jd_id,skills_tfidf_pca
0,2c9207157bf91042017c0d2beef00470,2c9207157bcbb76b017bcde2b46d04cd,"[0.3299248970256854, 0.16474194653819885, 0.18..."
1,2c9207157bf91042017c0d979cbb047d,2c9207157bcbb76b017bf7f2bbbb3153,"[0.23650593277306853, 0.11367907127578263, -0...."
2,2c9207157bf91042017c7232fbab3be1,2c9207157bf94d99017c108f769405e0,"[-0.09153947763696771, -0.14099754811546666, -..."
3,2c9207157d308592017d4a94cb1113a1,2c9207157d2f0ec5017d5a9265ba33d9,"[0.14298766341069355, 0.021942123977335692, -0..."
4,2c9207157dabb0a3017dbca16cdf4c26,2c9207157d631abb017d6eed1c79139f,"[-0.09929616983161603, -0.14959755158004803, -..."
...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,8a69f6f8823a93b901823d891b761d51,"[-0.09862740280020853, -0.18200217236689012, -..."
81292,8a69d7c282199453018247bac4a36859,8a69c468826ee9b40182831bedfc5e79,"[0.015651374677652093, 0.023829744008932712, -..."
81293,8a69d7c282199453018247bac4a36859,8a69f6f8823a93b9018247c30aca4e3d,"[0.015651374677652093, 0.023829744008932712, -..."
81294,8a69d7db826df6c201827b4338cd60c2,2c92071580dd3ab70180f18714e119c0,"[-0.25296499176235515, 0.15891949257963553, 0...."


## jobTracks_tfidf_pca

In [47]:
import json
import jieba
import re
import numpy as np
import pandas as pd
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

'''
1.不论是单列文本还是多列文本都可以通过这个函数统一获取tfidf_pca向量
2.给apply函数传入了参数，免去了global申明
'''

def load_csv_data(data_path):
    '''
    读取csv文件
    '''
    df = pd.read_csv(data_path)
    return df

def col_jieba_fun(series, col_name):
    '''
    将文本字符串切词成列表
    '''
    col = series[col_name]
    #print(col)
    # 加入特例判断 *Tracks。'[{},{}]', json无法解析。
    if col_name.endswith("Tracks"):
        col_list = jieba.lcut(col, cut_all=False)
        return col_list

    # 字符串变列表
    if col.startswith("[") and col.endswith("]"):
        col = json.loads(col)
    else:
        col = re.split(",|，|/| ", col)

    # 列表变字符串
    # 对于中文，进入jieba前不需要添加空格；不过，如果是中英文混合，就必须空格了
    col_str = " ".join(col)

    # 切词
    col_list = jieba.lcut(col_str, cut_all=False)
    return col_list

def col_jieba_filter_fun(series, col_name_jieba):
    '''
    对切词后的列表进行过滤
    '''
    col_list_filter = []
    
    # 得到切词后的文本列表
    col_list = series[col_name_jieba]

    pun_masks_english = [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    pun_masks_chinese = ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
    pun_masks = pun_masks_english + pun_masks_chinese

    # 过滤
    for tag in col_list:
        # 转中文简体
        tag = convert(tag, "zh-hans")
        # 转英文小写
        tag = tag.lower()

        # 过滤数字
        if tag.isdigit():
            continue
        
        # 过滤单个字符
        if len(tag) <= 1:
            continue
        
        # 过滤标点
        flag = 1
        for pun in pun_masks:
            if pun in tag:
                flag = 0
                break
        if flag == 1:
            col_list_filter.append(tag)
    return " ".join(col_list_filter)

def get_tfidf(df, col_name):
    '''
    将文本列转成tfidf向量
    '''
    text = df[col_name]
    
    vectorizer = TfidfVectorizer()
    vector = vectorizer.fit_transform(text)
    return pd.DataFrame(vector.toarray()), vectorizer

def get_tfidf_pca(tfidf, n=20):
    '''
    将tfidf向量降维
    '''
    pca = PCA(n_components=n)
    tfidf_pca = pca.fit_transform(tfidf)
    tfidf_pca = pd.DataFrame(tfidf_pca)
    return tfidf_pca

def col_merge_fun(series, col_name_jieba_filter_list):
    '''
    合并多个文本列
    '''
    merge = ''
    for col in col_name_jieba_filter_list:
        merge = merge + series[col] + ' '
    return merge.strip(' ')

def get_tfidf_pca_from_text_cols(data_path, col_name_list, dimension):
    '''
    从多个文本列计算tfidf_pca

    :param data_path csv数据路径
    :param col_name_list 文本列列名列表
    :param dimension tfidf经过pca降维后的维度
    :returns: tfidf_pca向量
    '''
    # 读取csv文件
    df = load_csv_data(data_path)

    # 存储经过分词和过滤后的列名
    col_name_jieba_filter_list = []

    for col_name in col_name_list:

        col_name_jieba = col_name + '_jieba'
        col_name_jieba_filter = col_name_jieba + '_filter'
        col_name_jieba_filter_list.append(col_name_jieba_filter)

        # step1 空值填充
        df[col_name].fillna('', inplace=True)

        # step2 jieba分词
        df[col_name_jieba] = df.apply(col_jieba_fun, axis=1, args=(col_name, ))

        # step3 分词过滤
        df[col_name_jieba_filter] = df.apply(col_jieba_filter_fun, axis=1, args=(col_name_jieba, ))

        print("\n=================================={}==================================".format(col_name))
        print(df[[col_name, col_name_jieba, col_name_jieba_filter]])

    print(col_name_jieba_filter_list)
    
    merge_col_jieba_filter = "_".join(col_name_list) + '_jieba_filter'
    df[merge_col_jieba_filter] = df.apply(col_merge_fun, axis=1, args=(col_name_jieba_filter_list, ))

    print("\n=================================={}==================================".format('以上各列分词过滤后合并的新列'))
    print(df[[merge_col_jieba_filter]])

    # step4 得到tfidf
    tfidf, vectorizer = get_tfidf(df, merge_col_jieba_filter)
    print("\n=================================={}==================================".format('tfidf向量'))
    print(tfidf)

    # step5 得到tfidf_pca
    tfidf_pca = get_tfidf_pca(tfidf, dimension)
    print("\n=================================={}==================================".format('tfidf_pca向量'))
    print(tfidf_pca)

    return tfidf_pca


if __name__ == "__main__":
    print("running...")

    data_path = '../data/all_sample_20220821_spark.csv'
    dimension = 20
    
    print("\n从文本列获取tfidf_pca向量\n")
    col_name_list = ['title', 'category_name', 'tags']
    
    col_name_list = ['projectTracks']
    col_name_list = ['tags']
    col_name_list = ['skills']
    col_name_list = ['jobTracks']
    #col_name_list = ['currentPosition', 'desiredPosition']

    tfidf_pca = get_tfidf_pca_from_text_cols(data_path, col_name_list, dimension=30)

    print("all is well")

'''
jd可以做3个向量
title + category_name + tags
description
requirement

cv可以做4个向量：
currentPosition + desiredPosition
skills
jobTracks
projectTracks
'''

running...

从文本列获取tfidf_pca向量



  df = pd.read_csv(data_path)


## 数据集划分，剔除复推集