In [4]:
import json
import jieba
import re
import numpy as np
import pandas as pd
from zhconv import convert


cv_columns = ['cv_id', 'currentPosition', 'desiredPosition', 'industry', 'desiredIndustry', 'majorName', 'skills', 'eduTracks', 'jobTracks', 'projectTracks']
jd_columns = ['jd_id', 'title', 'category_name', 'tags', 'description', 'requirement']


def load_csv_data(data_path):
    '''
    读取csv文件
    '''
    df = pd.read_csv(data_path)
    return df

def col_merge_fun(series, col_list):
    '''
    合并多个文本列
    '''
    merge = ''
    for col in col_list:
        merge = merge + series[col] + ' '
    return merge.strip(' ')

def col_jieba_fun(series, col_name):
    '''
    将文本字符串切词成列表
    '''
    text = series[col_name]
    
    # 切词，精确模式
    col_list = jieba.lcut(text, cut_all=False)
    return col_list

def col_jieba_filter_fun(series, col_name_jieba):
    '''
    对切词后的列表进行过滤
    '''
    col_list_filter = []
    
    # 得到切词后的文本列表
    col_list = series[col_name_jieba]

    pun_masks_english = [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    pun_masks_chinese = ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
    pun_masks = pun_masks_english + pun_masks_chinese

    # 过滤
    for tag in col_list:
        # 转中文简体
        tag = convert(tag, "zh-hans")
        # 转英文小写
        tag = tag.lower()

        # 过滤数字
        if tag.isdigit():
            continue
        
        # 过滤单个字符
        if len(tag) <= 1:
            continue
        
        # 过滤标点
        flag = 1
        for pun in pun_masks:
            if pun in tag:
                flag = 0
                break
        if flag == 1:
            col_list_filter.append(tag)
    return col_list_filter

def get_text_jieba_filter(data_path):
    '''
    给定csv数据路径，分别将cv和jd的文本列合并、分词、过滤
    '''
    all_data = load_csv_data(data_path).iloc[:]

    # 空值填充
    for col in cv_columns[1:]:
        all_data[col].fillna('', inplace=True)
    for col in jd_columns[1:]:
        all_data[col].fillna('', inplace=True)

    cv_jd = ['cv', 'jd']
    cv_jd_columns = [cv_columns, jd_columns]

    for col, col_columns in zip(cv_jd, cv_jd_columns):
        col_text = col + '_text'
        col_text_jieba = col_text + '_jieba'
        col_text_jieba_filter = col_text_jieba + '_filter'
        
        all_data[col_text] = all_data.apply(col_merge_fun, axis=1, args=(col_columns[1:], ))
        all_data[col_text_jieba] = all_data.apply(col_jieba_fun, axis=1, args=(col_text, ))
        all_data[col_text_jieba_filter] = all_data.apply(col_jieba_filter_fun, axis=1, args=(col_text_jieba, ))

    return all_data

def get_equal_word_num(series, col_list):

    pass

if __name__ == "__main__":
    print("running...")

    data_path = '../data/all_sample_20220821_spark.csv'
    all_data = get_text_jieba_filter(data_path)


    print("all is well!")





running...


  df = pd.read_csv(data_path)


KeyboardInterrupt: 

In [3]:
all_data[['cv_text_jieba_filter', 'jd_text_jieba_filter']]

Unnamed: 0,cv_text_jieba_filter,jd_text_jieba_filter
0,"[副总裁, 副总经理, 国际, 经济, 贸易, ipd, app, kpi, crm, vi...","[市场营销, 负责人, 市场营销, 营销, 管理, 销售, 战略规划, 管理工作, 岗位职责..."
1,"[数字化, 营销, 经理, 餐饮业, 电气工程, 及其, 自动化, h5, c1, seo,...","[会员, 管理, 经理, 用户, 运营, 会员, 运营, 会员, 管理, 管理, 营销, 社..."
2,"[marketing, based, control, sap, crm, improve,...","[产品, 市场, 经理, 产品, 经理, excel, office, ppt, ui, s..."
3,"[泰国, 国家, 经理, 企业, 管理, app, resume, 运营, 策划, 营销, ...","[海外, 区域, 运营, 经理, 高级, 经理, 运营, 管理, 运营, 团队, 管理, 电..."
4,"[项目经理, 项目经理, 主管, 互联网, 智能, 硬件, 互联网, 英语, 智能, 硬件,...","[高级, 项目经理, 高级, 管理, 项目管理, 推广, 归档, 风险, 评估, 交货, 负..."
...,...,...
95,"[二次元, 游戏, 模块, 运营, 负责人, 平台, 运营, 游戏, 运营, 工商管理, a...","[高级, 资深, 平台, 产品, 运营, 游戏, 方向, 产品, 运营, 游戏, 运营, 平..."
96,"[二次元, 游戏, 模块, 运营, 负责人, 平台, 运营, 游戏, 运营, 工商管理, a...","[品牌, 营销, 专家, 运营, 品牌, 运营, 游戏, 品牌, 营销, 营销, bi, 负..."
97,"[创意, 专家, 视觉, 传达, group, resume, 营销, 功能, 策划, 街舞...","[品牌, 创意, 总监, 品牌, 公关, 管理, 沟通, 能力, 销售, 营销, 品牌, 推..."
98,"[上海证券交易所, 技术, 总监, 通信, 信息系统, ui, app, erp, saas...","[功能, 安全, 工程师, 技术, 芯片, 功能, 安全, 测试, 审核, 项目管理, 架构..."


In [1]:
import re
import json
import numpy as np
import pandas as pd
import jieba
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [2]:
# 读入20220821所有数据
all_data = pd.read_csv('../data/all_sample_20220821_spark.csv').drop(['Unnamed: 0'], axis=1)

  all_data = pd.read_csv('../data/all_sample_20220821_spark.csv').drop(['Unnamed: 0'], axis=1)


In [3]:
all_data.columns

Index(['cv_id', 'jd_id', 'jd_code', 'title', 'deadline', 'category_id',
       'category_name', 'company_id', 'company_name', 'recruit_number',
       'work_age', 'min_annual_salary', 'max_annual_salary',
       'min_month_salary', 'max_month_salary', 'pay_months', 'created_by_name',
       'created_by', 'created_at', 'updated_by', 'updated_at', 'tenant_id',
       'guarantee_period', 'status', 'recommended_rcn', 'tags', 'workplaces',
       'description', 'school', 'degree', 'post_level_id', 'requirement', 'ds',
       '_id', 'additionInfo', 'age', 'certificates', 'code', 'competenceScore',
       'completeness', 'countryCode', 'createdAt', 'createdBy',
       'currentCompany', 'currentPosition', 'currentSalary', 'dateOfBirth',
       'degree.1', 'desiredIndustry', 'desiredPosition', 'desiredSalary',
       'dutyTime', 'expectLocation', 'gender', 'importType', 'industry',
       'interviewTime', 'jobIntentStatus', 'languageSkills', 'languages',
       'location', 'maintainerId', 'majo

黄金数据集数值特征
* EqualWord cv文本和jd文本中，重合的关键词数目
* work_duration cv平均每份工作的平均时间
* salary 

深度模型缺失值的填充！

‘CVs’: 包括CV中recommendReason、cvKeyword、jobTracks的三类文本

‘JDs’: 包括JD中jdKeyword、description、requirement的三类文本

EqualWord: CVs文本和JDs文本中，重合的关键词数目 （表示匹配程度，用于wide）

In [8]:
# 这里是cv和jd可用的文本类特征了
cv_columns = ['cv_id', 'currentPosition', 'desiredPosition', 'industry', 'desiredIndustry', 'majorName', 'skills', 'eduTracks', 'jobTracks', 'projectTracks']
jd_columns = ['jd_id', 'title', 'category_name', 'tags', 'description', 'requirement']

对于上述cv文本字段的一些看法:
* industry和desiredIndustry的缺失值比较严重
* majorName感觉字段错位，里面有很多学校信息.建议重新入库。 

In [9]:
cv_data = all_data[cv_columns]
cv_data

Unnamed: 0,cv_id,currentPosition,desiredPosition,industry,desiredIndustry,majorName,skills,eduTracks,jobTracks,projectTracks
0,2c9207157bf91042017c0d2beef00470,副总裁/副总经理,,,,国际经济与贸易,"[""Ipd"",""App"",""Kpi"",""Crm"",""View"",""优化"",""策划"",""营销""...","['{""degree"": ""本科"", ""endDate"": ""2008-07"", ""id"":...","['{""companyName"": ""深圳市亿科数字科技有限公司"", ""descriptio...","['{""companyName"": ""*深圳市蜂联科技有限公司（奇虎360）"", ""desc..."
1,2c9207157bf91042017c0d979cbb047d,数字化营销经理,,餐饮业,,电气工程及其自动化,"[""H5"",""C1"",""Seo"",""Sem"",""Top"",""Kpi"",""Crm"",""优化"",...","['{""degree"": ""本科"", ""endDate"": ""2014-06"", ""id"":...","['{""companyName"": ""北京美餐好客科技有限公司"", ""companyNatu...","['{""description"": ""项目描述: 母婴类人群，包含童装、童鞋、奶粉。\\n运..."
2,2c9207157bf91042017c7232fbab3be1,,,,,Marketing,"[""Based"",""Control"",""Sap Crm"",""Improve"",""Indivi...","['{""degree"": ""硕士"", ""endDate"": ""2012-09"", ""id"":...","['{""companyName"": ""Continental Tires (China) L...",
3,2c9207157d308592017d4a94cb1113a1,泰国国家经理,,,,企业管理,"[""App"",""Resume"",""运营"",""策划"",""营销"",""搭建"",""笔译"",""舞蹈"",...","['{""degree"": ""硕士"", ""endDate"": ""2019-06"", ""id"":...","['{""companyName"": ""北京星制科技有限公司（全民快乐）"", ""descrip...","['{""description"": ""主播表现，直播平均 2 场，平台新用户 12926，主..."
4,2c9207157dabb0a3017dbca16cdf4c26,项目经理,项目经理/主管,互联网+,"智能硬件,互联网+",英语,"[""智能硬件""]","['{""degree"": ""硕士"", ""endDate"": ""2016-07"", ""id"":...","['{""companyName"": ""酷狗音乐"", ""companyTags"": ""互联网+...","['{""companyName"": ""酷狗音乐"", ""description"": ""酷狗20..."
...,...,...,...,...,...,...,...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,生产工、包装工,生产制造/机械设计/制造/机械设备工程师,,,无,电气,"['{""endDate"": ""2022-07"", ""unified"": false, ""de...","['{""endDate"": ""2022-07"", ""companyName"": ""江门市新会...","['{""name"": null}']"
81292,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']"
81293,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']"
81294,8a69d7db826df6c201827b4338cd60c2,java,技术/后端开发/Java,,,太原理工大学,"Bi,Cdn,Elk,Git,Yii,App,Php,Ext,Etl,Java,Sina,H...","['{""endDate"": ""2015-06"", ""unified"": false, ""de...","['{""endDate"": ""2020-07"", ""companyName"": ""金山云"",...","['{""name"": null}']"


In [10]:
jd_data = all_data[jd_columns]
jd_data

Unnamed: 0,jd_id,title,category_name,tags,description,requirement
0,2c9207157bcbb76b017bcde2b46d04cd,市场营销负责人,市场营销,"[""营销"",""管理"",""销售"",""战略规划"",""管理工作""]",1、岗位职责:2、主导制定公司营销战略规划并参与公司整体战略规划制定。3、全面管理公司的市场...,1、电商2、线下营销
1,2c9207157bcbb76b017bf7f2bbbb3153,会员管理经理,用户运营,"[""会员运营"",""会员管理"",""管理"",""营销"",""社群运营""]",1、岗位职责：2、1. 负责“九木杂物社”品牌全渠道会员体系的管理，规划品牌会员运营方向，包...,1、有会员管理、会员活动经验
2,2c9207157bf94d99017c108f769405e0,产品市场经理,产品经理,"[""EXCEL"",""OFFICE"",""PPT"",""UI""]",1、•Support regional strategic planning and mar...,1、find jd
3,2c9207157d2f0ec5017d5a9265ba33d9,海外区域运营经理/高级经理,运营,"[""管理"",""运营"",""团队管理"",""电商"",""游戏""]",1、负责短视频业务的运营策略制定，配合完成公司的营收业务指标；2、主导团队的业务方向，包括但...,1、本科及以上学历优先，3年以上社交、直播、游戏、电商或相关行业运营工作经验；2、英语听说读...
4,2c9207157d631abb017d6eed1c79139f,高级项目经理,高级管理,"[""项目管理"",""推广"",""归档"",""风险评估"",""交货""]",1、负责从产品建立到第一批生产交付的全过程项目管理;2、 负责实现产品要求、进度、成本、质量...,1、 本科及以上学历，计算机科学或相关专业。能用英语交流者优先;2、5年以上可穿戴相关产品项...
...,...,...,...,...,...,...
81291,8a69f6f8823a93b901823d891b761d51,技术经理,机械设备工程师,"[""材料"",""考核"",""维修"",""技术标准"",""不锈钢""]",职责描述：1、全面负责公司技术标准制、各岗位的工艺制定跟改善；2、重要设备参数收集管控包各棍...,任职要求：1、具备压延400系不锈钢材料技术相关工作经验，能对水扎四六棍扎异常原因分析解决、...
81292,8a69c468826ee9b40182831bedfc5e79,中央市场总监,市场总监,"[""市场推广"",""医疗器械"",""语言"",""材料"",""营销""]",1. 把握行业市场发展趋势，分析全球市场动态，拟定公司产品市场发展战略规划；2. 结合公司战...,1. 本科及以上学历，医学、医疗器械相关专业者优先；2. 10年及以上市场工作经验，5年以上...
81293,8a69f6f8823a93b9018247c30aca4e3d,中央市场总监,市场/营销,"[""市场推广"",""医疗器械"",""语言"",""材料"",""营销""]",1. 把握行业市场发展趋势，分析全球市场动态，拟定公司产品市场发展战略规划；2. 结合公司战...,1. 本科及以上学历，医学、医疗器械相关专业者优先；2. 10年及以上市场工作经验，5年以上...
81294,2c92071580dd3ab70180f18714e119c0,java开发工程师（不限方向）,Java,"[""java"",""微服务/性能优化"",""SQL"",""架构"",""需求分析""]",1、负责公司OA产品的功能规划、需求分析设计、技术实现和用户体验；2、负责所属模块的代码开发...,1、本科及以上学历，2年以上的Java开发及应用经验，计算机或者相关专业;2、良好的问题理解...


In [None]:
# 把下列text字段拼接起来，统一经过分词处理，得到两个list
cv_columns = ['cv_id', 'currentPosition', 'desiredPosition', 'industry', 'desiredIndustry', 'majorName', 'skills', 'eduTracks', 'jobTracks', 'projectTracks']
jd_columns = ['jd_id', 'title', 'category_name', 'tags', 'description', 'requirement']

In [24]:
type(all_data['desiredIndustry'][10])

float

## 先处理cv的文本列

In [26]:
# step1 对cv的各文本列做空值填充
cv_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81296 entries, 0 to 81295
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cv_id            81296 non-null  object
 1   currentPosition  78952 non-null  object
 2   desiredPosition  51408 non-null  object
 3   industry         15671 non-null  object
 4   desiredIndustry  9345 non-null   object
 5   majorName        80395 non-null  object
 6   skills           74527 non-null  object
 7   eduTracks        81293 non-null  object
 8   jobTracks        80324 non-null  object
 9   projectTracks    55283 non-null  object
dtypes: object(10)
memory usage: 6.2+ MB


In [30]:
for col in cv_columns[1:]:
    #print(col)
    cv_data[col].fillna('', inplace=True)  

cv_data.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_data[col].fillna('', inplace=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81296 entries, 0 to 81295
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cv_id            81296 non-null  object
 1   currentPosition  81296 non-null  object
 2   desiredPosition  81296 non-null  object
 3   industry         81296 non-null  object
 4   desiredIndustry  81296 non-null  object
 5   majorName        81296 non-null  object
 6   skills           81296 non-null  object
 7   eduTracks        81296 non-null  object
 8   jobTracks        81296 non-null  object
 9   projectTracks    81296 non-null  object
dtypes: object(10)
memory usage: 6.2+ MB


In [31]:
# step2 将cv的文本列拼接
def col_merge_fun(series, col_list):
    '''
    合并多个文本列
    '''
    merge = ''
    for col in col_list:
        merge = merge + series[col] + ' '
    return merge.strip(' ')

cv_data['cv_text'] = cv_data.apply(col_merge_fun, axis=1, args=(cv_columns[1:], ))
cv_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_data['cv_text'] = cv_data.apply(col_merge_fun, axis=1, args=(cv_columns[1:], ))


Unnamed: 0,cv_id,currentPosition,desiredPosition,industry,desiredIndustry,majorName,skills,eduTracks,jobTracks,projectTracks,cv_text
0,2c9207157bf91042017c0d2beef00470,副总裁/副总经理,,,,国际经济与贸易,"[""Ipd"",""App"",""Kpi"",""Crm"",""View"",""优化"",""策划"",""营销""...","['{""degree"": ""本科"", ""endDate"": ""2008-07"", ""id"":...","['{""companyName"": ""深圳市亿科数字科技有限公司"", ""descriptio...","['{""companyName"": ""*深圳市蜂联科技有限公司（奇虎360）"", ""desc...","副总裁/副总经理 国际经济与贸易 [""Ipd"",""App"",""Kpi"",""Crm"",""..."
1,2c9207157bf91042017c0d979cbb047d,数字化营销经理,,餐饮业,,电气工程及其自动化,"[""H5"",""C1"",""Seo"",""Sem"",""Top"",""Kpi"",""Crm"",""优化"",...","['{""degree"": ""本科"", ""endDate"": ""2014-06"", ""id"":...","['{""companyName"": ""北京美餐好客科技有限公司"", ""companyNatu...","['{""description"": ""项目描述: 母婴类人群，包含童装、童鞋、奶粉。\\n运...","数字化营销经理 餐饮业 电气工程及其自动化 [""H5"",""C1"",""Seo"",""Sem""..."
2,2c9207157bf91042017c7232fbab3be1,,,,,Marketing,"[""Based"",""Control"",""Sap Crm"",""Improve"",""Indivi...","['{""degree"": ""硕士"", ""endDate"": ""2012-09"", ""id"":...","['{""companyName"": ""Continental Tires (China) L...",,"Marketing [""Based"",""Control"",""Sap Crm"",""Improv..."
3,2c9207157d308592017d4a94cb1113a1,泰国国家经理,,,,企业管理,"[""App"",""Resume"",""运营"",""策划"",""营销"",""搭建"",""笔译"",""舞蹈"",...","['{""degree"": ""硕士"", ""endDate"": ""2019-06"", ""id"":...","['{""companyName"": ""北京星制科技有限公司（全民快乐）"", ""descrip...","['{""description"": ""主播表现，直播平均 2 场，平台新用户 12926，主...","泰国国家经理 企业管理 [""App"",""Resume"",""运营"",""策划"",""营销"",..."
4,2c9207157dabb0a3017dbca16cdf4c26,项目经理,项目经理/主管,互联网+,"智能硬件,互联网+",英语,"[""智能硬件""]","['{""degree"": ""硕士"", ""endDate"": ""2016-07"", ""id"":...","['{""companyName"": ""酷狗音乐"", ""companyTags"": ""互联网+...","['{""companyName"": ""酷狗音乐"", ""description"": ""酷狗20...","项目经理 项目经理/主管 互联网+ 智能硬件,互联网+ 英语 [""智能硬件""] ['{""de..."
...,...,...,...,...,...,...,...,...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,生产工、包装工,生产制造/机械设计/制造/机械设备工程师,,,无,电气,"['{""endDate"": ""2022-07"", ""unified"": false, ""de...","['{""endDate"": ""2022-07"", ""companyName"": ""江门市新会...","['{""name"": null}']","生产工、包装工 生产制造/机械设计/制造/机械设备工程师 无 电气 ['{""endDat..."
81292,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']","高级市场总监 市场/市场/营销/市场营销 上海大学 搭建,翻译,功能,医学,数据库,Ed..."
81293,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']","高级市场总监 市场/市场/营销/市场营销 上海大学 搭建,翻译,功能,医学,数据库,Ed..."
81294,8a69d7db826df6c201827b4338cd60c2,java,技术/后端开发/Java,,,太原理工大学,"Bi,Cdn,Elk,Git,Yii,App,Php,Ext,Etl,Java,Sina,H...","['{""endDate"": ""2015-06"", ""unified"": false, ""de...","['{""endDate"": ""2020-07"", ""companyName"": ""金山云"",...","['{""name"": null}']","java 技术/后端开发/Java 太原理工大学 Bi,Cdn,Elk,Git,Yii,..."


In [33]:
# step3 对cv_text列进行分词
def col_jieba_fun(series, col_name):
    '''
    将文本字符串切词成列表
    '''
    text = series[col_name]
    
    # 切词，精确模式
    col_list = jieba.lcut(text, cut_all=False)
    return col_list

# 比较耗时
cv_data['cv_text_jieba'] = cv_data.apply(col_jieba_fun, axis=1, args=('cv_text', ))
cv_data

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/99/wft1658s2mv39cv4wj_rcpqh0000gp/T/jieba.cache
Loading model cost 0.881 seconds.
Prefix dict has been built successfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_data['cv_text_jieba'] = cv_data.apply(col_jieba_fun, axis=1, args=('cv_text', ))


Unnamed: 0,cv_id,currentPosition,desiredPosition,industry,desiredIndustry,majorName,skills,eduTracks,jobTracks,projectTracks,cv_text,cv_text_jieba
0,2c9207157bf91042017c0d2beef00470,副总裁/副总经理,,,,国际经济与贸易,"[""Ipd"",""App"",""Kpi"",""Crm"",""View"",""优化"",""策划"",""营销""...","['{""degree"": ""本科"", ""endDate"": ""2008-07"", ""id"":...","['{""companyName"": ""深圳市亿科数字科技有限公司"", ""descriptio...","['{""companyName"": ""*深圳市蜂联科技有限公司（奇虎360）"", ""desc...","副总裁/副总经理 国际经济与贸易 [""Ipd"",""App"",""Kpi"",""Crm"",""...","[副总裁, /, 副总经理, , , , , 国际, 经济, 与, 贸易, , [..."
1,2c9207157bf91042017c0d979cbb047d,数字化营销经理,,餐饮业,,电气工程及其自动化,"[""H5"",""C1"",""Seo"",""Sem"",""Top"",""Kpi"",""Crm"",""优化"",...","['{""degree"": ""本科"", ""endDate"": ""2014-06"", ""id"":...","['{""companyName"": ""北京美餐好客科技有限公司"", ""companyNatu...","['{""description"": ""项目描述: 母婴类人群，包含童装、童鞋、奶粉。\\n运...","数字化营销经理 餐饮业 电气工程及其自动化 [""H5"",""C1"",""Seo"",""Sem""...","[数字化, 营销, 经理, , , 餐饮业, , , 电气工程, 及其, 自动化, ..."
2,2c9207157bf91042017c7232fbab3be1,,,,,Marketing,"[""Based"",""Control"",""Sap Crm"",""Improve"",""Indivi...","['{""degree"": ""硕士"", ""endDate"": ""2012-09"", ""id"":...","['{""companyName"": ""Continental Tires (China) L...",,"Marketing [""Based"",""Control"",""Sap Crm"",""Improv...","[Marketing, , [, "", Based, "", ,, "", Control, ..."
3,2c9207157d308592017d4a94cb1113a1,泰国国家经理,,,,企业管理,"[""App"",""Resume"",""运营"",""策划"",""营销"",""搭建"",""笔译"",""舞蹈"",...","['{""degree"": ""硕士"", ""endDate"": ""2019-06"", ""id"":...","['{""companyName"": ""北京星制科技有限公司（全民快乐）"", ""descrip...","['{""description"": ""主播表现，直播平均 2 场，平台新用户 12926，主...","泰国国家经理 企业管理 [""App"",""Resume"",""运营"",""策划"",""营销"",...","[泰国, 国家, 经理, , , , , 企业, 管理, , [, "", App,..."
4,2c9207157dabb0a3017dbca16cdf4c26,项目经理,项目经理/主管,互联网+,"智能硬件,互联网+",英语,"[""智能硬件""]","['{""degree"": ""硕士"", ""endDate"": ""2016-07"", ""id"":...","['{""companyName"": ""酷狗音乐"", ""companyTags"": ""互联网+...","['{""companyName"": ""酷狗音乐"", ""description"": ""酷狗20...","项目经理 项目经理/主管 互联网+ 智能硬件,互联网+ 英语 [""智能硬件""] ['{""de...","[项目经理, , 项目经理, /, 主管, , 互联网, +, , 智能, 硬件, ,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,生产工、包装工,生产制造/机械设计/制造/机械设备工程师,,,无,电气,"['{""endDate"": ""2022-07"", ""unified"": false, ""de...","['{""endDate"": ""2022-07"", ""companyName"": ""江门市新会...","['{""name"": null}']","生产工、包装工 生产制造/机械设计/制造/机械设备工程师 无 电气 ['{""endDat...","[生产, 工, 、, 包装工, , 生产, 制造, /, 机械设计, /, 制造, /, ..."
81292,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']","高级市场总监 市场/市场/营销/市场营销 上海大学 搭建,翻译,功能,医学,数据库,Ed...","[高级, 市场总监, , 市场, /, 市场, /, 营销, /, 市场营销, , ,..."
81293,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']","高级市场总监 市场/市场/营销/市场营销 上海大学 搭建,翻译,功能,医学,数据库,Ed...","[高级, 市场总监, , 市场, /, 市场, /, 营销, /, 市场营销, , ,..."
81294,8a69d7db826df6c201827b4338cd60c2,java,技术/后端开发/Java,,,太原理工大学,"Bi,Cdn,Elk,Git,Yii,App,Php,Ext,Etl,Java,Sina,H...","['{""endDate"": ""2015-06"", ""unified"": false, ""de...","['{""endDate"": ""2020-07"", ""companyName"": ""金山云"",...","['{""name"": null}']","java 技术/后端开发/Java 太原理工大学 Bi,Cdn,Elk,Git,Yii,...","[java, , 技术, /, 后, 端, 开发, /, Java, , , , 太..."


In [36]:
cv_data['cv_text_jieba']

0        [副总裁, /, 副总经理,  ,  ,  ,  , 国际, 经济, 与, 贸易,  , [...
1        [数字化, 营销, 经理,  ,  , 餐饮业,  ,  , 电气工程, 及其, 自动化, ...
2        [Marketing,  , [, ", Based, ", ,, ", Control, ...
3        [泰国, 国家, 经理,  ,  ,  ,  , 企业, 管理,  , [, ", App,...
4        [项目经理,  , 项目经理, /, 主管,  , 互联网, +,  , 智能, 硬件, ,...
                               ...                        
81291    [生产, 工, 、, 包装工,  , 生产, 制造, /, 机械设计, /, 制造, /, ...
81292    [高级, 市场总监,  , 市场, /, 市场, /, 营销, /, 市场营销,  ,  ,...
81293    [高级, 市场总监,  , 市场, /, 市场, /, 营销, /, 市场营销,  ,  ,...
81294    [java,  , 技术, /, 后, 端, 开发, /, Java,  ,  ,  , 太...
81295    [零售, 营销部, 负责人,  , 运营, /, 运营, /, 电商, 运营,  ,  , ...
Name: cv_text_jieba, Length: 81296, dtype: object

In [37]:
# step4 对cv_text_jieba列进行过滤
def col_jieba_filter_fun(series, col_name_jieba):
    '''
    对切词后的列表进行过滤
    '''
    col_list_filter = []
    
    # 得到切词后的文本列表
    col_list = series[col_name_jieba]

    pun_masks_english = [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    pun_masks_chinese = ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
    pun_masks = pun_masks_english + pun_masks_chinese

    # 过滤
    for tag in col_list:
        # 转中文简体
        tag = convert(tag, "zh-hans")
        # 转英文小写
        tag = tag.lower()

        # 过滤数字
        if tag.isdigit():
            continue
        
        # 过滤单个字符
        if len(tag) <= 1:
            continue
        
        # 过滤标点
        flag = 1
        for pun in pun_masks:
            if pun in tag:
                flag = 0
                break
        if flag == 1:
            col_list_filter.append(tag)
    return col_list_filter

cv_data['cv_text_jieba_filter'] = cv_data.apply(col_jieba_filter_fun, axis=1, args=('cv_text_jieba', ))
cv_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_data['cv_text_jieba_filter'] = cv_data.apply(col_jieba_filter_fun, axis=1, args=('cv_text_jieba', ))


Unnamed: 0,cv_id,currentPosition,desiredPosition,industry,desiredIndustry,majorName,skills,eduTracks,jobTracks,projectTracks,cv_text,cv_text_jieba,cv_text_jieba_filter
0,2c9207157bf91042017c0d2beef00470,副总裁/副总经理,,,,国际经济与贸易,"[""Ipd"",""App"",""Kpi"",""Crm"",""View"",""优化"",""策划"",""营销""...","['{""degree"": ""本科"", ""endDate"": ""2008-07"", ""id"":...","['{""companyName"": ""深圳市亿科数字科技有限公司"", ""descriptio...","['{""companyName"": ""*深圳市蜂联科技有限公司（奇虎360）"", ""desc...","副总裁/副总经理 国际经济与贸易 [""Ipd"",""App"",""Kpi"",""Crm"",""...","[副总裁, /, 副总经理, , , , , 国际, 经济, 与, 贸易, , [...","[副总裁, 副总经理, 国际, 经济, 贸易, ipd, app, kpi, crm, vi..."
1,2c9207157bf91042017c0d979cbb047d,数字化营销经理,,餐饮业,,电气工程及其自动化,"[""H5"",""C1"",""Seo"",""Sem"",""Top"",""Kpi"",""Crm"",""优化"",...","['{""degree"": ""本科"", ""endDate"": ""2014-06"", ""id"":...","['{""companyName"": ""北京美餐好客科技有限公司"", ""companyNatu...","['{""description"": ""项目描述: 母婴类人群，包含童装、童鞋、奶粉。\\n运...","数字化营销经理 餐饮业 电气工程及其自动化 [""H5"",""C1"",""Seo"",""Sem""...","[数字化, 营销, 经理, , , 餐饮业, , , 电气工程, 及其, 自动化, ...","[数字化, 营销, 经理, 餐饮业, 电气工程, 及其, 自动化, h5, c1, seo,..."
2,2c9207157bf91042017c7232fbab3be1,,,,,Marketing,"[""Based"",""Control"",""Sap Crm"",""Improve"",""Indivi...","['{""degree"": ""硕士"", ""endDate"": ""2012-09"", ""id"":...","['{""companyName"": ""Continental Tires (China) L...",,"Marketing [""Based"",""Control"",""Sap Crm"",""Improv...","[Marketing, , [, "", Based, "", ,, "", Control, ...","[marketing, based, control, sap, crm, improve,..."
3,2c9207157d308592017d4a94cb1113a1,泰国国家经理,,,,企业管理,"[""App"",""Resume"",""运营"",""策划"",""营销"",""搭建"",""笔译"",""舞蹈"",...","['{""degree"": ""硕士"", ""endDate"": ""2019-06"", ""id"":...","['{""companyName"": ""北京星制科技有限公司（全民快乐）"", ""descrip...","['{""description"": ""主播表现，直播平均 2 场，平台新用户 12926，主...","泰国国家经理 企业管理 [""App"",""Resume"",""运营"",""策划"",""营销"",...","[泰国, 国家, 经理, , , , , 企业, 管理, , [, "", App,...","[泰国, 国家, 经理, 企业, 管理, app, resume, 运营, 策划, 营销, ..."
4,2c9207157dabb0a3017dbca16cdf4c26,项目经理,项目经理/主管,互联网+,"智能硬件,互联网+",英语,"[""智能硬件""]","['{""degree"": ""硕士"", ""endDate"": ""2016-07"", ""id"":...","['{""companyName"": ""酷狗音乐"", ""companyTags"": ""互联网+...","['{""companyName"": ""酷狗音乐"", ""description"": ""酷狗20...","项目经理 项目经理/主管 互联网+ 智能硬件,互联网+ 英语 [""智能硬件""] ['{""de...","[项目经理, , 项目经理, /, 主管, , 互联网, +, , 智能, 硬件, ,...","[项目经理, 项目经理, 主管, 互联网, 智能, 硬件, 互联网, 英语, 智能, 硬件,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
81291,8a69d7c28219945301823d8a08d14e70,生产工、包装工,生产制造/机械设计/制造/机械设备工程师,,,无,电气,"['{""endDate"": ""2022-07"", ""unified"": false, ""de...","['{""endDate"": ""2022-07"", ""companyName"": ""江门市新会...","['{""name"": null}']","生产工、包装工 生产制造/机械设计/制造/机械设备工程师 无 电气 ['{""endDat...","[生产, 工, 、, 包装工, , 生产, 制造, /, 机械设计, /, 制造, /, ...","[生产, 包装工, 生产, 制造, 机械设计, 制造, 机械设备, 工程师, 电气, end..."
81292,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']","高级市场总监 市场/市场/营销/市场营销 上海大学 搭建,翻译,功能,医学,数据库,Ed...","[高级, 市场总监, , 市场, /, 市场, /, 营销, /, 市场营销, , ,...","[高级, 市场总监, 市场, 市场, 营销, 市场营销, 上海大学, 搭建, 翻译, 功能,..."
81293,8a69d7c282199453018247bac4a36859,高级市场总监,市场/市场/营销/市场营销,,,上海大学,"搭建,翻译,功能,医学,数据库,Education,产品管理,产品经理,医疗器械,英语听说读写","['{""schoolTag"": ""211"", ""endDate"": ""1999-07"", ""...","['{""endDate"": ""2022-07"", ""companyName"": ""上海微创医...","['{""name"": null}']","高级市场总监 市场/市场/营销/市场营销 上海大学 搭建,翻译,功能,医学,数据库,Ed...","[高级, 市场总监, , 市场, /, 市场, /, 营销, /, 市场营销, , ,...","[高级, 市场总监, 市场, 市场, 营销, 市场营销, 上海大学, 搭建, 翻译, 功能,..."
81294,8a69d7db826df6c201827b4338cd60c2,java,技术/后端开发/Java,,,太原理工大学,"Bi,Cdn,Elk,Git,Yii,App,Php,Ext,Etl,Java,Sina,H...","['{""endDate"": ""2015-06"", ""unified"": false, ""de...","['{""endDate"": ""2020-07"", ""companyName"": ""金山云"",...","['{""name"": null}']","java 技术/后端开发/Java 太原理工大学 Bi,Cdn,Elk,Git,Yii,...","[java, , 技术, /, 后, 端, 开发, /, Java, , , , 太...","[java, 技术, 开发, java, 太原, 理工大学, bi, cdn, elk, g..."


In [39]:
cv_data['cv_text_jieba_filter'][0]

['副总裁',
 '副总经理',
 '国际',
 '经济',
 '贸易',
 'ipd',
 'app',
 'kpi',
 'crm',
 'view',
 '优化',
 '策划',
 '营销',
 '运营',
 '搭建',
 '电商',
 '前端',
 '架构',
 '财务',
 '客服',
 '迭代',
 '数据',
 '营销策划',
 '需求',
 '管理',
 '网络设备',
 '电子商务',
 '合同',
 '管理',
 '成本',
 '分析',
 '运营',
 '管理',
 '市场营销',
 '产品',
 '管理',
 '需求',
 '分析',
 '经营',
 '管理',
 '风险',
 '控制',
 '产品开发',
 '团队',
 '建设',
 'degree',
 '本科',
 'enddate',
 'id',
 '2c9207157dbedf8d017dc28fbc5d336d',
 'majorname',
 '国际',
 '经济',
 '贸易',
 'schoolname',
 '湖南',
 '科技',
 '学院',
 'startdate',
 'companyname',
 '深圳市',
 '亿科',
 '数字',
 '科技',
 '有限公司',
 'description',
 '工作',
 '职责',
 '公司',
 '管理',
 'n1',
 '确定',
 '发展',
 '方向',
 '结合',
 '公司',
 '特点',
 '竞争',
 '优势',
 '行业',
 '机会',
 '信息',
 '确定',
 '公司',
 '未来',
 '主要',
 '从事',
 '行业',
 '赛道',
 '移动',
 '互联网',
 '创意',
 '为主',
 '技术',
 '导向',
 '三大',
 '战略',
 'n2',
 '企业',
 '文化',
 '建设',
 'ceo',
 '一道',
 '深入',
 '公司',
 '文化',
 '进行',
 '总结',
 '分析',
 '通过',
 '投票',
 '讨论',
 '调研',
 '方式',
 '最终',
 '确定',
 '公司',
 '文化',
 '各个',
 '事业部',
 '进行',
 '推行',
 '落实',
 'n3',
 '内部',
 '流程',
 '优化',
 '协同',


In [41]:
a = [1,2,3]
b = ['hi', 'hello', 'world']
for i,j in zip(a,b):
    print(i,j)

1 hi
2 hello
3 world
