In [None]:
import re
import json
import numpy as np
import pandas as pd
import jieba
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [None]:
# Load the full 2022-08-21 sample export, then discard the 'Unnamed: 0' column
# (presumably a row index saved into the CSV -- confirm against the export job).
all_data = pd.read_csv('../data/all_sample_20220821_spark.csv')
all_data = all_data.drop(columns=['Unnamed: 0'])

In [None]:
# Show all column names of the loaded frame
all_data.columns

In [None]:
# Note: these column names differ slightly in letter case from the ones used in
# the sample pipeline; this is not a significant problem.
cv_columns = [
    'cv_id',
    'currentPosition',
    'desiredPosition',
    'industry',
    'desiredIndustry',
    'majorName',
    'skills',
    'eduTracks',
    'jobTracks',
    'projectTracks',
]
jd_columns = [
    'jd_id',
    'title',
    'category_name',
    'tags',
    'description',
    'requirement',
]

In [None]:
# Take an explicit copy of the JD columns. Later cells fill missing values and
# add columns on jd_data; doing that on a bare selection from all_data makes
# pandas emit SettingWithCopyWarning.
jd_data = all_data[jd_columns].copy()
jd_data

In [None]:
# Select the CV columns (every row, only the labels listed in cv_columns).
cv_data = all_data.loc[:, cv_columns]
cv_data

jd可以做3个向量：
* title + category_name + tags
* description
* requirement

cv可以做4个向量：
* currentPosition + desiredPosition
* skills
* jobTracks
* projectTracks

JD向量1：title + category_name + tags

In [None]:
# step1: fill missing values
#
# One DataFrame-level fillna with a per-column mapping, rebound to jd_data.
# The previous form, jd_data[col].fillna(..., inplace=True), is chained
# assignment: pandas warns about it, and with copy-on-write enabled it only
# modifies a temporary Series and leaves jd_data unchanged.
# 'tags' is filled with '[]' so the later JSON-list parsing sees an empty list.

jd_data = jd_data.fillna({'title': '', 'category_name': '', 'tags': '[]'})

In [None]:
# step2: define a new column by merging title + category_name

col_name1 = 'title'
col_name2 = 'category_name'

def col_merge_str_fun(series, first_col=col_name1, second_col=col_name2):
    '''
    Join two text fields of one row with a single space.

    Parameters
    ----------
    series : mapping-like row
        Anything indexable by column label, e.g. the Series that
        DataFrame.apply(..., axis=1) passes in.
    first_col, second_col : str
        Labels of the fields to join. The defaults are captured when this
        function is defined, so reassigning the globals col_name1/col_name2
        afterwards (step5 reuses those names) does not change which fields
        this function reads.

    Returns
    -------
    str
        series[first_col] + ' ' + series[second_col]
    '''
    return series[first_col] + ' ' + series[second_col]

# Merge the two text fields row by row and store the result as a new column.
jd_data.loc[:, 'title_category'] = jd_data.apply(col_merge_str_fun, axis='columns')
jd_data

In [None]:
# step3: tokenize the new title_category column with jieba into a list of tokens

col_name = 'title_category'

def col_jieba_fun(series, column=col_name):
    '''
    Tokenize one text field of a row with jieba.

    Parameters
    ----------
    series : mapping-like row
        Row indexable by column label (as passed by DataFrame.apply(axis=1)).
    column : str
        Label of the text field. The default is captured at definition time,
        so later cells reassigning the global col_name do not affect it.

    Returns
    -------
    list
        Tokens produced by jieba.lcut(..., cut_all=False).
    '''
    text = series[column]

    # String -> list of fragments.
    fragments = None
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = json.loads(text)
        except ValueError:
            # Bracketed free text (e.g. "[urgent] engineer [remote]") is not
            # JSON; fall back to the delimiter split below instead of crashing.
            parsed = None
        if isinstance(parsed, list):
            # JSON items may be numbers or nested values; join() needs strings.
            fragments = [str(item) for item in parsed]
    if fragments is None:
        fragments = re.split(",|，|/| ", text)

    # Author's note: pure Chinese text needs no spaces before jieba, but mixed
    # Chinese/English text must be space-separated.
    return jieba.lcut(" ".join(fragments), cut_all=False)

jd_data['title_category_jieba'] = jd_data.apply(col_jieba_fun, axis=1)
jd_data


In [None]:
# step4: tokenize the tags column with jieba into a list of tokens

col_name = 'tags'

def col_jieba_fun(series, column=col_name):
    '''
    Tokenize one text field of a row with jieba.

    Parameters
    ----------
    series : mapping-like row
        Row indexable by column label (as passed by DataFrame.apply(axis=1)).
    column : str
        Label of the text field. The default is captured at definition time,
        so later cells reassigning the global col_name do not affect it.

    Returns
    -------
    list
        Tokens produced by jieba.lcut(..., cut_all=False).
    '''
    text = series[column]

    # String -> list of fragments. Missing tags were filled with '[]', which
    # parses to an empty list here.
    fragments = None
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = json.loads(text)
        except ValueError:
            # Looks bracketed but is not valid JSON; fall back to the
            # delimiter split below instead of crashing the whole apply.
            parsed = None
        if isinstance(parsed, list):
            # JSON items may be numbers or nested values; join() needs strings.
            fragments = [str(item) for item in parsed]
    if fragments is None:
        fragments = re.split(",|，|/| ", text)

    # Author's note: pure Chinese text needs no spaces before jieba, but mixed
    # Chinese/English text must be space-separated.
    return jieba.lcut(" ".join(fragments), cut_all=False)

jd_data['tags_jieba'] = jd_data.apply(col_jieba_fun, axis=1)
jd_data


In [None]:
# step5: concatenate the title_category_jieba and tags_jieba token lists

col_name1 = 'title_category_jieba'
col_name2 = 'tags_jieba'

def col_merge_list_fun(series, first_col=col_name1, second_col=col_name2):
    '''
    Concatenate two list-valued fields of one row.

    Parameters
    ----------
    series : mapping-like row
        Row indexable by column label (as passed by DataFrame.apply(axis=1)).
    first_col, second_col : str
        Labels of the list fields. The defaults are captured when this
        function is defined, so the globals col_name1/col_name2 (also used
        by step2) can be reassigned without changing what is read here.

    Returns
    -------
    list
        A new list: series[first_col] followed by series[second_col].
    '''
    return series[first_col] + series[second_col]

# Concatenate the two token lists row by row and store them as a new column.
jd_data.loc[:, 'title_category_tags_jieba'] = jd_data.apply(col_merge_list_fun, axis='columns')
jd_data


In [None]:
# step6: filter title_category_tags_jieba and produce a space-separated string

col_name = 'title_category_tags_jieba'

# Punctuation that disqualifies a token (English first, then Chinese).
# Built once here rather than on every row.
PUN_MASKS = frozenset(
    [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    + ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
)

def col_jieba_filter_fun(series, column=col_name, pun_masks=PUN_MASKS):
    '''
    Normalize and filter the token list of one row.

    Each token is converted to Simplified Chinese and lowercased, then
    dropped if it is all digits, has at most one character, or contains any
    string from pun_masks.

    Parameters
    ----------
    series : mapping-like row
        Row indexable by column label (as passed by DataFrame.apply(axis=1)).
    column : str
        Label of the token-list field. The default is captured at definition
        time, so later cells reassigning the global col_name do not affect it.
    pun_masks : iterable of str
        Substrings that cause a token to be rejected.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    '''
    kept = []
    for tag in series[column]:
        # Convert to Simplified Chinese, then lowercase English letters.
        tag = convert(tag, "zh-hans").lower()

        # Drop pure-digit tokens and single-character (or empty) tokens.
        if tag.isdigit() or len(tag) <= 1:
            continue

        # Drop tokens that contain masked punctuation.
        if any(pun in tag for pun in pun_masks):
            continue

        kept.append(tag)
    return " ".join(kept)

jd_data['title_category_tags_jieba_filter'] = jd_data.apply(col_jieba_filter_fun, axis=1)
jd_data

In [None]:
# step7: convert title_category_tags_jieba_filter into TF-IDF vectors

def get_tfidf(df, col_name):
    '''
    Fit a TfidfVectorizer on one text column and return the dense TF-IDF table.

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
        Column whose values are strings of space-separated tokens.

    Returns
    -------
    (pandas.DataFrame, TfidfVectorizer)
        The TF-IDF weights, one row per row of df (same index) and one
        integer-labelled column per vocabulary term, plus the fitted
        vectorizer.

    NOTE(review): TfidfVectorizer() is used with its default tokenization,
    which re-splits the text itself -- confirm this is intended for input
    that is already tokenized by jieba.
    '''
    text = df[col_name]

    vectorizer = TfidfVectorizer()
    # fit_transform returns a SciPy sparse matrix (not an ndarray).
    vector = vectorizer.fit_transform(text)
    # toarray() materializes the full dense matrix; keep df's index so the
    # rows stay aligned with the source frame.
    return pd.DataFrame(vector.toarray(), index=text.index), vectorizer

tfidf, vectorizer = get_tfidf(jd_data, 'title_category_tags_jieba_filter')
tfidf


In [None]:
# vocabulary_ maps each term to its column index in the TF-IDF matrix.
# Show its size and the first 20 terms (ordered by column index) instead of
# printing the entire dict into the cell output.
print(f'vocabulary size: {len(vectorizer.vocabulary_)}')
sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])[:20]

In [None]:
# step8: reduce the TF-IDF matrix to n dimensions

def get_tfidf_pca(tfidf, n=20, random_state=0):
    '''
    Project a TF-IDF table onto its first n principal components.

    Parameters
    ----------
    tfidf : pandas.DataFrame or array-like
        Dense TF-IDF matrix, one row per document.
    n : int
        Number of principal components to keep.
    random_state : int or None
        Seed forwarded to PCA so repeated runs give the same components
        when scikit-learn selects a randomized or ARPACK solver. Pass None
        to restore the previous unseeded behavior.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index when tfidf is a DataFrame) and
        n integer-labelled component columns.
    '''
    pca = PCA(n_components=n, random_state=random_state)
    reduced = pca.fit_transform(tfidf)
    # Keep the input's row labels so the components can be joined back.
    index = tfidf.index if isinstance(tfidf, pd.DataFrame) else None
    return pd.DataFrame(reduced, index=index)

tfidf_pca = get_tfidf_pca(tfidf, 10)
tfidf_pca

JD向量2：description

In [None]:
# step1: fill missing values
#
# DataFrame-level fillna, rebound to jd_data. The previous form,
# jd_data['description'].fillna('', inplace=True), is chained assignment:
# pandas warns about it, and with copy-on-write enabled it leaves jd_data
# unchanged.

jd_data = jd_data.fillna({'description': ''})

In [None]:
# step2: tokenize the description column with jieba into a list of tokens

col_name = 'description'

def col_jieba_fun(series, column=col_name):
    '''
    Tokenize one text field of a row with jieba.

    Parameters
    ----------
    series : mapping-like row
        Row indexable by column label (as passed by DataFrame.apply(axis=1)).
    column : str
        Label of the text field. The default is captured at definition time,
        so later cells reassigning the global col_name do not affect it.

    Returns
    -------
    list
        Tokens produced by jieba.lcut(..., cut_all=False).
    '''
    text = series[column]

    # String -> list of fragments.
    fragments = None
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = json.loads(text)
        except ValueError:
            # Free text that merely starts with "[" and ends with "]" is not
            # JSON; fall back to the delimiter split below instead of crashing.
            parsed = None
        if isinstance(parsed, list):
            # JSON items may be numbers or nested values; join() needs strings.
            fragments = [str(item) for item in parsed]
    if fragments is None:
        fragments = re.split(",|，|/| ", text)

    # Author's note: pure Chinese text needs no spaces before jieba, but mixed
    # Chinese/English text must be space-separated.
    return jieba.lcut(" ".join(fragments), cut_all=False)

jd_data['description_jieba'] = jd_data.apply(col_jieba_fun, axis=1)
jd_data


In [None]:
# step3: filter description_jieba and produce a space-separated string

col_name = 'description_jieba'

# Punctuation that disqualifies a token (English first, then Chinese).
# Built once here rather than on every row.
PUN_MASKS = frozenset(
    [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    + ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
)

def col_jieba_filter_fun(series, column=col_name, pun_masks=PUN_MASKS):
    '''
    Normalize and filter the token list of one row.

    Each token is converted to Simplified Chinese and lowercased, then
    dropped if it is all digits, has at most one character, or contains any
    string from pun_masks.

    Parameters
    ----------
    series : mapping-like row
        Row indexable by column label (as passed by DataFrame.apply(axis=1)).
    column : str
        Label of the token-list field. The default is captured at definition
        time, so later reassignment of the global col_name does not affect it.
    pun_masks : iterable of str
        Substrings that cause a token to be rejected.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    '''
    kept = []
    for tag in series[column]:
        # Convert to Simplified Chinese, then lowercase English letters.
        tag = convert(tag, "zh-hans").lower()

        # Drop pure-digit tokens and single-character (or empty) tokens.
        if tag.isdigit() or len(tag) <= 1:
            continue

        # Drop tokens that contain masked punctuation.
        if any(pun in tag for pun in pun_masks):
            continue

        kept.append(tag)
    return " ".join(kept)

jd_data['description_jieba_filter'] = jd_data.apply(col_jieba_filter_fun, axis=1)
jd_data

In [None]:
# step4: convert description_jieba_filter into TF-IDF vectors
#
# get_tfidf(df, col_name) is already defined in step7 of vector 1; the copy
# that used to be redefined in this cell was identical, so the existing
# function is reused.
# Note: this rebinds tfidf and vectorizer, replacing the vector-1 results.

tfidf, vectorizer = get_tfidf(jd_data, 'description_jieba_filter')
tfidf

In [None]:
# vocabulary_ maps each term to its column index in the TF-IDF matrix.
# Show its size and the first 20 terms (ordered by column index) instead of
# printing the entire dict into the cell output.
print(f'vocabulary size: {len(vectorizer.vocabulary_)}')
sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])[:20]

In [129]:
# step5: reduce the TF-IDF matrix to n dimensions
#
# get_tfidf_pca(tfidf, n) is already defined in step8 of vector 1; the copy
# that used to be redefined in this cell was identical, so the existing
# function is reused.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt. The
# input is the dense description matrix that get_tfidf builds via toarray();
# if runtime or memory is the problem, consider reducing the sparse matrix
# directly (e.g. sklearn.decomposition.TruncatedSVD) -- to be confirmed.

tfidf_pca = get_tfidf_pca(tfidf, 10)
tfidf_pca

KeyboardInterrupt: 