In [85]:
import json
import jieba
import re
import numpy as np
import pandas as pd
from zhconv import convert
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA


# 读取csv文件
def load_csv_data(data_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        data_path: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(data_path)

# 将某列的文本字符串转成列表并切词
def col_jieba_fun(series):
    """Turn the raw text of one row into a list of jieba tokens.

    The text is read from ``series[col_name]``; ``col_name`` is a module-level
    variable that must be set before this function is applied to a DataFrame.

    Args:
        series: One DataFrame row (as passed by ``df.apply(..., axis=1)``).
            The cell is expected to hold a string, either a JSON array such as
            ``'["a","b"]'`` or a list separated by ASCII/full-width commas.

    Returns:
        list: Tokens from ``jieba.lcut`` in precise mode. The items are joined
        with spaces before cutting, so whitespace tokens appear in the result
        and have to be filtered out by the caller.
    """
    raw = series[col_name]

    # A value wrapped in [...] is tried as a JSON array first.
    items = None
    if raw.startswith("[") and raw.endswith("]"):
        try:
            items = json.loads(raw)
        except ValueError:
            # Bracketed but not valid JSON (e.g. unquoted items): drop the
            # brackets and fall back to the comma split instead of aborting
            # the whole DataFrame.apply on a single malformed row.
            raw = raw[1:-1]
    if items is None:
        items = re.split(",|，", raw)

    # Pure Chinese text would not need a separator before jieba, but mixed
    # Chinese/English items must be space-separated so words do not fuse.
    # str() keeps the join safe when the JSON array holds non-string values.
    joined = " ".join(str(item) for item in items)
    return jieba.lcut(joined, cut_all=False)

# 对切词后的列表进行过滤
def col_jieba_filter_fun(series):
    """Normalise and filter the jieba tokens of one row.

    Tokens are read from ``series[col_name_jieba]`` (``col_name_jieba`` is a
    module-level variable). Each token is converted to Simplified Chinese and
    lower-cased, then dropped when it is all digits, at most one character
    long, or contains any of the listed punctuation characters.

    Args:
        series: One DataFrame row holding the token list.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    ascii_marks = [",", ".", "/", "[", "]", "{", "}", "(", ")", ":", "*", "#", "!", " ", "\"", "\\"]
    cjk_marks = ["，", "。", "、", "（", "）", "：", "！", "”", "“"]
    blocked = ascii_marks + cjk_marks

    kept = []
    for raw_token in series[col_name_jieba]:
        # Simplified Chinese first, then lower-case any Latin letters
        token = convert(raw_token, "zh-hans").lower()

        # Skip pure numbers and single characters
        if token.isdigit() or len(token) <= 1:
            continue

        # Skip anything that still carries punctuation or a space
        if any(mark in token for mark in blocked):
            continue

        kept.append(token)
    return " ".join(kept)

# 转成tfidf向量
def get_tfidf(df, col_name):
    """Fit a default ``TfidfVectorizer`` on one text column.

    Args:
        df: DataFrame containing the text column.
        col_name: Name of the column whose rows are the documents.

    Returns:
        tuple: ``(weights, vectorizer)`` where ``weights`` is a dense
        DataFrame (one row per document, one column per vocabulary index)
        and ``vectorizer`` is the fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer()
    # The fit_transform result is densified with .toarray() before wrapping
    weights = vectorizer.fit_transform(df[col_name])
    dense = pd.DataFrame(weights.toarray())
    return dense, vectorizer

# 对tfidf降维到n维
def get_tfidf_pca(tfidf, n=20, random_state=42):
    """Project the TF-IDF matrix onto its first ``n`` principal components.

    Args:
        tfidf: Array-like or DataFrame of TF-IDF weights, one row per document.
        n: Number of principal components to keep.
        random_state: Seed forwarded to ``PCA``. It only matters when PCA
            picks a randomised solver; fixing it keeps the reduced features
            reproducible across kernel restarts.

    Returns:
        pandas.DataFrame: One row per document and ``n`` columns.
    """
    pca = PCA(n_components=n, random_state=random_state)
    return pd.DataFrame(pca.fit_transform(tfidf))

# 测试tfidf是否正确
def test_tfidf(num=0):
    """Print the tokens and non-zero TF-IDF weights of one row, and dump IDFs.

    Relies on the module-level ``df``, ``col_name_jieba_filter``,
    ``vectorizer`` and ``tfidf`` created by the main script.

    Args:
        num: Positional index of the row to inspect.

    Side effects:
        Prints to stdout and rewrites ``./word_idf_temp`` with one
        ``word<TAB>idf`` line per vocabulary entry on every call.
    """
    # Positional lookup, consistent with tfidf.iloc[num] below
    tokens = df[col_name_jieba_filter].iloc[num].split(" ")
    print(tokens)

    # Invert vocabulary_ (word -> column index) so a column index yields its word
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

    # Words whose weight is non-zero in row num
    for index, value in enumerate(tfidf.iloc[num]):
        if value:
            print(index_to_word[index], value)

    # IDF of every word. The encoding is explicit because the vocabulary
    # contains Chinese text and the platform default may not be able to
    # encode it.
    with open("./word_idf_temp", "w", encoding="utf-8") as fo:
        for word, index in vectorizer.vocabulary_.items():
            fo.write("{}\t{}\n".format(word, vectorizer.idf_[index]))

if __name__ == "__main__":
    print("running...")

    # CSV file and the name of the text column to process
    data_path = './sample_20220821_spark.csv'
    col_name = 'tags'
    #col_name = 'skills'
    col_name_jieba = col_name + '_jieba'
    col_name_jieba_filter = col_name_jieba + '_filter'

    # Load the CSV file
    df = load_csv_data(data_path)
    # Replace missing values with an empty JSON list. The result is assigned
    # back rather than calling fillna(inplace=True) on the column selection:
    # that chained in-place form acts on an intermediate object and does not
    # update df under pandas copy-on-write.
    df[col_name] = df[col_name].fillna('[]')
    # Tokenise the text column, then normalise and filter the tokens
    df[col_name_jieba] = df.apply(col_jieba_fun, axis=1)
    df[col_name_jieba_filter] = df.apply(col_jieba_filter_fun, axis=1)
    print(df[[col_name, col_name_jieba, col_name_jieba_filter]])

    # TF-IDF weights for the filtered tokens
    tfidf, vectorizer = get_tfidf(df, col_name_jieba_filter)
    print(tfidf)

    # Reduce the TF-IDF matrix to 10 dimensions
    tfidf_pca = get_tfidf_pca(tfidf, 10)
    print(tfidf_pca)

    print("all is well")

    print("============================test=============================")

    # Spot-check the first row
    test_tfidf(0)
    


running...
                                              tags  \
0                   ["市场策划","管理","推广","营销","品牌推广"]   
1                 ["销售","团队管理","市场推广","AI","负责管理"]   
2                          ["投资","临床","生物","生物医药"]   
3                ["运营","产品运营","内容运营","美工","互联网产品"]   
4                    ["架构","数字化","管理","云计算","区块链"]   
...                                            ...   
81291                 ["仓储","物流","管理","沟通能力","软件"]   
81292  ["Photoshop","Coredraw","AI","3D工业软件","软件"]   
81293                 ["游戏","动画设计","审核","美术","软件"]   
81294               ["沟通能力","归档","法规","评审","产品开发"]   
81295               ["结构设计","管理","审核","软件","管理工作"]   

                                              tags_jieba  \
0                 [市场策划,  , 管理,  , 推广,  , 营销,  , 品牌, 推广]   
1               [销售,  , 团队, 管理,  , 市场推广,  , AI,  , 负责管理]   
2                            [投资,  , 临床,  , 生物,  , 生物医药]   
3          [运营,  , 产品, 运营,  , 内容, 运营,  , 美工,  , 互联网, 产品]   
4                [架构,  , 数字化,  , 管理,  , 

In [97]:
# Spot-check row 111: print its tokens and their non-zero TF-IDF weights.
test_tfidf(111)


['数据分析', '技术', '方案', '互联网', '产品', '需求', '管理']
互联网 0.47612056972096445
产品 0.32904023658877735
技术 0.40686379057255656
数据分析 0.3180988719955959
方案 0.49452764813064415
管理 0.17488625394013943
需求 0.3509613835842982


In [98]:
# Display the PCA-reduced TF-IDF frame (10 components per row).
tfidf_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.078506,0.069653,-0.017775,0.105460,-0.015456,0.011069,-0.019574,-0.080937,-0.065339,-0.043749
1,0.038499,0.199569,-0.020203,0.144933,0.041004,-0.042796,0.011023,0.014379,-0.037612,-0.088568
2,-0.087989,0.017933,-0.061932,0.007295,-0.078234,-0.045060,-0.016852,0.000884,-0.077567,-0.028849
3,0.342554,-0.210275,-0.193948,-0.068139,-0.015440,0.054701,0.014075,0.012581,-0.124234,0.004898
4,-0.075882,0.047468,-0.041079,-0.032534,0.093795,-0.031694,0.032983,-0.018421,0.022982,0.015475
...,...,...,...,...,...,...,...,...,...,...
81291,0.004201,0.001396,0.291790,0.001401,0.002736,-0.070966,0.025761,0.022762,0.076789,0.130943
81292,-0.116068,0.012803,-0.028366,0.047559,-0.030882,-0.001192,0.007218,0.041145,0.098560,0.174029
81293,-0.091085,-0.021279,-0.044082,0.019146,-0.138485,-0.007206,0.044944,-0.079972,0.298801,-0.012655
81294,-0.049421,-0.081470,0.239977,0.004083,-0.067859,-0.025214,-0.010085,0.021041,-0.045522,0.023043


In [99]:
# The same reduced features viewed as a plain NumPy array.
np.array(tfidf_pca)

array([[ 0.07850606,  0.06965283, -0.01777515, ..., -0.08093693,
        -0.06533894, -0.0437494 ],
       [ 0.03849911,  0.19956851, -0.02020297, ...,  0.01437862,
        -0.03761168, -0.0885675 ],
       [-0.08798878,  0.01793312, -0.06193151, ...,  0.00088401,
        -0.0775668 , -0.02884898],
       ...,
       [-0.09108474, -0.02127907, -0.04408201, ..., -0.07997211,
         0.29880103, -0.01265549],
       [-0.04942121, -0.08147001,  0.23997748, ...,  0.02104102,
        -0.04552182,  0.02304303],
       [-0.04193919,  0.16747169,  0.0125846 , ...,  0.05627093,
         0.14220081,  0.13950575]])

In [100]:
# 将文本列保存下来。格式：index + 文本
def save_text():
    """Write the filtered-token column to a text file, one row per line.

    Reads the module-level ``df`` and ``col_name_jieba_filter``. The output
    file is named ``col_name_jieba_filter + '_temp'`` and each line has the
    form ``<position>:<TAB><text>``.
    """
    file = col_name_jieba_filter + '_temp'
    # Explicit encoding: the column holds Chinese text, which the platform
    # default encoding may not be able to represent.
    with open(file, "w", encoding="utf-8") as fo:
        for index, line in enumerate(df[col_name_jieba_filter]):
            fo.write("{}:\t{}\n".format(index, line))

# Write the filtered-token column to disk for manual inspection.
save_text()

In [125]:
# Compare the TF-IDF weights of rows 151 and 162.
test_tfidf(151)


test_tfidf(162)


['游戏', 'mmo', '运营', '游戏', '投放']
mmo 0.5770228683485739
投放 0.4700735789643929
游戏 0.6290027019926157
运营 0.22456856558383226
['游戏', 'mmo', '游戏', '管理', '投放']
mmo 0.5840210885654143
投放 0.47577470209860445
游戏 0.6366313414573394
管理 0.16510098736252693


In [133]:
# PCA-reduced feature vector of row 151.
np.array(tfidf_pca.iloc[151])

array([ 0.11034272, -0.17189556, -0.12318091, -0.05575076, -0.21421186,
        0.073257  , -0.0301226 , -0.25441586,  0.37486438, -0.21425201])

In [167]:
def get_text(num=0):
    """Print the filtered tokens of row ``num`` with a 1-based row label.

    Reads the module-level ``df`` and ``col_name_jieba_filter``.

    Args:
        num: Index of the row in ``df[col_name_jieba_filter]``.
    """
    tokens = df[col_name_jieba_filter][num].split(" ")
    message = "第{}条数据: {}".format(num+1, tokens)
    print(message)

def get_cos_sim(cv_feature, jd_feature):
    """Return the cosine similarity of two feature vectors.

    Args:
        cv_feature: First vector; a list, tuple, NumPy array or pandas Series.
        jd_feature: Second vector of the same length.

    Returns:
        ``0`` when either input is ``None`` or empty, or when either vector
        has zero norm; otherwise ``dot(a, b) / (|a| * |b|)``.
    """
    if cv_feature is None or jd_feature is None:
        return 0

    # Convert before testing for emptiness: calling `not` on a multi-element
    # NumPy array or Series raises "truth value ... is ambiguous".
    cv_vec = np.asarray(cv_feature, dtype=float)
    jd_vec = np.asarray(jd_feature, dtype=float)
    if cv_vec.size == 0 or jd_vec.size == 0:
        return 0

    norm_product = np.linalg.norm(cv_vec) * np.linalg.norm(jd_vec)
    # A zero vector has no direction; report no similarity instead of
    # dividing by zero.
    if norm_product == 0:
        return 0
    return np.dot(cv_vec, jd_vec) / norm_product

def get_sim(n1, n2, n3):
    """Print three rows and their pairwise cosine similarities.

    Each row's tokens are printed via ``get_text``. The similarities are
    computed with ``get_cos_sim`` on rows of the module-level ``tfidf_pca``
    and printed as ``sim_<i>_<j>`` with 1-based row labels.

    Args:
        n1, n2, n3: Positional indices of the three rows to compare.
    """
    labels = (n1, n2, n3)
    for row in labels:
        get_text(row)

    # Plain lists of the reduced features for each requested row
    vectors = [list(tfidf_pca.iloc[row]) for row in labels]

    # Pairs in the fixed order (1,2), (1,3), (2,3); all scores are computed
    # before anything is printed.
    pairs = ((0, 1), (0, 2), (1, 2))
    scores = [get_cos_sim(vectors[a], vectors[b]) for a, b in pairs]

    for (a, b), score in zip(pairs, scores):
        print("{}_{}_{}:\t{}".format('sim', labels[a]+1, labels[b]+1, score))


# Earlier spot-checks, kept for reference:
#get_sim(8,151,162)
#get_sim(8,157,158)
#get_sim(8,167,168)
#get_sim(0,7,8)

# Print the tokens of rows 0, 1 and 2 and their pairwise cosine similarities.
get_sim(0,1,2)

第1条数据: ['市场策划', '管理', '推广', '营销', '品牌', '推广']
第2条数据: ['销售', '团队', '管理', '市场推广', 'ai', '负责管理']
第3条数据: ['投资', '临床', '生物', '生物医药']
sim_1_2:	0.6976034025379111
sim_1_3:	0.1122182400863795
sim_2_3:	0.14346691254137894


In [129]:
# Plain sum of the TF-IDF weights of row 151.
# NOTE(review): the weights printed for this row have squares that add up to
# ~1, presumably due to TfidfVectorizer's default L2 row normalisation, so
# this plain sum is not expected to equal 1.
sum(tfidf.iloc[151])


1.9006677148894147

In [128]:
# Plain sum of the TF-IDF weights of row 162, for comparison with row 151.
sum(tfidf.iloc[162])

1.861528119483885

In [124]:
cat test | awk '{sum += $2}; END{print sum}'

2.32505


In [23]:
# Save the filtered-token column, one row per line, so the remaining noise
# can be inspected by hand.
# NOTE(review): df1 is not defined in any visible cell; this cell depends on
# earlier kernel state.
# Explicit encoding: the column holds Chinese text, which the platform
# default encoding may not be able to represent.
with open('./tags_jieba_filter_temp', "w", encoding="utf-8") as fo:
    for line in df1['tags_jieba_filter']:
        fo.write(line + '\n')


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Corpus: one space-separated token string per row
# text = ["The quick brown fox jumped over the lazy dog.",
#         "The dog.",
#         "The fox"]
# NOTE(review): df1 is not defined in any visible cell; this cell depends on
# earlier kernel state.
text = df1['tags_jieba_filter']
# Build the transformer
vectorizer = TfidfVectorizer()
# Tokenise the corpus and learn the vocabulary and IDF weights
vectorizer.fit(text)

# Inspect the fitted state
#print(vectorizer.vocabulary_)
#print(vectorizer.idf_)

# Dump one "word<TAB>idf" line per vocabulary entry. The encoding is explicit
# because the vocabulary contains Chinese text and the platform default may
# not be able to encode it.
with open("./word_idf_temp", "w", encoding="utf-8") as fo:
    for word, index in vectorizer.vocabulary_.items():
        fo.write("{}\t{}\n".format(word, vectorizer.idf_[index]))
        #print(word, vectorizer.idf_[index])


# Vocabulary size
print(len(vectorizer.vocabulary_))
# Encode the corpus with the fitted vectorizer
vector = vectorizer.transform(text)
# Show the encoded result
print(vector.shape)
print(vector.toarray())


2393
(81296, 2393)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [84]:
# Print the non-zero TF-IDF weights of row 0 next to their words.
#tfidf['sum'] = tfidf.apply(sum, axis=1)
#print(tfidf)
# Reference token list for row 0; only used by the commented-out lookup below.
words = '市场策划 管理 推广 营销 品牌'.split(" ")
#words = '销售 团队 管理 市场推广 ai 负责管理'.split(" ")

# Swap the keys and values of vocabulary_ so a column index (stored as a
# string) maps back to its word.
new_vocab = {}
for word, index in vectorizer.vocabulary_.items():
    new_vocab[str(index)] = word



# for word in words:
#     print(vectorizer.vocabulary_[word])

# Walk row 0 by column position and print every word with a non-zero weight.
for index, value in enumerate(tfidf.iloc[0]):
    if value:
        print(new_vocab[str(index)], value)


品牌 0.3950075968973809
市场策划 0.6806409234097276
推广 0.5563615667401274
管理 0.13436304390813317
营销 0.23044589659944326


In [73]:
# Display the index -> word mapping built from vectorizer.vocabulary_.
new_vocab

{'1238': '市场策划',
 '1907': '管理',
 '1430': '推广',
 '2063': '营销',
 '1024': '品牌',
 '2309': '销售',
 '1053': '团队',
 '1236': '市场推广',
 '23': 'ai',
 '2159': '负责管理',
 '1377': '投资',
 '638': '临床',
 '1766': '生物',
 '1768': '生物医药',
 '2237': '运营',
 '677': '产品',
 '834': '内容',
 '1998': '美工',
 '665': '互联网',
 '1596': '架构',
 '1482': '数字化',
 '2103': '计算',
 '936': '区块',
 '403': 'pr',
 '246': 'ie',
 '124': 'ct',
 '34': 'ar',
 '1342': '成本核算',
 '1341': '成本',
 '1648': '沟通',
 '2031': '能力',
 '1173': '审核',
 '1494': '数据库',
 '1678': '测试',
 '296': 'linux',
 '1450': '操作系统',
 '1208': '嵌入式',
 '1680': '测试报告',
 '2160': '财务',
 '2076': '融资',
 '863': '分析',
 '2152': '调研',
 '2091': '规划',
 '2364': '风险管理',
 '2155': '调配',
 '771': '信息',
 '1463': '收集',
 '2316': '门店',
 '1581': '机器人',
 '770': '信号处理',
 '1617': '检测',
 '1904': '算法',
 '2346': '项目管理',
 '946': '医药',
 '1561': '服务',
 '1712': '渠道',
 '1039': '商务',
 '1394': '拓展',
 '860': '分布式',
 '2143': '语言',
 '2331': '需求',
 '1365': '技术',
 '1524': '方案',
 '1598': '架构设计',
 '718': '代码',
 '1974': '编写'

In [34]:
# 市场策划 管理 推广 营销 品牌
# 销售 团队 管理 市场推广 ai 负责管理
# Look up the vocabulary column index of each token.
# The first assignment is immediately overwritten by the second; it is kept
# as the alternative sample to switch to.
words = ['市场策划', '管理', '推广', '营销', '品牌']
words = '销售 团队 管理 市场推广 ai 负责管理'.split(" ")
for word in words:
    # Raises KeyError if a token is missing from the fitted vocabulary.
    print(vectorizer.vocabulary_[word])

2309
1053
1907
1236
23
2159


In [None]:
'''
接下来工作
1.切词后做一些过滤操作，比如去除数字、单个字符、标点等
2.降维
'''

In [39]:
# Reduce the TF-IDF matrix to 20 dimensions with PCA
from sklearn.decomposition import PCA
# NOTE(review): no random_state is passed; if PCA selects a randomised solver
# for a matrix of this size, the components may differ slightly between runs.
pca = PCA(n_components = 20)
tfidf_reduce = pca.fit_transform(tfidf)
pd.DataFrame(tfidf_reduce)
# TODO: evaluate the quality of the reduced representation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.078512,0.069618,-0.017895,0.105482,-0.015283,0.007682,-0.014454,-0.085960,-0.070056,-0.048234,0.071195,0.237524,-0.119979,0.008440,-0.013928,-0.024121,0.008485,0.189364,0.016720,0.008770
1,0.038504,0.199545,-0.020085,0.144840,0.041053,-0.043275,0.008736,0.017601,-0.036109,-0.094365,0.062546,-0.097310,-0.069673,-0.088297,-0.094070,0.055580,0.046753,-0.074511,-0.046563,0.023351
2,-0.087994,0.017962,-0.061977,0.007464,-0.077803,-0.044911,-0.014456,-0.003622,-0.078271,-0.027265,-0.073952,0.031800,0.071627,0.037743,-0.085895,-0.033185,-0.079899,-0.014100,0.002412,-0.019882
3,0.342511,-0.210294,-0.193566,-0.068237,-0.016399,0.055897,0.005739,0.020350,-0.115034,-0.005907,0.008783,-0.129218,-0.053976,0.303977,0.047035,-0.057617,-0.041459,-0.175098,0.083127,-0.012061
4,-0.075887,0.047373,-0.040878,-0.032711,0.093098,-0.033183,0.029436,-0.013971,0.022257,0.008198,0.030214,-0.017722,-0.053762,0.033846,-0.055365,0.016291,-0.023274,0.049182,-0.038961,-0.102916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81291,0.004222,0.001419,0.291622,0.001515,0.003247,-0.070986,0.029892,0.018149,0.072668,0.135787,-0.071464,-0.023122,-0.031085,0.050931,-0.060297,-0.015337,0.033086,0.016529,-0.043093,0.072987
81292,-0.116039,0.012818,-0.028575,0.047633,-0.030192,-0.001297,0.012773,0.034465,0.091745,0.181915,-0.113946,-0.018592,-0.072005,0.052709,-0.055076,-0.081392,0.037147,0.025328,-0.037035,0.082239
81293,-0.091066,-0.021294,-0.044186,0.019165,-0.137924,-0.008029,0.048419,-0.079589,0.296037,-0.007221,-0.079076,-0.022675,0.021589,0.058957,0.058629,-0.067919,0.012645,0.012262,-0.068992,0.061169
81294,-0.049417,-0.081452,0.239921,0.004248,-0.067485,-0.025567,-0.008017,0.016983,-0.047561,0.020535,-0.032819,0.000908,0.037195,0.033035,-0.037894,0.034740,0.023072,-0.005460,-0.038729,-0.016371
