引入需要的库

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

import nltk
from nltk.stem import WordNetLemmatizer
import string
import json


### 1. 将服务器端每一条新闻转化成txt格式存储

In [2]:
dirs = "./server_files"
if not os.path.exists(dirs):
    os.mkdir(dirs)

passage_dirs = "./server_files/news"
if not os.path.exists(passage_dirs):
    os.mkdir(passage_dirs)

df = pd.read_csv("./data/all_news.csv")
for item in df.iloc():
    with open("{}/{}.txt".format(passage_dirs, item.id), "w") as f:
        f.write(item.title)
        f.write("\n")
        f.write(item.body)


### 2. 读取数据集，提取词根构建词典

In [3]:
lemmatizer = WordNetLemmatizer()
vocab = []

df = pd.read_csv("./data/all_news.csv")
for i in df.body:
    i = i.strip().translate(str.maketrans('', '', string.punctuation))
    tokenize = nltk.word_tokenize(i)
    for word in tokenize:
        lemma = lemmatizer.lemmatize(word.lower())
        if lemma not in vocab:
            vocab.append(lemma)

pd_vocab = pd.Series(vocab)
pd_vocab.to_csv("./server_files/vocab.csv")


### 3. 构建TF-IDF向量

In [4]:
df = pd.read_csv("./data/all_news.csv")

split_words = df['body'].str.split(
    ' ', expand=True).stack().rename('word').reset_index()
new_data = pd.merge(df['title'], split_words,
                    left_index=True, right_on='level_0')

def tf(x):
    t = dict(x['word'].value_counts()/x['word'].value_counts().sum())
    return t


data_tf = new_data.groupby("title").apply(tf)
new_data['TF'] = new_data.apply(lambda x: data_tf[x["title"]][x['word']], axis=1)


def idf(x):
    return np.log(df["title"].nunique()/x['title'].nunique())


data_idf = dict(new_data.groupby('word').apply(lambda x: idf(x)))

new_data['IDF'] = new_data.apply(lambda x: data_idf[x['word']], axis=1)
new_data['TF-IDF'] = new_data.apply(lambda x: x['TF']*x['IDF'], axis=1)
new_data


Unnamed: 0,title,level_0,level_1,word,TF,IDF,TF-IDF
0,Ad sales boost Time Warner profit,0,0,Quarterly,0.002433,6.954639,0.016921
1,Ad sales boost Time Warner profit,0,1,profits,0.012165,3.504651,0.042636
2,Ad sales boost Time Warner profit,0,2,at,0.007299,0.250225,0.001826
3,Ad sales boost Time Warner profit,0,3,US,0.007299,1.386294,0.010119
4,Ad sales boost Time Warner profit,0,4,media,0.002433,2.772589,0.006746
...,...,...,...,...,...,...,...
834792,Losing yourself in online gaming,2224,2917,was,0.007187,0.237439,0.001706
834793,Losing yourself in online gaming,2224,2918,the,0.035250,0.000000,0.000000
834794,Losing yourself in online gaming,2224,2919,days,0.001369,2.529792,0.003463
834795,Losing yourself in online gaming,2224,2920,!,0.000342,7.647786,0.002617


### 4. 计算文档向量

利用tf-idf矩阵矩阵的每一行作为文章向量，并利用PCA降维

In [4]:
max_len = 0
for i in df.body:
    word_len = i.count(" ")+1
    if word_len > max_len:
        max_len = word_len
print(max_len)

4374


In [5]:
def vec(x):
    vec_x = np.zeros(len(vocab))
    for i in range(x.shape[0]):
        if x['word'].iloc[i] in vocab:
            vec_x[vocab.index(x['word'].iloc[i])] = x['TF-IDF'].iloc[i]
    return vec_x


data_vec = dict(new_data.groupby('level_0').apply(lambda x: vec(x)))
new_data['news_vec'] = new_data.apply(lambda x: data_vec[x['level_0']], axis=1)

new_data

30897


Unnamed: 0,title,level_0,level_1,word,TF,IDF,TF-IDF,news_vec
0,Ad sales boost Time Warner profit,0,0,Quarterly,0.002433,6.954639,0.016921,"[0.0, 0.03579069714733636, 0.00182645627676555..."
1,Ad sales boost Time Warner profit,0,1,profits,0.012165,3.504651,0.042636,"[0.0, 0.03579069714733636, 0.00182645627676555..."
2,Ad sales boost Time Warner profit,0,2,at,0.007299,0.250225,0.001826,"[0.0, 0.03579069714733636, 0.00182645627676555..."
3,Ad sales boost Time Warner profit,0,3,US,0.007299,1.386294,0.010119,"[0.0, 0.03579069714733636, 0.00182645627676555..."
4,Ad sales boost Time Warner profit,0,4,media,0.002433,2.772589,0.006746,"[0.0, 0.03579069714733636, 0.00182645627676555..."
...,...,...,...,...,...,...,...,...
834792,Losing yourself in online gaming,2224,2917,was,0.007187,0.237439,0.001706,"[0.0, 0.0, 0.001455789414300811, 0.0, 0.0, 0.0..."
834793,Losing yourself in online gaming,2224,2918,the,0.035250,0.000000,0.000000,"[0.0, 0.0, 0.001455789414300811, 0.0, 0.0, 0.0..."
834794,Losing yourself in online gaming,2224,2919,days,0.001369,2.529792,0.003463,"[0.0, 0.0, 0.001455789414300811, 0.0, 0.0, 0.0..."
834795,Losing yourself in online gaming,2224,2920,!,0.000342,7.647786,0.002617,"[0.0, 0.0, 0.001455789414300811, 0.0, 0.0, 0.0..."


In [18]:
ori_feats = new_data["news_vec"]
ori_feats = np.vstack(data_vec.values())
ori_feats.shape

  ori_feats = np.vstack(data_vec.values())


(2225, 30897)

对原始的文档向量降维并保存（本项目中保存1000维文档向量）

In [19]:
pca = PCA(n_components=1000)
pca.fit(ori_feats)
feats_new = pca.transform(ori_feats)

np.save("./server_files/file_feats_1000.npy", feats_new)


### 5. 计算文档相似度

根据文档向量计算相似度

In [25]:
def similarity(x):
    sim = []
    for item in x:
        sim.append(item/np.sqrt(np.sum(item**2)))
    sim = np.vstack(sim)
    return sim @ sim.T


pd_sim = pd.DataFrame(similarity(new_data.groupby('level_0').head(1)['news_vec']), index=new_data['level_0'].unique(),
             columns=new_data['level_0'].unique())
np.save("./server_files/similarity.npy", np.array(pd_sim))


### 6. 计算词向量
   
通过词词共现矩阵得到词语关联，并降维得到词向量

In [25]:
from tqdm import trange

n_vocab = len(vocab)
word_matrix = np.eye(n_vocab)
print(word_matrix.shape)

for item in trange(len(df["body"])):
    item = df["body"][item].strip().translate(str.maketrans('', '', string.punctuation))
    tokenize = nltk.word_tokenize(item)
    word_id = []
    for word in tokenize:
        lemma = lemmatizer.lemmatize(word.lower())
        word_id.append(vocab.index(lemma))
    for i in range(len(word_id)):
        for j in range(i+1, len(word_id)):
            word_matrix[word_id[i]][word_id[j]] += 1
            word_matrix[word_id[j]][word_id[i]] += 1


(30897, 30897)


100%|██████████| 2225/2225 [06:51<00:00,  5.40it/s]


In [28]:
pca2 = PCA(n_components=256)
pca2.fit(word_matrix)
word_vec = pca2.transform(word_matrix)
word_vec.shape


(30897, 256)

In [29]:
np.save("./server_files/word_vec.npy", word_vec)

### 7. 计算词之间的相似度，并得到相似词

为每个单词保存5个相似词

In [33]:
def similarity(x):
    sim = []
    for item in x:
        sim.append(item/np.sqrt(np.sum(item**2)))
    sim = np.vstack(sim)
    return sim @ sim.T


word_sim = similarity(word_vec)

sim_word = {}
for i in trange(n_vocab):
    sim = np.argsort(-word_sim[i])
    temp_word = []
    for j in range(1, 6):
        temp_word.append(vocab[sim[j]])
    sim_word[vocab[i]] = temp_word
sim_word


100%|██████████| 30897/30897 [01:22<00:00, 376.59it/s]


{'quarterly': ['healthy', 'slowed', 'yearonyear', 'weakening', 'buoyant'],
 'profit': ['revenue', 'earnings', 'sale', 'share', 'euro'],
 'at': ['in', 'the', 'from', 'on', 'with'],
 'u': ['however', 'in', 'by', 'year', 'which'],
 'medium': ['such', 'are', 'more', 'together', 'make'],
 'giant': ['company', 'rival', 'firm', 'itself', 'product'],
 'timewarner': ['fullyear', 'restate', 'yearearlier', 'aols', 'adjust'],
 'jumped': ['girl', 'sister', 'performed', 'tom', 'nick'],
 '76': ['63', '62', 'federer', 'seed', 'semifinal'],
 'to': ['and', 'on', 'a', 'of', 'for'],
 '113bn': ['aols', 'yearearlier', '143m', '186m', '767'],
 '£600m': ['336bn', 'searchengine', '464000', '109bn', '639m'],
 'for': ['and', 'a', 'an', 'to', 'on'],
 'the': ['of', 'on', 'by', 'from', 'in'],
 'three': ['two', 'four', 'first', 'while', 'despite'],
 'month': ['in', 'recent', 'high', 'however', 'by'],
 'december': ['fall', 'november', 'annual', '2004', 'strong'],
 'from': ['the', 'on', 'a', 'and', 'of'],
 '639m': ['3

In [None]:
json_simword = json.dumps(sim_word)
with open("./server_files/synonym.json", "w") as f:
    f.write(json_simword)