引入需要的库

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA


1. 将服务器端每一条新闻转化成txt格式存储

In [None]:
# Create the output directories. exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of os.path.exists + os.mkdir.
dirs = "./server_files"
os.makedirs(dirs, exist_ok=True)

passage_dirs = "./server_files/news"
os.makedirs(passage_dirs, exist_ok=True)

df = pd.read_csv("./data/all_news.csv")
# Write one "<id>.txt" file per row: the title on the first line, the body after it.
for item in df.itertuples(index=False):
    # Explicit UTF-8 so the files do not depend on the platform's default encoding.
    with open("{}/{}.txt".format(passage_dirs, item.id), "w", encoding="utf-8") as f:
        f.write(item.title)
        f.write("\n")
        f.write(item.body)


读取数据集，提取词根构建词典，保存词频信息

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer
import string
import json

lemmatizer = WordNetLemmatizer()
dic = {}
vocab = []
# Translation table that deletes every character in string.punctuation.
strip_punct = str.maketrans('', '', string.punctuation)

for body in df.body:
    cleaned = body.strip().translate(strip_punct)
    for token in nltk.word_tokenize(cleaned):
        lemma = lemmatizer.lemmatize(token.lower())
        if lemma not in dic:
            # First sighting: record the lemma in order of appearance.
            vocab.append(lemma)
        dic[lemma] = dic.get(lemma, 0) + 1

# From here on `dic` is a list of (lemma, count) pairs, most frequent first.
dic = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)

# Persist the frequency table as a JSON object keyed by lemma.
json_dic = json.dumps(dict(dic))
with open("./server_files/vocab.json", "w") as f:
    f.write(json_dic)


构建TF-IDF向量

In [5]:
df = pd.read_csv("./data/all_news.csv")

# Long-form token table: split every body on single spaces, then stack the
# wide result so each row is one token. After reset_index, `level_0` is the
# source row of `df` and `level_1` is the token position within that body.
split_words = (
    df['body']
    .str.split(' ', expand=True)
    .stack()
    .rename('word')
    .reset_index()
)
# Attach the title of the source row to each token.
new_data = pd.merge(
    df['title'],
    split_words,
    left_index=True,
    right_on='level_0',
)

def tf(x):
    """Return the relative frequency of each word within one group of rows.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group; only the ``'word'`` column is read.

    Returns
    -------
    dict
        Maps each distinct word to its count divided by the total number
        of counted words in ``x``.
    """
    counts = x['word'].value_counts()
    return dict(counts / counts.sum())


# Per-title word-frequency dictionaries (one dict per title, built by tf).
data_tf = new_data.groupby("title").apply(tf)

# TF of a row = occurrences of its word under its title / all tokens under
# that title. Grouped transforms produce the same ratio as looking each row
# up in data_tf, without a Python-level call per token row.
word_counts = new_data.groupby(["title", "word"])["word"].transform("count")
title_totals = new_data.groupby("title")["word"].transform("count")
new_data['TF'] = word_counts / title_totals


def idf(x):
    """Return the inverse document frequency for one group of rows.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group; only the ``'title'`` column is read.

    Returns
    -------
    float
        Natural log of (distinct titles in the module-level ``df``) divided
        by (distinct titles appearing in ``x``).
    """
    total_titles = df["title"].nunique()
    titles_in_group = x['title'].nunique()
    return np.log(total_titles / titles_in_group)


# IDF per distinct word, as a plain dict keyed by word.
data_idf = dict(new_data.groupby('word').apply(idf))

# Column-wise lookup and multiply instead of a row-wise apply(axis=1):
# same values, without a Python-level call per token row.
new_data['IDF'] = new_data['word'].map(data_idf)
new_data['TF-IDF'] = new_data['TF'] * new_data['IDF']
new_data


2096


Unnamed: 0,title,level_0,level_1,word,TF,IDF,TF-IDF
0,Ad sales boost Time Warner profit,0,0,Quarterly,0.002433,6.954639,0.016921
1,Ad sales boost Time Warner profit,0,1,profits,0.012165,3.504651,0.042636
2,Ad sales boost Time Warner profit,0,2,at,0.007299,0.250225,0.001826
3,Ad sales boost Time Warner profit,0,3,US,0.007299,1.386294,0.010119
4,Ad sales boost Time Warner profit,0,4,media,0.002433,2.772589,0.006746
...,...,...,...,...,...,...,...
834792,Losing yourself in online gaming,2224,2917,was,0.007187,0.237439,0.001706
834793,Losing yourself in online gaming,2224,2918,the,0.035250,0.000000,0.000000
834794,Losing yourself in online gaming,2224,2919,days,0.001369,2.529792,0.003463
834795,Losing yourself in online gaming,2224,2920,!,0.000342,7.647786,0.002617


计算文档向量

In [13]:
# Token count of each body when split on single spaces (spaces + 1).
word_counts = [body.count(" ") + 1 for body in df.body]
# Largest count and the position of the first body that reaches it;
# both fall back to 0 when there are no bodies.
max_len = max(word_counts, default=0)
max_idx = word_counts.index(max_len) if word_counts else 0
max_len, max_idx


AttributeError: 'DataFrame' object has no attribute 'body'

In [4]:
feats = []
group = list(new_data.groupby("title"))
print(len(group))

# One 1-D vector of TF-IDF values per title group, in token order.
for item in group:
    feats.append(np.array(item[1]["TF-IDF"]))

# Rows are grouped by title, so a group can hold more tokens than the longest
# single body (max_len) when several rows share a title. Pad to whichever is
# larger so the pad width can never go negative.
pad_len = max([max_len] + [feat.shape[0] for feat in feats])
feats = [
    np.pad(feat, (0, pad_len - feat.shape[0]), 'constant', constant_values=(0, 0))
    for feat in feats
]

feats = np.vstack(feats)
print(feats.shape)

# PCA cannot keep more components than min(n_samples, n_features); the fixed
# seed keeps the projection reproducible if a randomized solver is selected.
n_components = min(1000, *feats.shape)
pca = PCA(n_components=n_components, random_state=0)
pca.fit(feats)
feats_new = pca.transform(feats)
print(feats_new.shape)


2096
(2096, 4374)
(2096, 1000)
