In [73]:
import regex as re
import pandas as pd
import os
import random
import numpy as np

#### 加载自制中文数据集、分词、去除停用词

In [74]:
data = pd.read_csv('smalldata_washed.csv')
rumor = data['rumor'].to_list()
reverse = data['reverse'].to_list()
rumor_class = len(rumor)*[1]
reverse_class = len(reverse)*[0]
data = rumor + reverse
data_class = rumor_class + reverse_class
#后续添加数据要放在raw目录下，使用.xlsx格式将后续标注的数据加入data
raw_data_list = os.listdir('raw')
data_list = []
good_name = re.compile(r'^(?!(\~\$)).*(.xlsx)')
for i in raw_data_list:
    if good_name.match(i):
    #    data_list.append(i)
        temp = pd.read_excel('raw/'+i)
        temp.fillna('',inplace=True)
        temp_rumor = [x.strip() for x in  temp['谣言'].to_list()  if x.strip()!='']
        temp_reverse = [x.strip() for x in  temp['真相'].to_list()  if x.strip()!='']
        temp_rumor_class = len(temp_rumor)*[1]
        temp_reverse_class = len(temp_reverse)*[0]
        temp_data = temp_rumor + temp_reverse
        temp_data_class = temp_rumor_class + temp_reverse_class
        for i in range( len(temp_data)):
            temp_data[i]=re.sub(r'[\,\'\ ]|(\n)','',temp_data[i])
    data = data + temp_data
    data_class = data_class + temp_data_class
with open('data.txt','w') as f:
    for i in range(len(data)):
        f.write(data[i]+'\t'+str(data_class[i])+'\n')
        data_list =list( zip(data, data_class))
random.shuffle(data_list)
data, data_class = zip(*data_list)
index = ['rumor{}{}'.format(i, '!' * j) for (i, j) in zip(range(len(data)), data_class)]
df = pd .DataFrame(data=zip(data_class,data), columns=["rumor","text"], index=index)

# 去除停用词
with open ('hit_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()

# jieba分词
import jieba
for i in range(len(df)):
    words = jieba.cut(df["text"].iloc[i],cut_all=False)
    words = [word for word in words if word not in stopwords]
    df["text"].iloc[i] = ' '.join(words) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"].iloc[i] = ' '.join(words)


In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import casual_tokenize
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=df.text).toarray()
len(tfidf.vocabulary_)



4021

In [76]:
tfidf_docs = pd.DataFrame(tfidf_docs)
#通过减去每个文档（词袋向量）的平均值来中心化文档
tfidf_docs = tfidf_docs - tfidf_docs.mean()
tfidf_docs.shape
df.rumor.sum()

330

#### 基于PCA的短消息语义分析

In [77]:
from sklearn.decomposition import PCA
pca = PCA(n_components=16)
pca_docs = pca.fit_transform(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(pca_docs.shape[1])]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, columns=columns, index=index)
pca_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,...,topic12,topic13,topic14,topic15
rumor0,-0.011,-0.045,0.008,-0.027,...,0.014,-0.009,0.008,0.018
rumor1,-0.037,0.073,-0.029,0.035,...,-0.023,0.077,-0.018,-0.051
rumor2!,-0.036,-0.04,0.003,0.036,...,-0.04,0.022,-0.037,0.004
rumor3!,-0.021,-0.036,-0.001,-0.013,...,-0.042,0.034,0.025,0.025
rumor4,0.209,-0.022,0.102,-0.054,...,0.188,0.005,0.04,0.142
rumor5,0.114,0.178,-0.022,-0.027,...,0.034,0.071,-0.012,-0.084


In [78]:
tfidf.vocabulary_
#根据词项的频率对词汇表进行排序
#当对某个不按照最左边元素排序的序列解压并在排序后重新压缩时，可以使用zip(*sorted(zip(...)))
column_nums , terms = zip(*sorted(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys())))


In [79]:
weights = pd.DataFrame(pca.components_, columns=terms,
                       index = ['topic{}'.format(i) for i in range(pca.components_.shape[0])])
pd.options.display.max_columns = 8
weights.head(4).round(3)

Unnamed: 0,%,0,0t,1,...,鼓励,鼻出血,鼻翼,龋齿
topic0,-0.001,-0.001,-0.004,-0.003,...,-0.001,-0.005,-0.002,-0.003
topic1,-0.003,-0.0,-0.003,-0.011,...,-0.002,-0.008,-0.002,-0.004
topic2,-0.001,0.001,0.001,-0.008,...,-0.001,-0.001,0.001,-0.001
topic3,0.001,-0.0,0.003,0.007,...,-0.003,-0.0,0.001,0.003


#### 基于截断的SVD的短消息语义分析（recommend）

In [80]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=16,n_iter=100)
svd_topic_vectors = svd.fit_transform(tfidf_docs.values)
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, columns=columns, index=index)
svd_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,...,topic12,topic13,topic14,topic15
rumor0,-0.011,-0.045,0.008,-0.025,...,0.004,0.003,-0.014,0.013
rumor1,-0.038,0.072,-0.03,0.036,...,0.04,-0.064,-0.028,-0.049
rumor2!,-0.036,-0.041,-0.003,0.038,...,0.036,-0.04,0.009,-0.015
rumor3!,-0.02,-0.036,-0.003,-0.01,...,0.048,-0.019,0.018,0.019
rumor4,0.209,-0.023,0.094,-0.056,...,-0.14,-0.029,0.079,0.213
rumor5,0.114,0.178,-0.026,-0.027,...,-0.001,-0.071,0.008,-0.049


#### 基于LSA的垃圾短消息分类的效果

In [81]:
import numpy as np
svd_topic_vectors = (svd_topic_vectors.T / np.linalg.norm(svd_topic_vectors,axis=1)).T
svd_topic_vectors.iloc[:10].dot(svd_topic_vectors.iloc[:10].T).round(1)

Unnamed: 0,rumor0,rumor1,rumor2!,rumor3!,...,rumor6,rumor7,rumor8!,rumor9
rumor0,1.0,-0.4,0.2,0.4,...,0.1,0.1,0.0,-0.3
rumor1,-0.4,1.0,0.4,-0.1,...,0.3,-0.1,0.4,0.1
rumor2!,0.2,0.4,1.0,0.3,...,0.4,-0.6,0.5,-0.4
rumor3!,0.4,-0.1,0.3,1.0,...,0.1,-0.3,0.1,-0.1
rumor4,0.2,-0.4,-0.0,-0.1,...,-0.1,0.2,-0.3,-0.1
rumor5,-0.3,0.6,-0.1,-0.2,...,0.1,0.3,-0.0,0.3
rumor6,0.1,0.3,0.4,0.1,...,1.0,-0.4,0.4,-0.3
rumor7,0.1,-0.1,-0.6,-0.3,...,-0.4,1.0,-0.0,0.4
rumor8!,0.0,0.4,0.5,0.1,...,0.4,-0.0,1.0,-0.1
rumor9,-0.3,0.1,-0.4,-0.1,...,-0.3,0.4,-0.1,1.0
