In [1]:
import regex as re
import pandas as pd
import os
import random
import numpy as np
from imblearn.over_sampling import SMOTE

#### 加载自制中文数据集、分词、去除停用词

In [2]:
data = pd.read_csv('smalldata_washed.csv')
rumor = data['rumor'].to_list()
reverse = data['reverse'].to_list()
rumor_class = len(rumor)*[1]
reverse_class = len(reverse)*[0]
data = rumor + reverse
data_class = rumor_class + reverse_class
#后续添加数据要放在raw目录下，使用.xlsx格式将后续标注的数据加入data
raw_data_list = os.listdir('raw')
data_list = []
good_name = re.compile(r'^(?!(\~\$)).*(.xlsx)')
for i in raw_data_list:
    if good_name.match(i):
    #    data_list.append(i)
        temp = pd.read_excel('raw/'+i)
        temp.fillna('',inplace=True)
        temp_rumor = [x.strip() for x in  temp['谣言'].to_list()  if x.strip()!='']
        temp_reverse = [x.strip() for x in  temp['真相'].to_list()  if x.strip()!='']
        temp_rumor_class = len(temp_rumor)*[1]
        temp_reverse_class = len(temp_reverse)*[0]
        temp_data = temp_rumor + temp_reverse
        temp_data_class = temp_rumor_class + temp_reverse_class
        for i in range( len(temp_data)):
            temp_data[i]=re.sub(r'[\,\'\ ]|(\n)','',temp_data[i])
    data = data + temp_data
    data_class = data_class + temp_data_class
with open('data.txt','w') as f:
    for i in range(len(data)):
        f.write(data[i]+'\t'+str(data_class[i])+'\n')
        data_list =list( zip(data, data_class))
random.shuffle(data_list)
data, data_class = zip(*data_list)
index = ['rumor{}{}'.format(i, '!' * j) for (i, j) in zip(range(len(data)), data_class)]
df = pd .DataFrame(data=zip(data_class,data), columns=["rumor","text"], index=index)

# 去除停用词
with open ('hit_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.read().splitlines()

# jieba分词
import jieba
for i in range(len(df)):
    words = jieba.cut(df["text"].iloc[i],cut_all=False)
    words = [word for word in words if word not in stopwords]
    df["text"].iloc[i] = ' '.join(words) 

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/f0/6j6j3w814n53skbt2s4d8p5m0000gn/T/jieba.cache
Loading model cost 0.282 seconds.
Prefix dict has been built successfully.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["text"].iloc[i] = ' '.join(words)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import casual_tokenize
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=df.text).toarray()
len(tfidf.vocabulary_)



4021

In [4]:
tfidf_docs = pd.DataFrame(tfidf_docs)
#通过减去每个文档（词袋向量）的平均值来中心化文档
tfidf_docs = tfidf_docs - tfidf_docs.mean()
tfidf_docs.shape
df.rumor.sum()

330

#### 基于PCA的短消息语义分析

In [5]:
from sklearn.decomposition import PCA
pca = PCA(n_components=16)
pca_docs = pca.fit_transform(tfidf_docs)
pca_topic_vectors = pca.transform(tfidf_docs)
columns = ['topic{}'.format(i) for i in range(pca_docs.shape[1])]
pca_topic_vectors = pd.DataFrame(pca_topic_vectors, columns=columns, index=index)
pca_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
rumor0!,0.044,-0.084,-0.001,0.041,0.003,0.05,-0.063,-0.061,0.078,-0.04,0.146,-0.126,-0.004,0.026,0.008,0.07
rumor1!,0.144,-0.047,0.054,0.057,-0.032,0.022,-0.008,-0.036,0.002,-0.025,0.029,-0.067,-0.005,0.015,0.036,0.003
rumor2,-0.083,-0.073,0.09,-0.019,-0.122,0.299,-0.086,0.149,0.186,-0.203,-0.161,0.248,-0.237,-0.141,0.018,0.026
rumor3!,0.032,-0.117,-0.14,-0.257,-0.167,-0.165,-0.002,0.087,0.089,-0.062,-0.035,-0.084,-0.012,0.127,0.017,0.004
rumor4,-0.062,-0.041,0.054,-0.018,-0.008,-0.015,-0.008,-0.022,-0.014,-0.064,0.003,0.056,-0.004,-0.039,0.021,-0.001
rumor5!,-0.015,-0.037,-0.009,0.005,-0.016,0.029,0.01,-0.013,-0.022,0.024,0.007,0.007,0.001,-0.017,0.003,-0.022


In [6]:
tfidf.vocabulary_
#根据词项的频率对词汇表进行排序
#当对某个不按照最左边元素排序的序列解压并在排序后重新压缩时，可以使用zip(*sorted(zip(...)))
column_nums , terms = zip(*sorted(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys())))


In [7]:
weights = pd.DataFrame(pca.components_, columns=terms,
                       index = ['topic{}'.format(i) for i in range(pca.components_.shape[0])])
pd.options.display.max_columns = 8
weights.head(4).round(3)

Unnamed: 0,%,0,0t,1,...,鼓励,鼻出血,鼻翼,龋齿
topic0,-0.001,-0.001,-0.004,-0.003,...,-0.001,-0.004,-0.002,-0.003
topic1,-0.003,-0.0,-0.002,-0.011,...,-0.002,-0.008,-0.002,-0.004
topic2,-0.001,0.001,0.001,-0.008,...,-0.001,-0.0,0.0,-0.001
topic3,0.002,-0.0,0.003,0.007,...,-0.003,0.001,0.001,0.003


#### 基于截断的SVD的短消息语义分析（recommend）

In [8]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=16,n_iter=100)
svd_topic_vectors = svd.fit_transform(tfidf_docs.values)
svd_topic_vectors = pd.DataFrame(svd_topic_vectors, columns=columns, index=index)
svd_topic_vectors.round(3).head(6)

Unnamed: 0,topic0,topic1,topic2,topic3,...,topic12,topic13,topic14,topic15
rumor0!,0.044,-0.084,-0.006,0.042,...,0.026,-0.013,0.019,0.02
rumor1!,0.144,-0.047,0.05,0.055,...,-0.002,0.004,0.03,0.02
rumor2,-0.082,-0.075,0.087,-0.025,...,-0.209,-0.2,0.031,-0.004
rumor3!,0.032,-0.117,-0.145,-0.261,...,-0.027,0.068,-0.021,0.039
rumor4,-0.062,-0.041,0.056,-0.02,...,-0.002,-0.024,0.021,-0.044
rumor5!,-0.015,-0.037,-0.009,0.004,...,0.008,-0.005,0.006,0.001


#### 基于LSA的垃圾短消息分类的效果

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(svd_topic_vectors,df.rumor,test_size=0.5,random_state=42)
smote = SMOTE(random_state=42)
# 进行过采样
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print("过采样前训练集中类别0的数量：", sum(y_train==0))
print("过采样前训练集中类别1的数量：", sum(y_train==1))

print("过采样后训练集中类别0的数量：", sum(y_train_res==0))
print("过采样后训练集中类别1的数量：", sum(y_train_res==1))
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
df['LSA16_rumor'] = lda.predict(svd_topic_vectors)
round(float(lda.score(X_test, y_test)), 3)

过采样前训练集中类别0的数量： 231
过采样前训练集中类别1的数量： 168
过采样后训练集中类别0的数量： 231
过采样后训练集中类别1的数量： 231


0.69

In [10]:
import numpy as np
svd_topic_vectors = (svd_topic_vectors.T / np.linalg.norm(svd_topic_vectors,axis=1)).T
svd_topic_vectors.iloc[:10].dot(svd_topic_vectors.iloc[:10].T).round(1)

Unnamed: 0,rumor0!,rumor1!,rumor2,rumor3!,...,rumor6,rumor7,rumor8,rumor9
rumor0!,1.0,0.6,-0.2,0.0,...,-0.4,-0.5,-0.2,-0.1
rumor1!,0.6,1.0,-0.1,-0.1,...,0.0,-0.5,0.2,-0.2
rumor2,-0.2,-0.1,1.0,-0.1,...,-0.1,-0.2,0.0,0.0
rumor3!,0.0,-0.1,-0.1,1.0,...,-0.2,-0.1,-0.1,0.3
rumor4,-0.1,-0.3,0.5,-0.1,...,0.0,0.1,-0.1,0.5
rumor5!,0.1,0.0,0.1,-0.1,...,-0.5,-0.1,-0.1,-0.1
rumor6,-0.4,0.0,-0.1,-0.2,...,1.0,0.4,0.3,0.3
rumor7,-0.5,-0.5,-0.2,-0.1,...,0.4,1.0,-0.0,0.2
rumor8,-0.2,0.2,0.0,-0.1,...,0.3,-0.0,1.0,0.0
rumor9,-0.1,-0.2,0.0,0.3,...,0.3,0.2,0.0,1.0
