In [29]:
import pandas as pd
from nlpia.data.loaders import get_data

#### 从nlpia包中导入垃圾短消息数据集sms-spam

In [30]:
# Widen pandas' text rendering so wide frames are not wrapped early.
pd.options.display.width = 120

# Fetch the 'sms-spam' dataset through nlpia's loader.
sms = get_data('sms-spam')

# Row labels look like "sms<i>", with "!" repeated `spam` times appended,
# so spam rows can be recognised straight from the index.
index = [
    'sms{}{}'.format(row_number, '!' * spam_flag)
    for row_number, spam_flag in enumerate(sms.spam)
]

# Rebuild the frame on the new labels, then make sure the spam column is
# integer-typed so it can be summed and compared numerically later.
sms = pd.DataFrame(sms.values, columns=sms.columns, index=index)
sms['spam'] = sms['spam'].astype(int)

In [31]:
# Total number of SMS messages (rows) in the dataset.
len(sms)

4837

In [32]:
# Sum of the integer spam flags, i.e. how many messages are labelled spam.
sms.spam.sum()

638

In [33]:
# Preview the first six rows; spam rows carry a trailing "!" in their index label.
sms.head(6)

Unnamed: 0,spam,text
sms0,0,"Go until jurong point, crazy.. Available only ..."
sms1,0,Ok lar... Joking wif u oni...
sms2!,1,Free entry in 2 a wkly comp to win FA Cup fina...
sms3,0,U dun say so early hor... U c already then say...
sms4,0,"Nah I don't think he goes to usf, he lives aro..."
sms5!,1,FreeMsg Hey there darling it's been 3 week's n...


#### 使用sklearn计算TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize

In [35]:
# TF-IDF vectorizer that splits each message with NLTK's casual_tokenize.
tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)

# fit_transform learns the vocabulary and computes the TF-IDF weights in one
# pass; the result is converted to a dense array for the centroid arithmetic
# that follows.
tfidf_matrix = tfidf_model.fit_transform(raw_documents=sms.text)
tfidf_docs = tfidf_matrix.toarray()

# (number of messages, vocabulary size)
tfidf_docs.shape



(4837, 9232)

In [36]:
# Number of messages labelled spam.
# NOTE(review): this repeats the spam-count cell shown right after the data
# was loaded; it is a candidate for removal.
sms.spam.sum()

638

#### 使用LDA(线性判别分析)语义分析技术进行分类:计算两类短消息的质心

In [37]:
# Boolean selector over the rows: True where the message is labelled spam.
mask = sms['spam'].astype(bool).values

# Class centroids: the column-wise mean TF-IDF vector of the spam rows and
# of the remaining (ham) rows.
spam_centroid = tfidf_docs[mask].mean(axis=0)
ham_centroid = tfidf_docs[~mask].mean(axis=0)

In [38]:
# Inspect the spam centroid (mean TF-IDF vector of the spam rows), rounded to two decimals.
spam_centroid.round(2)

array([0.06, 0.  , 0.  , ..., 0.  , 0.  , 0.  ])

In [39]:
# Inspect the ham centroid (mean TF-IDF vector of the non-spam rows), rounded to two decimals.
ham_centroid.round(2)

array([0.02, 0.01, 0.  , ..., 0.  , 0.  , 0.  ])

In [40]:
# Vector pointing from the ham centroid to the spam centroid.
centroid_direction = spam_centroid - ham_centroid

# Project every message's TF-IDF vector onto that direction: a larger dot
# product means the message lies further toward the spam side.
spamminess_score = tfidf_docs.dot(centroid_direction)
spamminess_score.round(2)

array([-0.01, -0.02,  0.04, ..., -0.01, -0.  ,  0.  ])

#### 使用LDA模型预测
-  将得分用 MinMaxScaler 缩放到 [0, 1] 后,lda_score 大于 0.5 的短消息被判定为垃圾短消息(spam)

In [41]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the raw projection scores into [0, 1]; the scaler is given the
# scores reshaped into a single 2-D column.
score_column = spamminess_score.reshape(-1, 1)
sms['lda_score'] = MinMaxScaler().fit_transform(score_column)

# Predict spam (1) when the scaled score exceeds the 0.5 threshold, else ham (0).
sms['lda_pred'] = (sms['lda_score'] > .5).astype(int)

# Compare label, prediction and score side by side for the first six messages.
sms['spam lda_pred lda_score'.split()].round(2).head(6)

Unnamed: 0,spam,lda_pred,lda_score
sms0,0,0,0.23
sms1,0,0,0.18
sms2!,1,1,0.72
sms3,0,0,0.18
sms4,0,0,0.29
sms5!,1,1,0.55


In [42]:
# Accuracy = 1 - (share of messages whose prediction differs from the label).
# NOTE: this is measured on the same messages the centroids were computed
# from, so it is training accuracy, not a held-out estimate.
n_mismatched = (sms.spam - sms.lda_pred).abs().sum()
(1. - n_mismatched / len(sms)).round(4)

0.9775