In [25]:
import pandas as pd
from nlpia.data.loaders import get_data
pd.options.display.max_columns = 120

In [26]:
sms = get_data('sms-spam')
index = ['sms{}{}'.format(i, '!' * j) for (i, j) in zip(range(len(sms)), sms.spam)]
sms.index = index
sms.head(6)
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import casual_tokenize
tfidf = TfidfVectorizer(tokenizer=casual_tokenize)
tfidf_docs = tfidf.fit_transform(raw_documents=sms.text).toarray()
len(tfidf.vocabulary_)



9232

#### 隐形狄利克雷分布原理
- 设想一台自动生成文档的机器，只有两个选项来控制生成文档的两个属性
    - 生成文档的词的数量（泊松分布）
    - 文档中混合的主题的数量（狄利克雷分布）
- 除此之外，最关键的问题在于确定一个主题-词项矩阵，该矩阵表示了每个词对主题的贡献权重。一但能够确定这个矩阵，机器就可以在选择好的主题上反复迭代选择词，直到生成一篇足够长度的文档。
- 回到对现有文档估计主题的问题上来，LDiA可以用于关于词和主题的参数，Blei和Ng通过分析语料库中文档的统计数据确定两个参数
    - 第一个参数可以通过计算语料库的平均词（n-gram）数量、
    - 第二个参数更棘手，必须先猜测有几个主题数再为主题分配语词，最后优化目标函数
-  “LSA试图将原本分散的东西分散开，LDiA试图将原本接近的东西接近在一起”

In [27]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import casual_tokenize
np.random.seed(42)

counter = CountVectorizer(tokenizer=casual_tokenize)
bow_docs = pd.DataFrame(counter.fit_transform(raw_documents=sms.text)
                        .toarray(), index=index)
column_nums, terms = zip(*sorted(zip(counter.vocabulary_.values(),
                                     counter.vocabulary_.keys())))
bow_docs.columns = terms

In [28]:
sms.loc['sms0'].text

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [29]:
bow_docs.loc['sms0'][bow_docs.loc['sms0'] > 0].head()

,            1
..           1
...          2
amore        1
available    1
Name: sms0, dtype: int64

In [30]:
from sklearn.decomposition import LatentDirichletAllocation as LDiA
ldia = LDiA(n_components=16, learning_method='batch')
ldia = ldia.fit(bow_docs)  # <1>
ldia.components_.shape

(16, 9232)

In [31]:
columns = ['topic{}'.format(i) for i in range(16)]
pd.set_option('display.width', 2)
componets = pd.DataFrame(ldia.components_.T,index = terms ,columns=columns)
componets.round(2).head(3)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
!,184.03,15.0,72.22,394.95,45.48,36.14,9.55,44.81,0.43,90.23,37.42,44.18,64.4,297.29,41.16,11.7
"""",0.68,4.22,2.41,0.06,152.35,0.06,0.06,0.06,0.45,0.68,8.42,11.42,0.07,62.72,12.27,0.06
#,0.06,0.06,0.06,0.06,0.06,2.07,0.06,0.06,0.06,0.06,0.06,0.06,1.07,4.05,0.06,0.06


In [32]:
componets.topic3.sort_values(axis=0, ascending=False).head(10)

!       394.952246
.       218.049724
to      119.533134
u       118.857546
call    111.948541
£       107.358914
,        96.954384
*        90.314783
your     90.215961
is       75.750037
Name: topic3, dtype: float64

In [33]:
ldia16_topic_vectors = ldia.transform(bow_docs)
ldia16_topic_vectors = pd.DataFrame(ldia16_topic_vectors,
                                    index=index, columns=columns)
ldia16_topic_vectors.round(2).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.0,0.62,0.0,0.0,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sms1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.78,0.01,0.01,0.12,0.01,0.01,0.01,0.01
sms2!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0
sms3,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sms4,0.39,0.0,0.33,0.0,0.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0


#### 使用LDiA主题向量训练LDA模型

In [34]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(ldia16_topic_vectors,sms.spam,test_size=0.5,random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
sms['ldia16_spam'] = lda.predict(ldia16_topic_vectors)
round(float(lda.score(X_test, y_test)), 3)

0.94

In [35]:
ldia32_topic_vectors = ldia.transform(bow_docs)
ldia32_topic_vectors = pd.DataFrame(ldia32_topic_vectors,
                                    index=index, columns=columns)
ldia32_topic_vectors.round(2).head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,topic10,topic11,topic12,topic13,topic14,topic15
sms0,0.0,0.62,0.0,0.0,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sms1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.78,0.01,0.01,0.12,0.01,0.01,0.01,0.01
sms2!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0
sms3,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sms4,0.39,0.0,0.33,0.0,0.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0


In [36]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(ldia32_topic_vectors,sms.spam,test_size=0.5,random_state=271828)
lda = LDA(n_components=1)
lda = lda.fit(X_train, y_train)
sms['ldia32_spam'] = lda.predict(ldia32_topic_vectors)
round(float(lda.score(X_test, y_test)), 3)

0.94