# LSI text preprocessing example

In [1]:
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
from Common.DataCenter import data_center
import pandas as pd

首先给出官方文档中的例子

In [2]:
# Official gensim example: fit an LSI model on the bundled toy corpus, with the
# bundled dictionary supplying the id -> word mapping.
model = LsiModel(corpus=common_corpus, id2word=common_dictionary)
# Indexing the fitted model with a bag-of-words corpus yields its LSI vectors.
vectorized_corpus = model[common_corpus]

非常简单粗暴：传入corpus和字典（id2word），就能构造出LSI（即LSA）模型。得到模型以后，再把corpus传给模型（model[corpus]），就能得到corpus在latent space下的向量。下面我们从头构造，来了解一下它是怎么运作的。

举例： 如何构造corpus

In [3]:
from collections import defaultdict
from gensim import corpora
# The document collection: a list in which every element is one raw string.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Tokenize: lower-case each document, split on whitespace, drop stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    texts.append([word for word in document.lower().split() if word not in stoplist])

# Count how often every token occurs across the whole collection.
frequency = defaultdict(int)
for tokens in texts:
    for word in tokens:
        frequency[word] += 1

# Drop rare words: keep only tokens that occur more than once overall.
texts = [[word for word in tokens if frequency[word] > 1] for tokens in texts]

# Register every remaining word in a Dictionary, which assigns the ids.
dictionary = corpora.Dictionary(texts)
# Encode each document as [(id1, count), (id2, count), ...]; every distinct
# word corresponds to exactly one id.
corpus = list(map(dictionary.doc2bow, texts))

doc2bow可以将文档转换为内部定义的数字token

In [4]:
# Encode a tokenized document as (token_id, count) pairs using the ids held by `dictionary`.
dictionary.doc2bow(['survey', 'graph', 'graph'])

[(4, 1), (10, 2)]

这里的4代表survey，后面的数字1代表survey出现了一次；10代表graph，后面的2代表出现了2次。  
总结一下，我们的文档库用列表表示，列表的每个元素是一个字符串。我们先手工将字符串转换成小写并按空格分割，去掉stop words和只出现过一次的词，这样每个文档就对应一个字符串数组，数组的每个元素是一个单词。然后通过Dictionary，将这个二维数组转换为dictionary：dictionary保存了所有单词，并为每个单词分配一个索引（id）。  
接下来对于文档库的每一个文档，用doc2bow函数，转换为[(id1, number), (id2, number), ...]的形式。  
现在使用LsiModel

In [5]:
# Fit LSI on the hand-built corpus, with `dictionary` as the id -> word mapping.
testModel = LsiModel(corpus=corpus, id2word=dictionary)
# Project the corpus with the model that was just fitted on it. The earlier
# `model` was fitted on gensim's bundled common_corpus/common_dictionary, so it
# is the wrong model for a corpus encoded with `dictionary`.
vec_corpus = testModel[corpus]

把topic按照重要程度列出来，每个默认列前10个单词

In [6]:
# List the fitted topics; in the output below each topic is a weighted sum of its 10 leading words.
testModel.show_topics()

[(0,
  '0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"'),
 (1,
  '0.623*"graph" + 0.490*"trees" + 0.451*"minors" + 0.274*"survey" + -0.167*"system" + -0.141*"eps" + -0.113*"human" + 0.107*"response" + 0.107*"time" + -0.072*"interface"'),
 (2,
  '0.426*"response" + 0.426*"time" + -0.361*"system" + 0.338*"user" + -0.330*"eps" + -0.289*"human" + -0.231*"trees" + -0.223*"graph" + 0.178*"survey" + 0.164*"computer"'),
 (3,
  '0.595*"computer" + 0.552*"interface" + 0.415*"human" + -0.333*"system" + -0.188*"eps" + -0.099*"user" + -0.074*"response" + -0.074*"time" + 0.032*"survey" + -0.025*"trees"'),
 (4,
  '0.594*"trees" + -0.537*"survey" + 0.332*"user" + -0.300*"minors" + 0.282*"interface" + -0.159*"system" + 0.115*"eps" + -0.107*"computer" + -0.106*"human" + 0.080*"time"'),
 (5,
  '0.496*"interface" + -0.392*"trees" + 0.385*"user" + -0.341*"human" + 0.277*"minors" + 0.272*"e

下面给定一句话(query)，将其转换到latent space

In [7]:
# Fold a free-text query into the latent space of `testModel`.
doc = "Human computer interaction"
# Apply the same lower-case / whitespace tokenization used to build the corpus.
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
# Indexing the model with a bag-of-words vector converts it to LSI space.
vec_lsi = testModel[vec_bow]
print(vec_lsi)
# For comparison, display the LSI vector of the first corpus document.
testModel[corpus[0]]

[(0, 0.4618210045327156), (1, -0.07002766527900064), (2, -0.1245290755189914), (3, 1.0097125584438542), (4, -0.21303040605626514), (5, -0.5959384533820675), (6, -0.22041753546094384), (7, -0.001877877355475073), (8, 0.08576685494995573)]


[(0, 0.6594664059797393),
 (1, -0.14211544403729992),
 (2, -0.25956871420842187),
 (3, 1.561952142099364),
 (4, 0.06873853289228085),
 (5, -0.10006044227146021),
 (6, -0.14999409428716526),
 (7, 0.008062159852297848),
 (8, -0.02316341061634601)]

我们能看出，它给出的格式是一个tuple列表，tuple的第一个元素代表topic id，第二个元素就代表在该topic下的取值。从而得到latent space下的向量。  
下面使用data center中的数据，训练LSA，然后算出latent space下的vector

In [8]:
def dc_format(D):
    """Pack a (messages, sentiments) pair into a two-column DataFrame.

    Parameters
    ----------
    D : indexable
        ``D[0]`` becomes the ``message`` column and ``D[1]`` becomes the
        ``sentiment`` column.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``message`` and ``sentiment``, in that order.
    """
    messages, sentiments = D[0], D[1]
    return pd.DataFrame({'message': messages, 'sentiment': sentiments})

In [9]:
# Load the test and validation sets.
# NOTE(review): the CSV path is relative, so it resolves against the kernel's working directory.
dc = data_center('./twitter_sentiment_data.csv', test_size=8000, noisy_size=8000, validation_size=5000)
# Each split is passed through dc_format, which builds a message/sentiment DataFrame from it.
test_df = dc_format(dc.get_test())
val_df = dc_format(dc.get_validation())

# Report the row counts of both frames.
print(f"Test size: {test_df.shape[0]}")
print(f"Validation size: {val_df.shape[0]}")

Test size: 8000
Validation size: 5000


In [10]:
# Load the training sets: one frame per size argument passed to dc.get_train,
# requested in the same ascending order and bound to size-suffixed names.
(
    train_df_2000,
    train_df_2500,
    train_df_4000,
    train_df_5000,
    train_df_7500,
    train_df_10000,
) = [
    dc_format(dc.get_train(size))
    for size in (2000, 2500, 4000, 5000, 7500, 10000)
]

只用train_df_2000做测试，message那一列就是文本数据

In [11]:
# Convert the `message` column into a plain Python list of documents.
trainDocs = train_df_2000['message'].tolist()

In [12]:
def Document2Corpus(documents, stoplist=None, min_count=2):
    """Tokenize raw documents and encode them as a gensim bag-of-words corpus.

    Each document is lower-cased and split on whitespace. Stop words are
    removed, then every token whose total count over the whole collection is
    below ``min_count`` is dropped. The surviving tokens are registered in a
    ``corpora.Dictionary`` and every document is encoded with ``doc2bow``.

    Parameters
    ----------
    documents : iterable of str
        Raw documents, one string per document.
    stoplist : str or iterable of str, optional
        Lower-case words to discard; a single string is split on whitespace.
        Defaults to the original hard-coded list: for, a, of, the, and, to, in.
    min_count : int, optional
        Minimum collection-wide frequency a token needs in order to be kept.
        The default of 2 removes words that appear only once.

    Returns
    -------
    tuple
        ``(dictionary, corpus)``, where ``corpus[i]`` is the ``doc2bow``
        encoding of the i-th filtered document.
    """
    if stoplist is None:
        stoplist = 'for a of the and to in'
    if isinstance(stoplist, str):
        # A bare string would otherwise turn into a set of single characters.
        stoplist = stoplist.split()
    stoplist = set(stoplist)

    # Lower-case, split on whitespace and remove the stop words.
    texts = [
        [word for word in document.lower().split() if word not in stoplist]
        for document in documents
    ]

    # Collection-wide token frequencies, used to remove rare words.
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [
        [token for token in text if frequency[token] >= min_count]
        for text in texts
    ]

    # Register every remaining word in the dictionary, which assigns the ids.
    dictionary = corpora.Dictionary(texts)
    # Encode each document as [(id1, count), (id2, count), ...].
    corpus = [dictionary.doc2bow(text) for text in texts]
    return (dictionary, corpus)

现在根据文档库，构造dictionary和corpus

In [13]:
# Build the dictionary and the bag-of-words corpus from the training documents.
trainDict, trainCorpus = Document2Corpus(trainDocs)

构造LSI model

In [14]:
# Build the LSI model on the training corpus, requesting 20 topics.
lsiModel = LsiModel(corpus=trainCorpus, id2word=trainDict, num_topics=20)


In [15]:
# Smoke test: project a short query string into the LSI space.
testStr = "Hello world"
# Tokenize the same way as the training documents (lower-case, whitespace split),
# then encode with the training dictionary.
query = trainDict.doc2bow(testStr.lower().split())
queryLsi = lsiModel[query]  # convert the query to LSI space
queryLsi

[(0, 0.015947732435792686),
 (1, -0.0032388213615168614),
 (2, 0.006381591182950672),
 (3, 0.02028579259522966),
 (4, 0.002988925702525963),
 (5, 0.005793669219836691),
 (6, 0.020151713433651453),
 (7, 0.006790440958006139),
 (8, -0.005148689615160713),
 (9, 0.010112700688297624),
 (10, 0.0056909523299113075),
 (11, 0.003550866501387658),
 (12, -0.0011754495761561506),
 (13, 0.0020487974261467275),
 (14, 0.02114851858916627),
 (15, -0.02434774221721605),
 (16, -0.006876773270630155),
 (17, 0.010336101640595164),
 (18, 0.021279373888977382),
 (19, 0.021126892219531283)]

In [16]:
# Turn the sparse [(topic_id, value), ...] result into a dense list of values.
df = pd.DataFrame(queryLsi, columns=['index','vec'])
# The sparse result can omit topics (near-zero weights, or every topic when no
# query word is in the dictionary). Reindexing over all topic ids keeps the
# vector at a fixed length of num_topics, with position i holding topic i.
queryVec = (
    df.set_index('index')['vec']
    .reindex(range(lsiModel.num_topics), fill_value=0.0)
    .tolist()
)
queryVec

[0.015947732435792686,
 -0.0032388213615168614,
 0.006381591182950672,
 0.02028579259522966,
 0.002988925702525963,
 0.005793669219836691,
 0.020151713433651453,
 0.006790440958006139,
 -0.005148689615160713,
 0.010112700688297624,
 0.0056909523299113075,
 0.003550866501387658,
 -0.0011754495761561506,
 0.0020487974261467275,
 0.02114851858916627,
 -0.02434774221721605,
 -0.006876773270630155,
 0.010336101640595164,
 0.021279373888977382,
 0.021126892219531283]

下面看sklearn所带的LSA

In [17]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
import numpy as np

首先可以构造document term matrix(dtm)

In [18]:
# Toy collection of seven short sentences for the scikit-learn walkthrough.
example = [
    "Machine learning is super fun",
    "Python is super, super cool",
    "Statistics is cool, too",
    "Data science is fun",
    "Python is great for machine learning",
    "I like football",
    "Football is great to watch",
]
# Count-based vectorizer using scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english', min_df=1)
dtm = vectorizer.fit_transform(example)
# Display the counts with documents as rows and vocabulary terms as columns.
pd.DataFrame(
    dtm.toarray(),
    index=example,
    columns=vectorizer.get_feature_names_out(),
).head(10)

Unnamed: 0,cool,data,football,fun,great,learning,like,machine,python,science,statistics,super,watch
Machine learning is super fun,0,0,0,1,0,1,0,1,0,0,0,1,0
"Python is super, super cool",1,0,0,0,0,0,0,0,1,0,0,2,0
"Statistics is cool, too",1,0,0,0,0,0,0,0,0,0,1,0,0
Data science is fun,0,1,0,1,0,0,0,0,0,1,0,0,0
Python is great for machine learning,0,0,0,0,1,1,0,1,1,0,0,0,0
I like football,0,0,1,0,0,0,1,0,0,0,0,0,0
Football is great to watch,0,0,1,0,1,0,0,0,0,0,0,0,1


上面得到的是doc-term matrix（行是文档，列是单词）；课上讲的是term-doc matrix，二者互为转置。

In [19]:
# Display the document-term matrix (documents as rows, terms as columns).
# NOTE(review): identical to the display in the previous cell; if the term-doc
# orientation was meant to be shown here, this would need a transpose — confirm.
pd.DataFrame(dtm.toarray(),index=example,columns=vectorizer.get_feature_names_out()).head(10)

Unnamed: 0,cool,data,football,fun,great,learning,like,machine,python,science,statistics,super,watch
Machine learning is super fun,0,0,0,1,0,1,0,1,0,0,0,1,0
"Python is super, super cool",1,0,0,0,0,0,0,0,1,0,0,2,0
"Statistics is cool, too",1,0,0,0,0,0,0,0,0,0,1,0,0
Data science is fun,0,1,0,1,0,0,0,0,0,1,0,0,0
Python is great for machine learning,0,0,0,0,1,1,0,1,1,0,0,0,0
I like football,0,0,1,0,0,0,1,0,0,0,0,0,0
Football is great to watch,0,0,1,0,1,0,0,0,0,0,0,0,1


将这个document-term matrix进行SVD分解，并且只保留2个topic

In [20]:
# Fit LSA with two components. Use algorithm="randomized" for large datasets.
lsa = TruncatedSVD(n_components=2, algorithm='arpack')
# Cast the count matrix to float before the decomposition
# (presumably because the ARPACK solver needs floating-point input).
dtmfloat = dtm.astype(float)
# Project the documents onto the two components, then normalize the rows of the
# projection in place (copy=False).
dtm_lsa = Normalizer(copy=False).fit_transform(lsa.fit_transform(dtmfloat))

下面打印一下矩阵，转换为我们熟悉的$USV^T$的命名方式

In [21]:
# Label lsa.components_: one row per component, one column per vocabulary term.
UT = pd.DataFrame(
    lsa.components_,
    columns=vectorizer.get_feature_names_out(),
    index=["component_1", "component_2"],
)
# Transpose so that terms are rows and components are columns.
U = UT.transpose()
U

Unnamed: 0,component_1,component_2
cool,0.280004,0.36527
data,0.035353,-0.064548
football,0.033417,-0.298349
fun,0.223993,-0.168056
great,0.178307,-0.478428
learning,0.338085,-0.366379
like,0.004555,-0.082792
machine,0.338085,-0.366379
python,0.391281,0.001036
science,0.035353,-0.064548


In [22]:
# Label the normalized document projections: one row per sentence, one column per component.
V = pd.DataFrame(dtm_lsa, index = example, columns = ["component_1","component_2"])
V

Unnamed: 0,component_1,component_2
Machine learning is super fun,0.957024,-0.290007
"Python is super, super cool",0.856484,0.516174
"Statistics is cool, too",0.563355,0.826215
Data science is fun,0.704171,-0.71003
Python is great for machine learning,0.717284,-0.696781
I like football,0.099136,-0.995074
Football is great to watch,0.235618,-0.971846


现在测试query，将query构造为pseudo document，使用
$$
query = q^T U_2 S_2
$$

我们现在拿到一个字符串，作为q，将其转换为query

In [23]:
# Place the fitted singular values on the diagonal of a square matrix.
S = np.diag(lsa.singular_values_)
S

array([[2.88718938, 0.        ],
       [0.        , 2.14559669]])

In [24]:
# Turn the query string into a pseudo document with the already-fitted vectorizer.
q = vectorizer.transform(["fun great"])

In [25]:
# Map the pseudo document into the latent space: q times the term matrix U times the diagonal S.
# NOTE(review): the textbook LSI fold-in scales by the inverse of S (q^T U S^-1);
# confirm that multiplying by S itself is what is intended here.
q@U@S

array([[ 1.16151543, -1.38709204]])

测试LSI.py中的代码

In [26]:
from Common.LSI import SKLearnLSA
# The same seven sentences as the `example` list used earlier.
docs = ["Machine learning is super fun",
"Python is super, super cool",
"Statistics is cool, too",
"Data science is fun",
"Python is great for machine learning",
"I like football",
"Football is great to watch"]

# NOTE(review): this rebinds `model`, which earlier held the gensim LsiModel
# fitted on common_corpus.
model = SKLearnLSA()
# Presumably the second argument is the number of components to keep — verify against Common.LSI.
model.BuildModel(docs,2)
# Display the U attribute of the built model.
model.U

Unnamed: 0,component_1,component_2
cool,0.280004,0.36527
data,0.035353,-0.064548
football,0.033417,-0.298349
fun,0.223993,-0.168056
great,0.178307,-0.478428
learning,0.338085,-0.366379
like,0.004555,-0.082792
machine,0.338085,-0.366379
python,0.391281,0.001036
science,0.035353,-0.064548


In [27]:
# Display the V attribute of the built model.
model.V

Unnamed: 0,component_1,component_2
Machine learning is super fun,0.957024,-0.290007
"Python is super, super cool",0.856484,0.516174
"Statistics is cool, too",0.563355,0.826215
Data science is fun,0.704171,-0.71003
Python is great for machine learning,0.717284,-0.696781
I like football,0.099136,-0.995074
Football is great to watch,0.235618,-0.971846


In [28]:
# Display the S attribute of the built model.
model.S

array([[2.88718938, 0.        ],
       [0.        , 2.14559669]])

In [29]:
# Project the same query string as before through the helper class; the output
# below matches the manual q@U@S result.
qstrlist = ["fun great"]
model.Query2LatentSpace(qstrlist)

array([[ 1.16151543, -1.38709204]])