# 5.2 LSA


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

### Load Data


In [2]:
# Load the news articles from a CSV file located relative to the working directory.
data = pd.read_csv("news_articles.csv")


In [3]:
# Preview the first rows to check the columns (id, title, content) loaded as expected.
data.head()


Unnamed: 0,id,title,content
0,25626,"One Weight-Loss Approach Fits All? No, Not Eve...","Dr. Frank Sacks, a professor of nutrition at H..."
1,19551,South Carolina Stuns Baylor to Reach the Round...,South Carolina’s win over Duke was not only ...
2,25221,"U.S. Presidential Race, Apple, Gene Wilder: Yo...",(Want to get this briefing by email? Here’s th...
3,18026,"His Predecessor Gone, Gambia’s New President F...","BANJUL, Gambia — A week after he was inaugu..."
4,21063,‘Harry Potter and the Cursed Child’ Goes From ...,The biggest book of the summer isn’t a blockbu...


In [4]:
# Summarise the frame: row count, column dtypes and non-null counts per column.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       100 non-null    int64 
 1   title    100 non-null    object
 2   content  100 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


In [5]:
# Draw a 20-article subset to work with. A fixed random_state makes the draw
# repeatable, so the same articles are selected on every run (e.g. after
# Restart Kernel -> Run All). reset_index() renumbers the rows from 0 and keeps
# the original row labels in an 'index' column.
data_sample = data.sample(20, random_state=42).reset_index()


In [6]:
# Keep only the article body, lowercase it, then delete every character that is
# neither a word character nor whitespace (i.e. strip punctuation).
articles = (
    data_sample['content']
    .str.lower()
    .map(lambda text: re.sub(r"([^\w\s])", "", text))
)

In [7]:
# stop word removal
# Punctuation has already been stripped from the articles, so contractions such
# as "don't" now appear as "dont". Strip the stop words with the same pattern so
# those forms are still recognised, and keep the lookup in a set so each
# membership test is a hash lookup instead of a scan over the whole list.
en_stopwords = stopwords.words('english')
stopword_lookup = {re.sub(r"([^\w\s])", "", word) for word in en_stopwords}
articles = articles.apply(
    lambda x: ' '.join(word for word in x.split() if word not in stopword_lookup)
)

In [8]:
# Tokenize: turn each cleaned article string into a list of word tokens.
articles = articles.apply(word_tokenize)

In [9]:
# Stemming: replace every token with its Porter stem
# (stemming is used for speed, since there is a lot of text).
ps = PorterStemmer()
articles = articles.apply(lambda tokens: list(map(ps.stem, tokens)))

In [10]:
# Inspect the result of the preprocessing: one list of stemmed tokens per article.
articles


0     [call, claus, sahar, kian, need, new, roommat,...
1     [south, carolina, win, duke, surpris, fan, pos...
2     [messeng, show, bushwick, brooklyn, sunday, la...
3     [mogul, russel, simmon, team, univers, music, ...
4     [presid, trump, ask, interview, saturday, resp...
5     [complaint, gretchen, carlson, former, fox, ne...
6     [lo, angel, shortli, drop, youngest, daughter,...
7     [budapest, agn, galgoczi, 84, longer, make, to...
8     [meet, donald, j, trump, nation, tech, elit, h...
9     [beij, gener, war, resist, japanes, aggress, i...
10    [storr, conn, one, year, ago, night, super, bo...
11    [tuesday, presid, obama, pardon, commut, sente...
12    [want, get, brief, email, here, good, even, he...
13    [minneapoli, april, 21, news, spread, princ, f...
14    [year, polit, convent, pull, amaz, feat, reboo...
15    [new, york, citi, subway, train, jam, capac, p...
16    [tesla, motor, maverick, maker, scrutini, fede...
17    [republican, parti, trek, dark, took, fate

In [11]:
# Build the vocabulary from the tokenised articles: every distinct token is
# assigned an integer id. Printing it gives a short summary of its contents.
dictionary = corpora.Dictionary(documents=articles)
print(dictionary)

NameError: name 'corpora' is not defined

In [12]:
# Vectorize with bag of words: convert each article's tokens into a sparse
# vector via the dictionary; together these form the document-term matrix.
doc_term = list(map(dictionary.doc2bow, articles))

NameError: name 'dictionary' is not defined

In [None]:
# Preview a few entries instead of printing the whole matrix, which floods the
# output. Each entry is a (token_id, count) pair; show the first 10 entries of
# the first 3 documents (slicing keeps this safe even for an empty corpus).
print([doc[:10] for doc in doc_term[:3]])


## LSA

In [None]:

# Number of topics to extract in the initial LSA fit.
num_topics = 2

In [None]:
# Fit an LSA (LSI) model on the bag-of-words corpus, then print the top five
# terms of each topic.
lsamodel = LsiModel(
    corpus=doc_term,
    num_topics=num_topics,
    id2word=dictionary,
)
print(lsamodel.print_topics(num_topics=num_topics, num_words=5))

## Determine Number of Topics


In [None]:
# Sweep over candidate topic counts: fit one LSA model per count and record its
# c_v coherence score, so the scores can be compared when picking a topic count.
min_topics = 2
max_topics = 11

model_list = []
coherence_values = []

for n_topics in range(min_topics, max_topics + 1):
    model = LsiModel(corpus=doc_term, num_topics=n_topics, id2word=dictionary)
    model_list.append(model)
    coherence_model = CoherenceModel(
        model=model,
        texts=articles,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_values.append(coherence_model.get_coherence())

In [None]:
# Plot the coherence score against each topic count tried in the sweep.
plt.plot(range(min_topics, max_topics+1), coherence_values)
plt.title("LSA coherence by number of topics")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
# legend() expects a sequence of labels. ("coherence_values") is only a
# parenthesised string, not a one-element tuple, so pass a real list.
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Final model: refit LSA with the selected number of topics and print the top
# five terms of each topic.
final_n_topics = 2
lsamodel_f = LsiModel(
    corpus=doc_term,
    num_topics=final_n_topics,
    id2word=dictionary,
)
print(lsamodel_f.print_topics(num_topics=final_n_topics, num_words=5))