In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

import nltk
from nltk.tokenize import word_tokenize
from gensim.parsing import strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
from gensim.parsing import preprocess_string
from gensim import parsing

import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import glob

import warnings
warnings.filterwarnings("ignore")

In [None]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\s+\w{1}\s+', '', text)
    return text

def preprocessor(text):
    text = clean_text(text)
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = text.split()
    tokens = [porter.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# PROBLEM 1: Topic Models

# 20 NewsGroups : news articles

In [None]:
# Load the 20 newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Apply the preprocessor to the training data
X_train = [preprocessor(text) for text in newsgroups_train.data]
y_train = newsgroups_train.target

# Apply the preprocessor to the test data
X_test = [preprocessor(text) for text in newsgroups_test.data]
y_test = newsgroups_test.target

In [None]:
newsgroups_test = fetch_20newsgroups(subset='test')
X = [preprocessor(text) for text in newsgroups_test.data]
y = newsgroups_test.target

In [None]:
# Converting the text data into a document-term matrix
vectorizer = CountVectorizer(max_df = 0.98, min_df = 2, stop_words ='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    for topic_index, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_index)
        sorted_indices = topic.argsort()[::-1][:n_top_words]
        for i in sorted_indices:
            print("%s (%.3f)" % (feature_names[i], topic[i] / topic.sum()))
        print("")

In [None]:
num_top_words = 20

In [None]:
# LDA with K = 10

lda_10 = LatentDirichletAllocation(n_components = 10, learning_method = 'online', max_iter = 10, n_jobs = -1, random_state = 42)
lda_10.fit(X_train)

In [None]:
# Printing the top words for LDA with K=10

print("LDA with K = 10 : ")
print_top_words(lda_10, vectorizer.get_feature_names_out(), num_top_words)

LDA with K = 10 : 
Topic #0:
space (0.009)
armenian (0.009)
israel (0.007)
isra (0.006)
turkish (0.005)
jew (0.004)
orbit (0.004)
arab (0.004)
center (0.004)
world (0.004)
launch (0.003)
kill (0.003)
peopl (0.003)
war (0.003)
said (0.003)
year (0.003)
state (0.003)
nation (0.003)
univers (0.003)
organ (0.003)

Topic #1:
peopl (0.012)
god (0.010)
christian (0.010)
believ (0.009)
law (0.008)
say (0.008)
exist (0.008)
moral (0.006)
evid (0.006)
mean (0.006)
religion (0.006)
reason (0.005)
human (0.005)
right (0.005)
church (0.005)
claim (0.005)
write (0.005)
mani (0.005)
think (0.005)
argument (0.005)

Topic #2:
maxaxaxaxaxaxaxaxaxaxaxaxaxaxax (0.098)
bike (0.019)
dod (0.016)
ride (0.011)
car (0.009)
water (0.008)
oil (0.008)
bmw (0.006)
organ (0.006)
helmet (0.005)
tire (0.005)
line (0.005)
nick (0.005)
mile (0.005)
engin (0.005)
chi (0.004)
rider (0.004)
van (0.004)
cool (0.004)
tower (0.004)

Topic #3:
organ (0.018)
sale (0.013)
line (0.013)
car (0.013)
nntppostinghost (0.011)
univers 

In [None]:
# NMF with K = 10

nmf_10 = NMF(n_components = 10, init='random', random_state = 42)
nmf_10.fit(X_train)

In [None]:
# Printing the top words for NMF with K = 10

print("NMF with K = 10 : ")
print_top_words(nmf_10, vectorizer.get_feature_names_out(), num_top_words)

NMF with K = 10 : 
Topic #0:
wire (0.039)
use (0.017)
ground (0.015)
outlet (0.012)
circuit (0.010)
connect (0.010)
neutral (0.009)
electr (0.007)
cabl (0.007)
box (0.007)
requir (0.006)
nec (0.006)
gfci (0.006)
hot (0.006)
usual (0.006)
insul (0.006)
power (0.006)
protect (0.005)
instal (0.005)
conductor (0.005)

Topic #1:
peopl (0.011)
say (0.008)
know (0.006)
god (0.005)
said (0.005)
like (0.005)
armenian (0.005)
time (0.005)
come (0.004)
dont (0.004)
thing (0.004)
didnt (0.004)
someth (0.003)
mani (0.003)
right (0.003)
organ (0.003)
write (0.003)
way (0.003)
want (0.003)
think (0.003)

Topic #2:
use (0.013)
imag (0.010)
avail (0.008)
anonym (0.007)
program (0.006)
inform (0.006)
data (0.006)
version (0.005)
file (0.005)
includ (0.005)
user (0.005)
softwar (0.004)
internet (0.004)
post (0.004)
jpeg (0.004)
widget (0.004)
comput (0.004)
server (0.004)
window (0.004)
mail (0.004)

Topic #3:
file (0.068)
gun (0.020)
jpeg (0.012)
state (0.011)
control (0.011)
firearm (0.011)
amend (0.00

In [None]:
# LDA with K = 20

lda_20 = LatentDirichletAllocation(n_components = 20, learning_method = 'online', max_iter = 10, n_jobs = -1, random_state = 42)
lda_20.fit(X_train)

In [None]:
# Printing the top words for LDA with K = 20

print("LDA with K = 20 : ")
print_top_words(lda_20, vectorizer.get_feature_names_out(), num_top_words)


LDA with K = 20 : 
Topic #0:
space (0.010)
inform (0.006)
includ (0.005)
program (0.005)
list (0.005)
new (0.005)
nation (0.004)
orbit (0.004)
center (0.004)
turkish (0.004)
anonym (0.004)
launch (0.004)
april (0.004)
post (0.004)
project (0.004)
gener (0.004)
book (0.004)
unit (0.004)
send (0.003)
world (0.003)

Topic #1:
god (0.022)
christian (0.012)
peopl (0.009)
say (0.009)
jesu (0.008)
believ (0.008)
exist (0.007)
bibl (0.006)
religion (0.005)
write (0.005)
mean (0.005)
church (0.005)
line (0.005)
question (0.005)
word (0.005)
mani (0.004)
faith (0.004)
reason (0.004)
life (0.004)
think (0.004)

Topic #2:
water (0.023)
oil (0.018)
cool (0.014)
engin (0.013)
ga (0.011)
bmw (0.011)
air (0.009)
tower (0.009)
hot (0.008)
plant (0.008)
cylind (0.008)
nuclear (0.007)
fuel (0.007)
dod (0.007)
heat (0.007)
cold (0.006)
atf (0.006)
gt (0.006)
mile (0.006)
tx (0.006)

Topic #3:
imag (0.025)
jpeg (0.014)
__ (0.013)
gif (0.013)
xterm (0.010)
pictur (0.010)
graphic (0.010)
appear (0.008)
oo (0

In [None]:
# NMF with K = 20

nmf_20 = NMF(n_components = 20, init='random', random_state = 42)
nmf_20.fit(X_train)

In [None]:
# Printing the top words for NMF with K = 20

print("NMF with K = 20 : ")
print_top_words(nmf_20, vectorizer.get_feature_names_out(), num_top_words)

NMF with K = 20 : 
Topic #0:
wire (0.043)
ground (0.017)
use (0.017)
outlet (0.013)
circuit (0.011)
connect (0.011)
neutral (0.010)
electr (0.008)
cabl (0.008)
box (0.007)
nec (0.007)
gfci (0.007)
requir (0.007)
hot (0.007)
usual (0.006)
insul (0.006)
power (0.006)
protect (0.005)
instal (0.005)
conductor (0.005)

Topic #1:
peopl (0.016)
say (0.013)
armenian (0.012)
said (0.012)
know (0.011)
didnt (0.009)
come (0.007)
apart (0.007)
someth (0.007)
happen (0.006)
azerbaijani (0.006)
went (0.006)
kill (0.006)
like (0.006)
came (0.006)
dont (0.006)
start (0.006)
time (0.005)
live (0.005)
everyth (0.005)

Topic #2:
imag (0.021)
avail (0.012)
data (0.012)
graphic (0.008)
packag (0.008)
ftp (0.007)
includ (0.007)
program (0.007)
format (0.007)
softwar (0.006)
use (0.006)
version (0.006)
contact (0.006)
support (0.006)
tool (0.005)
comput (0.005)
process (0.005)
model (0.005)
send (0.005)
sun (0.004)

Topic #3:
anonym (0.033)
post (0.018)
internet (0.017)
privaci (0.013)
email (0.011)
user (0.

In [None]:
# LDA with K = 50

lda_50 = LatentDirichletAllocation(n_components = 50, learning_method = 'online', max_iter = 10, random_state = 42)
lda_50.fit(X_train)

In [None]:
# Printing the top words for LDA with K = 50

print("LDA with K = 50 : ")
print_top_words(lda_50, vectorizer.get_feature_names_out(), num_top_words)

LDA with K = 50 : 
Topic #0:
use (0.009)
inform (0.007)
program (0.004)
new (0.004)
research (0.004)
gener (0.004)
includ (0.004)
data (0.004)
scienc (0.004)
post (0.004)
center (0.004)
work (0.003)
line (0.003)
number (0.003)
list (0.003)
year (0.003)
time (0.003)
follow (0.003)
oper (0.003)
univers (0.003)

Topic #1:
imag (0.081)
book (0.048)
graphic (0.030)
format (0.030)
ftp (0.022)
jpeg (0.020)
gif (0.017)
convert (0.013)
version (0.011)
program (0.011)
sgi (0.010)
file (0.010)
view (0.009)
risc (0.009)
creat (0.008)
silicon (0.007)
viewer (0.007)
meter (0.007)
bitmap (0.007)
xv (0.006)

Topic #2:
mtl (0.017)
wsh (0.016)
membran (0.014)
hfd (0.011)
cgi (0.011)
edm (0.010)
wpg (0.009)
mucu (0.009)
saku (0.009)
keypad (0.008)
promopictur (0.008)
cotton (0.007)
fleme (0.007)
pasteur (0.007)
lxext (0.006)
predomin (0.006)
toricelli (0.006)
alu (0.006)
sun_ (0.005)
larsson (0.005)

Topic #3:
sale (0.156)
offer (0.065)
purchas (0.025)
band (0.024)
bag (0.022)
forsal (0.019)
black (0.019

In [None]:
# NMF with K = 50

nmf_50 = NMF(n_components = 50, init='random', random_state = 42)
nmf_50.fit(X_train)

In [None]:
# Printing the top words for NMF with K = 50

print("NMF with K = 50 : ")
print_top_words(nmf_50, vectorizer.get_feature_names_out(), num_top_words)

NMF with K = 50 : 
Topic #0:
avail (0.024)
version (0.014)
xr (0.014)
includ (0.013)
widget (0.013)
use (0.010)
file (0.010)
server (0.010)
support (0.009)
sun (0.009)
motif (0.009)
inform (0.009)
sourc (0.009)
export (0.009)
distribut (0.008)
set (0.007)
softwar (0.007)
program (0.006)
run (0.006)
binari (0.006)

Topic #1:
gun (0.025)
firearm (0.016)
weapon (0.013)
law (0.013)
rate (0.011)
crime (0.010)
homicid (0.010)
section (0.009)
use (0.008)
author (0.008)
vancouv (0.007)
handgun (0.007)
applic (0.007)
issu (0.007)
control (0.007)
seattl (0.007)
person (0.006)
differ (0.006)
citi (0.006)
state (0.006)

Topic #2:
version (0.023)
machin (0.022)
contact (0.021)
type (0.021)
avail (0.017)
ftp (0.017)
comment (0.015)
keyboard (0.014)
anonym (0.012)
pc (0.011)
mac (0.010)
comput (0.009)
ibm (0.008)
program (0.008)
algebra (0.008)
phone (0.007)
sun (0.007)
number (0.007)
gener (0.007)
price (0.005)

Topic #3:
internet (0.020)
privaci (0.019)
anonym (0.015)
email (0.014)
comput (0.014)
i

# DUC 2001 summarization dataset

In [None]:
path = '/content/drive/MyDrive/DUC2001'

contents = []
summaries = []

data = { 'Article' : [] , 'Content' : [] , 'Summary' : [] }

for name in glob.glob(path + '/*'):
    
    filename  = os.path.basename(name)
    contents = ''
    summaries = ''

    try:
        if filename == 'annotations.txt' or filename in 'notes.txt':
            continue
            
        with open(path + '/Summaries/{}.txt'.format(filename.lower())) as file:
            f = file.read()
            abs = f.find('Abstract:')
            len_abs = len('Abstract:')
            intr = f.find('Introduction:')
            len_intr = len('Introduction:')
            
            summaries = f[(abs+len_abs):intr] 
            contents = f[(intr+len_intr):]
            
    except:
        continue
        
    data['Article'].append(filename)
    data['Summary'].append(summaries.strip().replace('\n', ' '))
    data['Content'].append(contents.strip().replace('\n', ' ').replace('    ', ' ').replace(' \x1a', ''))

In [None]:
duc_df = pd.DataFrame(data)

In [None]:
duc_df.head(10)

Unnamed: 0,Article,Content,Summary
0,FT922-10200,LOS ANGELES Police Chief Daryl Gates had been ...,Ten months after the Christopher Commission ca...
1,WSJ900918-0121,"MEMPHIS, Tenn. -- Friday nights used to be slo...",The prediction of a major earthquake along the...
2,WSJ910107-0139,"BETHESDA, Md. -- Eva Mitrova, a visiting resea...","About 20,000 British cattle have been destroye..."
3,FT941-1750,Central American and Caribbean governments are...,Central American and Caribbean governments awa...
4,SJMN91-06105230,Suicide or sense? The answer may decide the me...,The Boston Marathon has a storied past. It has...
5,SJMN91-06084228,SPEED TRAP; By Charlie Francis with Jeff Coplo...,Canadian sprint coach Charlie Francis testifyi...
6,LA081890-0039,"Alphonce Swai, who ran in the 1984 Olympics fo...",Marathon runner Alphonce Swai is continuing hi...
7,SJMN91-06184021,"The label ""black conservative,"" now firmly aff...","Speaking on national television on Monday, Cla..."
8,AP890227-0016,"One of nature's most vicious spectacles, the t...",In 1988 the nation's more than 700 tornadoes c...
9,AP881216-0017,A recommended halt to the government's ``let i...,A panel was assembled last September after the...


In [None]:
duc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  301 non-null    object
 1   Content  301 non-null    object
 2   Summary  301 non-null    object
dtypes: object(3)
memory usage: 7.2+ KB


In [None]:
transform_to_lower = lambda s: s.lower()
remove_emails = lambda s: re.sub(r'^[a-zA-Z0-9+_.-]+@[a-zA-Z0-9.-]+$', '', s)
remove_single_char = lambda s: re.sub(r'\s+\w{1}\s+', '', s)

CLEAN_FILTERS = [
                remove_emails,
                strip_tags,
                strip_numeric,
                remove_emails,
                strip_punctuation, 
                strip_multiple_whitespaces, 
                transform_to_lower,
                remove_stopwords]

def cleaningPipe(document):
    processed_words = preprocess_string(document, CLEAN_FILTERS)
    
    return processed_words

def joinList(processed_words):
    return ' '.join(processed_words)

def basicStemming(text):
    return parsing.stem_text(text)

In [None]:
duc_df["CleanedText"] = duc_df["Content"].apply(cleaningPipe).apply(joinList).apply(basicStemming)

In [None]:
duc_df.head(10)

Unnamed: 0,Article,Content,Summary,CleanedText
0,FT922-10200,LOS ANGELES Police Chief Daryl Gates had been ...,Ten months after the Christopher Commission ca...,lo angel polic chief daryl gate accus nurtur b...
1,WSJ900918-0121,"MEMPHIS, Tenn. -- Friday nights used to be slo...",The prediction of a major earthquake along the...,memphi tenn fridai night slow fault line night...
2,WSJ910107-0139,"BETHESDA, Md. -- Eva Mitrova, a visiting resea...","About 20,000 British cattle have been destroye...",bethesda md eva mitrova visit research nation ...
3,FT941-1750,Central American and Caribbean governments are...,Central American and Caribbean governments awa...,central american caribbean govern await pass i...
4,SJMN91-06105230,Suicide or sense? The answer may decide the me...,The Boston Marathon has a storied past. It has...,suicid sens answer decid men s winner mondai s...
5,SJMN91-06084228,SPEED TRAP; By Charlie Francis with Jeff Coplo...,Canadian sprint coach Charlie Francis testifyi...,speed trap charli franci jeff coplon art amp b...
6,LA081890-0039,"Alphonce Swai, who ran in the 1984 Olympics fo...",Marathon runner Alphonce Swai is continuing hi...,alphonc swai ran olymp tanzania later fell pre...
7,SJMN91-06184021,"The label ""black conservative,"" now firmly aff...","Speaking on national television on Monday, Cla...",label black conserv firmli affix clarenc thoma...
8,AP890227-0016,"One of nature's most vicious spectacles, the t...",In 1988 the nation's more than 700 tornadoes c...,natur s viciou spectacl tornado pois renew ann...
9,AP881216-0017,A recommended halt to the government's ``let i...,A panel was assembled last September after the...,recommend halt govern s let burn forest polici...


In [None]:
# Converting the CleanedText data into a document-term matrix

vectorizer = CountVectorizer(max_df = 0.98, min_df = 2, stop_words = 'english')
data = vectorizer.fit_transform(np.array(duc_df['CleanedText']))
df_data = pd.DataFrame(data.toarray())

In [None]:
# LDA with K = 10

lda_10 = LatentDirichletAllocation(n_components = 10, learning_method = 'online', max_iter = 10, n_jobs = -1, random_state = 42)
lda_10.fit(df_data)

In [None]:
# Printing the top words for LDA with K=10

print("LDA with K = 10 : ")
print_top_words(lda_10, vectorizer.get_feature_names_out(), num_top_words)

LDA with K = 10 : 
Topic #0:
tunnel (0.030)
french (0.016)
british (0.011)
franc (0.010)
link (0.007)
project (0.007)
channel (0.007)
britain (0.006)
rail (0.006)
train (0.006)
hous (0.006)
time (0.006)
london (0.006)
billion (0.006)
pari (0.005)
near (0.005)
said (0.005)
cost (0.005)
pound (0.005)
year (0.005)

Topic #1:
said (0.031)
oil (0.011)
crash (0.011)
exxon (0.009)
plane (0.008)
offic (0.008)
spill (0.007)
polic (0.006)
mile (0.005)
air (0.005)
nafta (0.005)
valdez (0.005)
state (0.005)
offici (0.004)
report (0.004)
investig (0.004)
engin (0.004)
forc (0.004)
year (0.004)
beach (0.003)

Topic #2:
smoke (0.012)
lung (0.012)
firefight (0.009)
respiratori (0.005)
health (0.005)
wildland (0.005)
protect (0.005)
test (0.004)
hopkin (0.004)
carbon (0.004)
function (0.003)
ford (0.003)
poison (0.003)
hazard (0.003)
cough (0.003)
occup (0.003)
cancer (0.003)
chemic (0.003)
crew (0.003)
acid (0.003)

Topic #3:
said (0.019)
year (0.014)
welfar (0.013)
drought (0.010)
state (0.009)
perce

In [None]:
# NMF with K = 10

nmf_10 = NMF(n_components = 10, init='random', random_state = 42)
nmf_10.fit(df_data)

In [None]:
# Printing the top words for NMF with K = 10

print("NMF with K = 10 : ")
print_top_words(nmf_10, vectorizer.get_feature_names_out(), num_top_words)

NMF with K = 10 : 
Topic #0:
nra (0.035)
gun (0.033)
weapon (0.013)
assault (0.011)
said (0.011)
ban (0.011)
deconcini (0.010)
control (0.009)
year (0.009)
hammer (0.008)
member (0.007)
semiautomat (0.007)
rifl (0.006)
nation (0.006)
state (0.006)
legisl (0.006)
lobbi (0.005)
letter (0.005)
million (0.005)
baker (0.005)

Topic #1:
thoma (0.019)
right (0.010)
black (0.009)
shine (0.008)
path (0.008)
court (0.008)
peopl (0.008)
mr (0.007)
presid (0.006)
war (0.006)
law (0.006)
guzman (0.005)
committe (0.005)
govern (0.004)
new (0.004)
clarenc (0.004)
senat (0.004)
slovenia (0.004)
civil (0.004)
work (0.004)

Topic #2:
said (0.042)
crash (0.012)
hurrican (0.010)
plane (0.009)
engin (0.007)
air (0.006)
peopl (0.006)
report (0.005)
offici (0.005)
nation (0.005)
unit (0.005)
kill (0.005)
flight (0.004)
mile (0.004)
center (0.004)
storm (0.004)
state (0.004)
forc (0.004)
airlin (0.004)
caus (0.004)

Topic #3:
diamond (0.061)
beer (0.029)
market (0.020)
year (0.014)
cso (0.012)
botswana (0.010

In [None]:
# LDA with K = 20

lda_20 = LatentDirichletAllocation(n_components = 20, learning_method = 'online', max_iter = 10, n_jobs = -1, random_state = 42)
lda_20.fit(df_data)

In [None]:
# Printing the top words for LDA with K=20

print("LDA with K = 20 : ")
print_top_words(lda_20, vectorizer.get_feature_names_out(), num_top_words)

LDA with K = 20 : 
Topic #0:
tunnel (0.048)
french (0.023)
british (0.016)
franc (0.012)
link (0.011)
project (0.011)
channel (0.010)
rail (0.010)
train (0.010)
britain (0.009)
billion (0.009)
london (0.009)
eurotunnel (0.007)
pari (0.007)
time (0.007)
machin (0.007)
mile (0.007)
dig (0.006)
speed (0.006)
high (0.006)

Topic #1:
oil (0.031)
said (0.026)
exxon (0.025)
spill (0.018)
valdez (0.013)
beach (0.009)
state (0.007)
alaska (0.007)
jackson (0.007)
cleanup (0.007)
nafta (0.007)
mile (0.007)
coast (0.006)
tanker (0.006)
million (0.006)
ship (0.006)
offici (0.006)
sound (0.006)
compani (0.005)
environment (0.005)

Topic #2:
diamond (0.001)
hurrican (0.001)
year (0.001)
said (0.001)
firefight (0.001)
sai (0.000)
drug (0.000)
new (0.000)
health (0.000)
smoke (0.000)
control (0.000)
world (0.000)
offici (0.000)
market (0.000)
path (0.000)
nation (0.000)
studi (0.000)
diseas (0.000)
bank (0.000)
million (0.000)

Topic #3:
welfar (0.025)
drought (0.020)
year (0.016)
said (0.016)
diabet (

In [None]:
# NMF with K = 20

nmf_20 = NMF(n_components = 20, init='random', random_state = 42)
nmf_20.fit(df_data)

In [None]:
# Printing the top words for NMF with K = 20

print("NMF with K = 20 : ")
print_top_words(nmf_20, vectorizer.get_feature_names_out(), num_top_words)

NMF with K = 20 : 
Topic #0:
said (0.079)
count (0.010)
censu (0.009)
offic (0.009)
illeg (0.008)
state (0.008)
citi (0.007)
hous (0.007)
eclips (0.007)
offici (0.006)
alien (0.006)
senat (0.006)
tornado (0.006)
aid (0.005)
peopl (0.005)
resid (0.005)
center (0.005)
immigr (0.005)
seat (0.004)
counti (0.004)

Topic #1:
diseas (0.025)
diabet (0.019)
year (0.012)
health (0.009)
case (0.009)
cjd (0.009)
peopl (0.009)
dr (0.008)
sheep (0.008)
brain (0.008)
infect (0.008)
latino (0.007)
scientist (0.007)
like (0.007)
bse (0.007)
sai (0.007)
cow (0.006)
anim (0.006)
studi (0.006)
tuberculosi (0.006)

Topic #2:
tunnel (0.036)
french (0.015)
british (0.013)
link (0.012)
rail (0.009)
billion (0.009)
project (0.009)
britain (0.008)
london (0.008)
channel (0.008)
train (0.007)
eurotunnel (0.007)
high (0.007)
franc (0.007)
time (0.007)
speed (0.006)
mile (0.006)
sai (0.006)
europ (0.006)
worker (0.005)

Topic #3:
diamond (0.066)
beer (0.031)
market (0.022)
year (0.014)
cso (0.013)
botswana (0.011)

In [None]:
# LDA with K = 50

lda_50 = LatentDirichletAllocation(n_components = 50, learning_method = 'online', max_iter = 10, n_jobs = -1, random_state = 42)
lda_50.fit(df_data)

In [None]:
# Printing the top words for LDA with K=50

print("LDA with K = 50 : ")
print_top_words(lda_50, vectorizer.get_feature_names_out(), num_top_words)

LDA with K = 50 : 
Topic #0:
tunnel (0.058)
french (0.026)
british (0.019)
link (0.013)
channel (0.013)
project (0.012)
rail (0.012)
train (0.011)
franc (0.011)
britain (0.011)
london (0.011)
billion (0.009)
pari (0.009)
eurotunnel (0.009)
machin (0.009)
speed (0.007)
dig (0.007)
high (0.007)
calai (0.007)
time (0.006)

Topic #1:
prai (0.006)
god (0.003)
crowd (0.003)
chair (0.002)
hug (0.002)
thoroughfar (0.002)
sweet (0.002)
rub (0.002)
disguis (0.002)
anita (0.002)
sorrow (0.002)
harass (0.002)
excus (0.002)
hometown (0.002)
nervous (0.002)
rd (0.002)
straighten (0.002)
said (0.002)
cast (0.002)
talli (0.002)

Topic #2:
lung (0.016)
smoke (0.014)
respiratori (0.011)
wildland (0.010)
function (0.009)
hopkin (0.008)
occup (0.008)
ford (0.007)
firefight (0.007)
irrit (0.007)
acid (0.006)
poison (0.006)
compound (0.005)
carbon (0.005)
bronchiti (0.005)
cough (0.005)
wear (0.005)
mask (0.005)
lawrenc (0.004)
laboratori (0.004)

Topic #3:
diabet (0.071)
hispan (0.044)
latino (0.015)
ameri

In [None]:
# NMF with K = 50

nmf_50 = NMF(n_components = 50, init='random', random_state = 42)
nmf_50.fit(df_data)

In [None]:
# Printing the top words for NMF with K = 50

print("NMF with K = 50 : ")
print_top_words(nmf_50, vectorizer.get_feature_names_out(), num_top_words)

NMF with K = 50 : 
Topic #0:
year (0.053)
percent (0.046)
billion (0.025)
crop (0.020)
product (0.019)
food (0.018)
million (0.018)
drought (0.014)
total (0.013)
wheat (0.010)
wilson (0.010)
estim (0.010)
usda (0.010)
spring (0.009)
unit (0.009)
said (0.009)
price (0.009)
corn (0.009)
harvest (0.008)
weather (0.008)

Topic #1:
state (0.041)
court (0.038)
limit (0.032)
term (0.027)
candid (0.026)
incumb (0.018)
constitut (0.015)
ballot (0.014)
run (0.013)
year (0.013)
restrict (0.013)
suprem (0.013)
right (0.012)
congress (0.011)
elect (0.011)
offic (0.010)
decis (0.010)
rule (0.009)
amend (0.009)
serv (0.008)

Topic #2:
welfar (0.030)
countri (0.026)
cost (0.023)
state (0.023)
cent (0.021)
increas (0.020)
rise (0.015)
ag (0.014)
famili (0.011)
number (0.010)
peopl (0.010)
econom (0.009)
growth (0.009)
benefit (0.009)
social (0.008)
popul (0.008)
healthcar (0.008)
need (0.008)
pressur (0.008)
spend (0.008)

Topic #3:
diamond (0.055)
cso (0.035)
botswana (0.033)
market (0.030)
beer (0.01