In [18]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [52]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load Dataset
data = pd.read_csv('../data/abcnews-date-text.csv', error_bad_lines=False, nrows=10000);
documents_list= data['headline_text'].tolist()

In [21]:
# Vectorize document using TF-IDF
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range = (1,1),
                       min_df=10)

# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)   

In [22]:
# Define the number of topics or components
num_components=5

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform SVD model on data
lda_matrix = model.fit_transform(train_data)

# Get Components 
lda_components=model.components_

In [23]:
# Print the topics with their terms
terms = tfidf.get_feature_names()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['war', 'plan', 'iraq', 'anti', 'baghdad', 'protest', 'iraqi']
Topic 1:  ['water', 'new', 'govt', 'hospital', 'wa', 'concerns', 'car']
Topic 2:  ['world', 'cup', 'council', 'group', 'clash', 'funds', 'denies']
Topic 3:  ['iraq', 'says', 'troops', 'war', 'killed', 'saddam', 'baghdad']
Topic 4:  ['man', 'police', 'charged', 'death', 'court', 'face', 'missing']


In [26]:
import pyLDAvis
import pyLDAvis.sklearn

In [49]:
ldavis= pyLDAvis.sklearn.prepare(model, train_data, tfidf)

In [50]:
pyLDAvis.save_html(ldavis, 'ldavis.html')

In [66]:
columns = [f'P(topic {i+1})' for i in range(len(model.components_))]
df_result = pd.DataFrame(lda_matrix, columns=columns)

In [67]:
df_result.head()

Unnamed: 0,P(topic 1),P(topic 2),P(topic 3),P(topic 4),P(topic 5)
0,0.598614,0.100001,0.101293,0.100091,0.100001
1,0.596681,0.102181,0.101136,0.100001,0.100001
2,0.083036,0.083305,0.08303,0.6676,0.08303
3,0.328767,0.05639,0.058485,0.05547,0.500888
4,0.446968,0.067111,0.067682,0.071626,0.346613
