In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [1]:
# Load the articles dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../input/articles/articles.csv')

In [1]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [1]:
# Count the missing values in each column.
df.isna().sum()

In [1]:
# Read one article's text: row label 1029, column 'Article', in a single lookup.
df.loc[1029, 'Article']

In [1]:
# Number of articles (rows) in the dataset.
df.shape[0]

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# stop_words='english': drop scikit-learn's built-in English stop words.
# max_df=0.9: ignore terms that appear in more than 90% of the documents (too common to be informative).
# min_df=2:   ignore terms that appear in fewer than 2 documents (too rare to be informative).
cvr = CountVectorizer(
    stop_words='english',
    max_df=0.9,
    min_df=2,
)

> `Applying unsupervised learning`

In [1]:
# Learn the vocabulary from the article texts and build the document-term matrix in one step.
dtm = cvr.fit_transform(raw_documents=df['Article'])

In [1]:
# Display the document-term matrix returned by cvr.fit_transform.
# presumably one row per article and one column per vocabulary term — confirm from the repr's shape
dtm

`Let's perform Latent Dirichlet Allocation using Scikit-Learn`

In [1]:
from sklearn.decomposition import LatentDirichletAllocation
import torch
# Report whether a CUDA GPU is visible to PyTorch.
# NOTE(review): LatentDirichletAllocation is a scikit-learn estimator, not a torch module, so this
# check presumably has no effect on how the model is trained — confirm before relying on it.
torch.cuda.is_available()

In [1]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

# Wrap the chosen device string in a torch.device handle.
device = torch.device(dev)

In [1]:
# n_components is the number of topics to extract; random_state pins the seed for repeatable results.
LDA = LatentDirichletAllocation(
    random_state=42,
    n_components=7,
)

In [1]:
# Display the estimator's repr to check its configuration before fitting.
LDA

In [1]:
# Fit the topic model on the document-term matrix.
# fit() returns the scikit-learn estimator itself, which has no torch-style `.to()` method:
# chaining `.to(device)` raised AttributeError after the (slow) training had already finished.
LDA.fit(dtm)

In [1]:
# Signal that the long-running fit above has completed.
print("DONE FINALLY!!!!!!")

In [1]:
# Grab the vocabulary of words and report its size.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
len(cvr.get_feature_names_out())

In [1]:
# Container type of the vocabulary. get_feature_names() was removed in scikit-learn 1.2;
# its replacement get_feature_names_out() returns a NumPy array of strings rather than a list.
type(cvr.get_feature_names_out())

In [1]:
# Look up the word stored at vocabulary index 1029.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
cvr.get_feature_names_out()[1029]

In [1]:
## Pick a random word from the vocabulary
import random

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
feature_names = cvr.get_feature_names_out()

# randrange() excludes the upper bound, so the index is always valid. The previous
# randint(0, 54777) included 54777 itself (one past the end of a 54777-word vocabulary)
# and hard-coded the size instead of reading it from the fitted vectorizer.
random_word_id = random.randrange(len(feature_names))
feature_names[random_word_id]

In [1]:
# Count the topics: components_ holds one row per topic.
LDA.components_.shape[0]

In [1]:
# components_ is a NumPy array with one row per topic and one value per vocabulary word.
# NOTE: per the scikit-learn docs these values are unnormalized pseudo-counts, not probabilities;
# each row only becomes a distribution over words after dividing by its sum.
type(LDA.components_)

In [1]:
# Pull out the first of the 7 topics (row 0 of components_).
single_topic = LDA.components_[0, :]

In [1]:
# argsort() returns the index positions that would sort the array from lowest to highest value.
single_topic.argsort()

In [1]:
## Small demonstration of what argsort() returns
import numpy as np

arr = np.array([5, 100, 23, 1])
# The second line lists the index of each value in ascending order of the values themselves.
print(
    f'Simple array:- {arr}',
    f'Argsort:- {arr.argsort()}',
    sep='\n',
)

In [1]:
# Grab the indices of the 10 highest-valued entries in single_topic using argsort().

# argsort() orders indices by ascending value, so the last 10 ([-10:]) point at the
# 10 largest values, with the single largest one in the final position.
single_topic.argsort()[-10:]

In [1]:
# Despite the name, this holds vocabulary indices (not the words themselves) of the
# 10 highest-valued entries in single_topic, ordered from lowest to highest value.
top_10_words = single_topic.argsort()[-10:]

In [1]:
# Build the vocabulary lookup once instead of rebuilding it for every printed word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
feature_names = cvr.get_feature_names_out()

# Translate each top index back into its word.
for index in top_10_words:
    print(feature_names[index])

In [1]:
# The above was for the first topic. Do the same for the 3rd topic and grab its top 20 words.
third_topic = LDA.components_[2]
# argsort() is ascending, so the last 20 indices belong to the 20 highest-valued words.
# (A stray argsort() call whose result was discarded has been removed.)
top_20_words_in_3rd = third_topic.argsort()[-20:]

# Build the vocabulary lookup once instead of rebuilding it for every printed word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement.
feature_names = cvr.get_feature_names_out()
for i in top_20_words_in_3rd:
    print(feature_names[i])

In [1]:
# Print the 15 highest-valued words for every topic.
# Build the vocabulary lookup once: the previous version rebuilt it for each of the
# 15 words of each topic. get_feature_names() was also removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
feature_names = cvr.get_feature_names_out()
for i, topic in enumerate(LDA.components_):
    print(f'TOP 15 WORDS FOR TOPIC #{i}')
    # argsort() is ascending, so the last 15 indices are the 15 largest values.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print('\n\n')

In [1]:
# Score every article against each fitted topic.
topic_results = LDA.transform(X=dtm)

In [1]:
# Display the array returned by LDA.transform(dtm).
# presumably one row per article with one score per topic — confirm with topic_results.shape
topic_results

In [1]:
# Dimensions of the transform output.
# expected to be (number of articles, number of topics) — TODO confirm
topic_results.shape

In [1]:
# Topic scores for the first article: raw values, a separator line, then the same
# values rounded to two decimals for easier reading.
print(
    topic_results[0],
    '-------------------------------------------------------------',
    topic_results[0].round(2),
    sep='\n',
)

In [1]:
# Index position of the largest value in the first row, i.e. the topic that scores
# highest for the first article.
topic_results[0].argmax()

In [1]:
# For every row, take the column index of its largest value (axis=1) and store it as that
# article's topic label. NOTE: this adds (or overwrites) the 'Topics' column on df in place.
df['Topics'] = topic_results.argmax(axis=1)

In [1]:
# Display the DataFrame, now including the 'Topics' column assigned to each article row.
df