<a href="https://colab.research.google.com/github/sowmyarshetty/NNClass/blob/main/Latent_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import random
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Set the column width
pd.set_option('max_colwidth', 200)

In [2]:
import gdown
# Mount Google Drive (For Colab Users)
from google.colab import drive
drive.mount('/content/drive')


#URL for downloading lung cancer data
url_lc = 'https://drive.google.com/uc?id=1dCjysGH7CjseeifERUin9gaoKchtcGnh'
gdown.download(url_lc, 'news_article.csv', quiet=False)
# https://drive.google.com/file/d/1dCjysGH7CjseeifERUin9gaoKchtcGnh/view?usp=drive_link


#Read the Lung cancer data.csv
news_articles_df = pd.read_csv(url_lc)
news_articles_df.head()


Mounted at /content/drive


Downloading...
From: https://drive.google.com/uc?id=1dCjysGH7CjseeifERUin9gaoKchtcGnh
To: /content/news_article.csv
100%|██████████| 1.43M/1.43M [00:00<00:00, 18.2MB/s]


Unnamed: 0,headline
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video
3,How To Say 'Cheers' In 20 Languages (AUDIO)
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection


In [3]:
# Load the news_articles.csv into a DataFrame.
# news_articles_df = pd.read_csv('Resources/news_articles.csv')
# Display the first 20 headlines
news_articles_df.head(10)

Unnamed: 0,headline
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video
3,How To Say 'Cheers' In 20 Languages (AUDIO)
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts
8,The Major Problem With Electric Cars | TIME.com
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?


## Preprocess the Text

In [4]:
# Get the info on the DataFrame
news_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23377 entries, 0 to 23376
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  23376 non-null  object
dtypes: object(1)
memory usage: 182.8+ KB


In [5]:
# Remove digits and non-alphabetic characters
#lamba func is 2 parts , lamba variable : expression
news_articles_df["headline"] = news_articles_df["headline"].apply(lambda x: re.sub(r"[^a-zA-Z\s ]","",str(x)))
news_articles_df.head()

Unnamed: 0,headline
0,Is Too Young To Marry A YearOld The Bachelor Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New Delicate Video
3,How To Say Cheers In Languages AUDIO
4,Welcome To Hell Rio Police Warn They Cant Promise Olympic Protection


## Process the Text to Tokens and Counts.

In [6]:
# Create an instance of the CountVectorizer and set the max_df to 0.95 and min_df to 10, and use the "english" stopwords.
#max_df = remove words that appear more than 95% of the word
#min_df = word must appear in atleast 10 documents to be included
cv = CountVectorizer(max_df=0.95,min_df=10, stop_words='english')
cv

In [7]:
# Get the headlines.
headlines = news_articles_df["headline"]
headlines

Unnamed: 0,headline
0,Is Too Young To Marry A YearOld The Bachelor Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New Delicate Video
3,How To Say Cheers In Languages AUDIO
4,Welcome To Hell Rio Police Warn They Cant Promise Olympic Protection
...,...
23372,Bidens Health Agenda Starts With Reversing Everything Trump Did In The Last Years
23373,You Know Where You Are From the Very First Bite
23374,Cheeses We Would Happily Marry If That Was Allowed
23375,Donald Trump Has A Surprising Response To Golfer Rory McIlroys Criticism


In [8]:
# Transform each row from the headlines Series to a DTM.
#DTM - Document Term Matrix . How much a specific term appears in the entire collection of documents
dtm = cv.fit_transform(headlines)
# Get the shape of the DTM.
print(dtm.shape)
#rows and columns

(23377, 3149)


In [9]:
# Get the length of the vocabulary
len(cv.get_feature_names_out())

3149

In [10]:
# Look at 100 random words in the vocabulary
print(cv.get_feature_names_out()[:100])

['aaron' 'abandoned' 'ability' 'able' 'abortion' 'abroad' 'absolutely'
 'abuse' 'abused' 'access' 'accident' 'accidental' 'accidentally'
 'according' 'account' 'accounts' 'accusations' 'accused' 'accuser'
 'accusers' 'accuses' 'act' 'acting' 'action' 'activists' 'actor'
 'actress' 'actually' 'ad' 'adam' 'add' 'added' 'adding' 'address' 'adds'
 'adele' 'administration' 'admits' 'adorable' 'ads' 'adults' 'adventure'
 'adventures' 'advice' 'adviser' 'advocates' 'affleck' 'afford'
 'affordable' 'afghanistan' 'africa' 'age' 'agency' 'agenda' 'agent' 'ago'
 'agree' 'agrees' 'ahead' 'aid' 'aide' 'aides' 'aim' 'aims' 'air' 'airbnb'
 'airline' 'airlines' 'airplane' 'airport' 'airports' 'al' 'alabama'
 'alaska' 'album' 'alec' 'alex' 'alexandria' 'ali' 'alive' 'allegations'
 'alleged' 'allegedly' 'allen' 'allies' 'allow' 'allowed' 'allstar'
 'alternative' 'amazing' 'amazon' 'amazons' 'ambassador' 'amber' 'america'
 'american' 'americans' 'americas' 'amid' 'amy']


In [11]:
# Print the first 500 elements (transformed words)from the 1st row, i.e., document.
#converting the matrix to array to print it out
print(dtm.toarray()[0][:500])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [12]:
# Get the feature names (words) from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Get all the non-zero elements from the first row.
non_zero_elements = dtm.toarray()[0]

# Get the indices for each non-zero element.
non_zero_indices = non_zero_elements.nonzero()[0]

# Print out the word and the number of times the word is in the row.
for i in non_zero_indices:
   print(f"The word : {feature_names[i]} - index : {i} - count : {non_zero_elements[i]}")


The word : bachelor - index : 183 - count : 1
The word : yearold - index : 3131 - count : 1
The word : young - index : 3138 - count : 1


In [13]:
# Convert the DTM to a DataFrame

dtm_df = pd.DataFrame(dtm.toarray(),columns=cv.get_feature_names_out())
# Display some random columns and the first 20 rows of the DataFrame.
dtm_df.iloc[:,133:149:].head(20)

Unnamed: 0,argentina,argument,ariana,arizona,army,arrest,arrested,art,article,artificial,ashley,asian,ask,asked,asking,asks
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## LDA

In [14]:
# Create and instance of the LatentDirichletAllocation() class with 7 topics.
#business, politics, entertainment, food and drink, sports, technology and travel
LDA = LatentDirichletAllocation(n_components=7, random_state = 45)
# Fit the model with our DTM data. This may take awhile if you have a large amount of documents.
LDA_data = LDA.fit(dtm)

In [15]:
# Get the values of each topic-word distribution.
topic_word_distributions = LDA_data.components_
print(topic_word_distributions)

[[ 6.68868359  0.1428575   1.16038864 ...  0.14351059  0.1428573
  25.14191545]
 [ 0.14331542  0.14300083  0.14298379 ...  0.14328801  0.14286797
   0.14295194]
 [ 0.14297809  2.89817877  3.14252277 ...  0.14302848  0.14285735
   0.14287649]
 ...
 [ 0.14292072  0.14289515  0.14291571 ...  1.6273334   0.1429087
   0.1429782 ]
 [ 0.14294079  0.14308772  0.14307667 ...  5.61167278 10.14258776
   0.14323906]
 [16.59615124 12.38685925  0.14297134 ...  5.1882435   0.14300355
   0.14301934]]


In [16]:
# Get the length of the array of each topic. It should be the same as the vocabulary.
for index, topic in enumerate(LDA_data.components_):
  print(len(LDA_data.components_[index]))




3149
3149
3149
3149
3149
3149
3149


In [17]:
# Get the array of the first topic
first_topic = LDA_data.components_[0]

# This is the ranking of each word in the array. Lower values have less impact than higher values.
print(first_topic)

[ 6.68868359  0.1428575   1.16038864 ...  0.14351059  0.1428573
 25.14191545]


In [18]:
# Get the indices for the first topic in descending order.
sorted_first_topic = np.argsort(-first_topic)


# Use the sorted indices to the values from greatest to least.
sorted_first_topic_values = first_topic[sorted_first_topic]
for value in sorted_first_topic_values:
  print(value)

664.532671200798
271.91271047790445
224.1372934650356
217.31934993450625
211.7751546784865
205.77015155661817
179.0408679028302
143.133483309097
131.69909366598785
130.15660617453108
124.29793612222738
112.10412938730153
94.33671439881526
93.88864077399286
91.28545169659753
87.48138053714118
87.14198437101898
87.1193494804299
83.13468622501036
81.21639325337655
79.76969586574897
75.9904659496986
71.38450205031617
70.6006779472954
70.55998715302361
70.15313022567558
67.93641796546696
67.7477628076899
67.17113471853536
67.01195554935798
66.56432462291492
65.66473896756881
65.10721566458393
64.14221723179793
63.62004140809854
60.141510174833584
56.886499678615934
56.18221606709332
53.08017697750806
52.14225780456537
51.9789227920475
51.700016625438366
51.38592385023241
51.31555445652463
50.25967880566966
50.14255459971789
49.34109735647028
49.142579649904214
48.9689774795293
48.946999727783144
48.78269468980903
48.14145486782515
46.04465138786997
44.74023928354976
43.13723239533322
41.986

## Using `argsort()`
---
- `argsort()` returns index positions from least to greatest.

In [19]:
# Define an array of values index 0 = 10, index 1 = 200, index 2 = 1.
arr = np.array([10, 200, 1])
# Print out the indices after sorting the array from least to greatest, i.e., 1, 10, 200:
print(f"The indices the the array, '10, 200, 1' from least to greatest: {np.argsort(arr)}")
# Reverse the sort from greatest to least.
print(f"The indices the the array, '10, 200, 1' from greatest to least: {np.argsort(-arr)}")

The indices the the array, '10, 200, 1' from least to greatest: [2 0 1]
The indices the the array, '10, 200, 1' from greatest to least: [1 0 2]


In [20]:
# Sort the array of the first topic
first_topic .argsort()


array([ 999, 2107,  713, ..., 2416, 1340, 2901])

In [21]:
# Get the value of the word that is least representative of this topic
print(f"The value of the word that is least representative of this topic is: {first_topic[1716]}")
# Get the value of the word that is most representative of this topic
print(f"The value of the word that is most representative of this topic is: {first_topic[1688]}")

The value of the word that is least representative of this topic is: 0.14294524292003383
The value of the word that is most representative of this topic is: 0.14309734844662608


In [26]:
# Get the indices of the top ten words for the first topic (e.g., top 10 words for topic 0):
top_word = first_topic.argsort()[-10:][::-1]
print(top_word)

[2901 1340 2416 2902 3073  822 2624 1875 1894 2306]


In [27]:
# Get the top ten words from the indices.
for i in top_word:
  print(cv.get_feature_names_out()[i])


trump
house
says
trumps
white
donald
star
nba
news
reveals


In [32]:
# Get the bottom ten words from the indices.
bottom_word = first_topic.argsort()[:10][::-1]
print(bottom_word)
for i in bottom_word:
  print(cv.get_feature_names_out()[i])

[2964 2759 1816 1227 2417 2004 1347  713 2107  999]
valentines
tebow
mississippi
gymnastics
scams
patent
hulu
deathmatch
potter
fargo


In [33]:
# Print the top 20 words for each topic
for i,topic in enumerate(LDA_data.components_):
  print(f"topic {i+1} top 20 words " )
  print ([cv.get_feature_names_out()[j] for j in topic.argsort()[-20:][::-1]])


topic 1 top 20 words 
['trump', 'house', 'says', 'trumps', 'white', 'donald', 'star', 'nba', 'news', 'reveals', 'big', 'bad', 'biden', 'report', 'nfl', 'michael', 'mark', 'fox', 'chris', 'new']
topic 2 top 20 words 
['trump', 'super', 'team', 'bowl', 'sexual', 'twitter', 'gop', 'james', 'says', 'player', 'donald', 'school', 'state', 'trumps', 'olympics', 'new', 'love', 'texas', 'ban', 'stop']
topic 3 top 20 words 
['business', 'women', 'dead', 'says', 'time', 'qa', 'christmas', 'ceo', 'war', 'years', 'family', 'kim', 'perfect', 'director', 'prince', 'kardashian', 'red', 'new', 'story', 'just']
topic 4 top 20 words 
['travel', 'facebook', 'google', 'biden', 'game', 'apple', 'says', 'million', 'want', 'trump', 'black', 'going', 'wont', 'wine', 'data', 'kids', 'stephen', 'change', 'iphone', 'democrats']
topic 5 top 20 words 
['new', 'food', 'video', 'watch', 'know', 'need', 'heres', 'week', 'things', 'best', 'photos', 'recipes', 'york', 'make', 'apple', 'real', 'netflix', 'hotels', 'didnt

### Taking our best guess at the topics.
---
- TOPIC 1: **Travel**
- TOPIC 2: **Sports**
- TOPIC 3: **Food**
- TOPIC 4: **Politics**
- TOPIC 5: **Business**
- TOPIC 6: **Entertainment**
- TOPIC 7: **Technology**

### Assigning the Topic to the Headline

In [34]:
# Transform our DTM so we get an array with the (number_of_documents, number_of_topics).
topic_results = LDA.transform(dtm)

# Get the shape of the topic results
topic_results.shape

(23377, 7)

In [35]:
# Get the first headline's topic probability distribution rounded to 6 decimal places.
print(topic_results[0].round(6))

[0.035795 0.785307 0.035734 0.035714 0.035934 0.035801 0.035714]


In [36]:
# Get the sorted indices for each topic in the first headline.
sorted_topics = np.argsort(-topic_results[0])

# Print the ranking of topics for the headline
for rank, topic_index in enumerate(sorted_topics):
  print(f"rank:{rank+1} - topic: {topic_index+1} - probability:{topic_results[0,topic_index]}")


rank:1 - topic: 2 - probability:0.7853067747245356
rank:2 - topic: 5 - probability:0.03593406112721793
rank:3 - topic: 6 - probability:0.03580059101306713
rank:4 - topic: 1 - probability:0.035795498693943745
rank:5 - topic: 3 - probability:0.035734474471429446
rank:6 - topic: 7 - probability:0.03571430041055481
rank:7 - topic: 4 - probability:0.035714299559251386


In [38]:
# Get the topic with the highest probability.
topic_results[0].argmax() + 1

2

This means that our model thinks that the first article belongs to topic "2".

In [39]:
# Read in our original news headlines.
news_articles_df_2 = pd.read_csv(url_lc)

# Combine the original data with the topic label.
news_articles_df_2['topic'] = (topic_results.argmax(axis=1)+1)

In [40]:
# Get the first 20 rows.
news_articles_df_2.head(20)

Unnamed: 0,headline,topic
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates,2
1,The Only Shopping Guide For Cyber Monday You Need,4
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video,2
3,How To Say 'Cheers' In 20 Languages (AUDIO),7
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection,2
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies,1
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates,6
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts,6
8,The Major Problem With Electric Cars | TIME.com,2
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?,2


In [None]:
# Get the last 20 rows.
