In [2]:
# Import dependencies 
import pandas as pd
import numpy as np
import random
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Widen text columns so long headlines are shown in full instead of being truncated.
# The fully qualified option name is used: abbreviated names are resolved by pattern
# matching and stop working if another option ever matches the same pattern.
pd.set_option('display.max_colwidth', 200)

In [3]:
# Load the news_articles.csv into a DataFrame.
news_articles_df = pd.read_csv('Resources/news_articles.csv')
# Display the first 10 headlines.
news_articles_df.head(10)

Unnamed: 0,headline
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video
3,How To Say 'Cheers' In 20 Languages (AUDIO)
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts
8,The Major Problem With Electric Cars | TIME.com
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?


## Preprocess the Text

In [4]:
# Summarize the DataFrame: row count, column dtypes and non-null counts
# (a non-null count below the row count reveals missing headlines).
news_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23377 entries, 0 to 23376
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  23376 non-null  object
dtypes: object(1)
memory usage: 182.8+ KB


In [5]:
# Remove digits and non-alphabetic characters, keeping only ASCII letters and whitespace.
# Missing headlines are replaced with an empty string first; converting NaN with str()
# would otherwise inject the literal text "nan" into the corpus.
# NOTE: characters are deleted, not replaced by a space, so hyphenated words are fused
# (e.g. "36-Year-Old" becomes "YearOld").
news_articles_df['headline'] = (
    news_articles_df['headline']
    .fillna('')
    .astype(str)
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)
news_articles_df.head(10)

Unnamed: 0,headline
0,Is Too Young To Marry A YearOld The Bachelor Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New Delicate Video
3,How To Say Cheers In Languages AUDIO
4,Welcome To Hell Rio Police Warn They Cant Promise Olympic Protection
5,Conservative Pundit Points Out Where Real Blame For GOPs Descent Into Madness Lies
6,We Asked The American Public To Settle Of The Internets Dumbest Debates
7,Teen Mom OGs Catelynn Lowell Heads To Treatment Over Suicidal Thoughts
8,The Major Problem With Electric Cars TIMEcom
9,Why Is NobelWinning Economist Richard Thaler So Jovial


## Process the Text to Tokens and Counts.

In [6]:
# Build the bag-of-words vectorizer: drop English stop words, ignore terms found in
# more than 95% of the headlines (max_df) or in fewer than 10 headlines (min_df).
cv = CountVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.95,
)
cv

In [7]:
# Get the cleaned headlines; this Series is the corpus passed to the vectorizer.
headlines= news_articles_df['headline']
# Print a plain-text preview of the Series (pandas abbreviates the middle rows).
print(headlines)

0                                Is  Too Young To Marry A YearOld The Bachelor Investigates
1                                         The Only Shopping Guide For Cyber Monday You Need
2                         Taylor Swift Dances When No One Can See Her In New Delicate Video
3                                                     How To Say Cheers In  Languages AUDIO
4                      Welcome To Hell Rio Police Warn They Cant Promise Olympic Protection
                                                ...                                        
23372    Bidens Health Agenda Starts With Reversing Everything Trump Did In The Last  Years
23373                                       You Know Where You Are From the Very First Bite
23374                                    Cheeses We Would Happily Marry If That Was Allowed
23375              Donald Trump Has A Surprising Response To Golfer Rory McIlroys Criticism
23376                                   Fast Food Strikes Hit Cities Throughout 

In [8]:
# Learn the vocabulary and transform each row of the headlines Series into a row of the
# document-term matrix (DTM): one row per headline, one column per vocabulary word.
dtm = cv.fit_transform(headlines)
# Get the shape of the DTM: (number_of_documents, vocabulary_size).
print(dtm.shape)

(23377, 3149)


In [9]:
# Vocabulary size: the number of distinct terms the vectorizer kept.
cv.get_feature_names_out().shape[0]

3149

In [10]:
# Look at the first 100 words in the vocabulary. This is a slice from the start of the
# feature-name array (alphabetical, as the output shows), not a random sample.
print(cv.get_feature_names_out()[:100])

['aaron' 'abandoned' 'ability' 'able' 'abortion' 'abroad' 'absolutely'
 'abuse' 'abused' 'access' 'accident' 'accidental' 'accidentally'
 'according' 'account' 'accounts' 'accusations' 'accused' 'accuser'
 'accusers' 'accuses' 'act' 'acting' 'action' 'activists' 'actor'
 'actress' 'actually' 'ad' 'adam' 'add' 'added' 'adding' 'address' 'adds'
 'adele' 'administration' 'admits' 'adorable' 'ads' 'adults' 'adventure'
 'adventures' 'advice' 'adviser' 'advocates' 'affleck' 'afford'
 'affordable' 'afghanistan' 'africa' 'age' 'agency' 'agenda' 'agent' 'ago'
 'agree' 'agrees' 'ahead' 'aid' 'aide' 'aides' 'aim' 'aims' 'air' 'airbnb'
 'airline' 'airlines' 'airplane' 'airport' 'airports' 'al' 'alabama'
 'alaska' 'album' 'alec' 'alex' 'alexandria' 'ali' 'alive' 'allegations'
 'alleged' 'allegedly' 'allen' 'allies' 'allow' 'allowed' 'allstar'
 'alternative' 'amazing' 'amazon' 'amazons' 'ambassador' 'amber' 'america'
 'american' 'americans' 'americas' 'amid' 'amy']


In [11]:
# Print the first 500 elements (word counts) from the 1st row, i.e., document.
# Slice the sparse matrix before densifying so only these 500 values are materialized
# instead of the entire document-term matrix.
print(dtm[0, :500].toarray().ravel())

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [12]:
# Get the feature names (words) from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Get the word counts of the first row (document) as a dense 1-D array.
# Only that single row is densified; converting the whole DTM just to read one row
# wastes memory. Despite its name, this array holds every count in the row, zeros included.
non_zero_elements = dtm[0].toarray().ravel()

# Get the indices (vocabulary positions) of the non-zero counts.
non_zero_indices = non_zero_elements.nonzero()[0]

# Print out the word and the number of times the word is in the row.
for idx in non_zero_indices:
    print(f"Word: {feature_names[idx]} | Word index {idx} | Count = {non_zero_elements[idx]}")

Word: bachelor | Word index 183 | Count = 1
Word: yearold | Word index 3131 | Count = 1
Word: young | Word index 3138 | Count = 1


In [13]:
# Materialize the DTM as a dense DataFrame whose columns are the vocabulary words.
dtm_df = pd.DataFrame(data=dtm.toarray(), columns=cv.get_feature_names_out())

# Preview the first 10 rows for an arbitrary window of 15 word columns (positions 180-194).
dtm_df.iloc[:, 180:195].head(10)

Unnamed: 0,awesome,awkward,baby,bachelor,backlash,backs,bacon,bad,bag,baking,baldwin,ball,balls,ban,banana
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## LDA

In [14]:
# Pick 7 topics to start with `n_components=7`; `random_state=42` fixes the seed so the fit is repeatable.
LDA = LatentDirichletAllocation(n_components=7,random_state=42)
# Fit the model with our DTM data. This may take a while if you have a large number of documents.
# NOTE(review): LDA_data holds whatever `fit` returns; in scikit-learn that is presumably the
# fitted estimator itself (i.e. the same object as LDA) -- confirm before relying on it.
LDA_data = LDA.fit(dtm)

In [15]:
# Get the topic-word weight matrix: one row per topic, one column per vocabulary word.
# NOTE(review): the printed values are not bounded by 1, so these look like unnormalized
# weights rather than probabilities -- see the scikit-learn docs for `components_`.
topic_word_distributions = LDA.components_
print(topic_word_distributions)

[[ 0.143217    0.14294557  0.14306439 ...  0.1428827   0.14289057
   0.14293477]
 [ 0.14332786  0.14309587  0.14288107 ...  0.14310538  0.14285729
   0.14291224]
 [ 0.14293411 15.14200616  0.14285739 ...  7.14309948 10.14275005
   0.14301088]
 ...
 [23.03928499  0.14298589  0.14303047 ...  0.14296974  0.14290313
  10.03799117]
 [ 0.24516125  0.14307613 10.14177916 ...  0.14303993  0.1428844
  15.24715296]
 [ 0.14296445  0.143033    0.14303783 ...  0.1430056   0.14285729
   0.14295204]]


In [17]:
# Check the matrix dimensions: (number_of_topics, vocabulary_size).
topic_word_distributions.shape

(7, 3149)

In [18]:
# Get the length of the array of each topic. It should be the same as the vocabulary.
# Iterate over the topic rows directly rather than re-indexing components_ with a counter.
for topic in LDA.components_:
    print(len(topic))

3149
3149
3149
3149
3149
3149
3149


In [19]:
# Get the array of the first topic
first_topic = LDA.components_[0]
# Each entry is the weight of one vocabulary word for this topic (a weight, not a rank).
# Words with lower values contribute less to the topic than words with higher values.
print(first_topic)

[0.143217   0.14294557 0.14306439 ... 0.1428827  0.14289057 0.14293477]


In [20]:
# Order the first topic's word indices from the largest weight to the smallest.
sorted_first_topic_indices = (-first_topic).argsort()

# Reorder the weights with those indices and print them, one per line, greatest first.
sorted_first_topic_values = first_topic.take(sorted_first_topic_indices)
for weight in sorted_first_topic_values:
    print(weight)

390.507603043464
310.30697188787036
251.20035978318657
201.88574335192388
196.71334727826869
182.72905588921574
182.43171557429534
148.02251499683888
136.13988584944423
133.94987965721387
133.30984218198228
129.83415949350857
102.14207599606854
102.14164893964859
96.68787869045279
96.13100278887252
94.87495404472263
94.53848162905375
92.68619962812035
90.74388827831194
90.35285453507389
89.7117449799848
88.02196255426469
86.31832526877487
83.82076454342808
83.72677107893034
83.13443588218702
80.62466999575497
77.39733127087273
76.8559314956792
75.74268925942414
71.14201487641492
68.138194617302
67.54972010499759
65.09924221997575
63.93933122466377
63.16016646710924
62.42473047198901
61.41204759846399
61.11790467320189
61.07513542913345
59.142434339466725
58.05419907652891
55.348671194265094
55.08014815376058
54.942769513704775
54.61180448868518
54.339505253802635
54.14204680035073
53.828496954133655
53.14211659179609
52.64517166762143
52.48566235653217
52.18154210808118
52.142197334705

## Using `argsort()`
---
- `argsort()` returns index positions from least to greatest.

In [18]:
# Sample array: position 0 holds 10, position 1 holds 200, position 2 holds 1.
arr = np.array([10, 200, 1])
# Indices that put the values in ascending order (1, 10, 200).
ascending_indices = arr.argsort()
print(f"The indices the the array, '10, 200, 1' from least to greatest: {ascending_indices}")
# Negating the values flips the ordering, which yields the indices for descending order.
descending_indices = (-arr).argsort()
print(f"The indices the the array, '10, 200, 1' from greatest to least: {descending_indices}")

The indices the the array, '10, 200, 1' from least to greatest: [2 0 1]
The indices the the array, '10, 200, 1' from greatest to least: [1 0 2]


In [22]:
# Get the vocabulary indices that would sort the first topic's weights in ascending order
# (first index = lowest weight, last index = highest weight). The array itself is not sorted.
first_topic.argsort()

array([1716, 1868,  999, ..., 2801, 2869, 1688])

In [20]:
# Get the value of the word that is least representative of this topic.
# The value is computed from the array instead of using an index copied by hand from an
# earlier output, so the cell stays correct if the vocabulary or the model changes.
print(f"The value of the word that is least representative of this topic is: {first_topic.min()}")
# Get the value of the word that is most representative of this topic
print(f"The value of the word that is most representative of this topic is: {first_topic.max()}")

The value of the word that is least representative of this topic is: 0.14285722531681794
The value of the word that is most representative of this topic is: 390.507603043464


In [23]:
# Get the indices of the top ten words for the first topic (e.g., top 10 words for topic 0):
# argsort() is ascending, so take the last 10 indices and reverse them to list the
# highest-weighted word first.
top_word_indices = first_topic.argsort()[-10:][::-1]
print(top_word_indices)

[1688 2869 2801 3042  247 3041 1544  250 2830  823]


In [24]:
# Get the top ten words from the indices.
# The vocabulary array is fetched once rather than being rebuilt on every loop iteration.
vocabulary = cv.get_feature_names_out()
for index in top_word_indices:
    print(vocabulary[index])

make
travel
things
ways
best
way
know
better
tips
dont


In [25]:
# Get the bottom ten words from the indices.
# argsort() is ascending, so the first 10 indices are the lowest-weighted words; the
# reversal lists them from the 10th-lowest down to the very lowest.
bottom_word_indices = first_topic.argsort()[:10][::-1]
# Fetch the vocabulary once rather than rebuilding it on every loop iteration.
vocabulary = cv.get_feature_names_out()
for index in bottom_word_indices:
    print(vocabulary[index])

manziel
mitch
marie
mcconnell
greene
franco
meter
fargo
nassar
marjorie


In [26]:
# Print the top 20 words for each topic
# The vocabulary array is fetched once up front; the original rebuilt it for every word
# of every topic.
vocabulary = cv.get_feature_names_out()
for index, topic in enumerate(LDA.components_):
    print(f"The Top 20 Words For Topic #{index+1}")
    # argsort() is ascending: take the last 20 indices and reverse for highest weight first.
    top_indices = topic.argsort()[-20:][::-1]
    # tolist() yields plain Python strings, so the printed list looks the same regardless
    # of how the installed numpy version represents its string scalars.
    print(vocabulary[top_indices].tolist())
    print('\n')

The Top 20 Words For Topic #1
['make', 'travel', 'things', 'ways', 'best', 'way', 'know', 'better', 'tips', 'dont', 'world', 'reasons', 'vacation', 'tom', 'work', 'perfect', 'didnt', 'free', 'just', 'heres']


The Top 20 Words For Topic #2
['like', 'nfl', 'new', 'national', 'state', 'player', 'football', 'great', 'google', 'looks', 'pay', 'guide', 'soccer', 'dies', 'players', 'police', 'billion', 'deal', 'sports', 'womens']


The Top 20 Words For Topic #3
['photos', 'food', 'best', 'recipes', 'need', 'dead', 'want', 'recipe', 'places', 'worlds', 'foods', 'america', 'eat', 'day', 'right', 'life', 'wine', 'visit', 'youll', 'cake']


The Top 20 Words For Topic #4
['trump', 'says', 'biden', 'game', 'donald', 'james', 'gop', 'john', 'sexual', 'joe', 'report', 'twitter', 'election', 'president', 'rep', 'house', 'million', 'covid', 'lebron', 'obama']


The Top 20 Words For Topic #5
['business', 'women', 'video', 'facebook', 'world', 'best', 'ceo', 'watch', 'house', 'qa', 'black', 'white', 'ub

### Taking our best guess at the topics.
---
- TOPIC 1: **Travel**
- TOPIC 2: **Sports**
- TOPIC 3: **Food**
- TOPIC 4: **Politics**
- TOPIC 5: **Business**
- TOPIC 6: **Entertainment**
- TOPIC 7: **Technology**

### Assigning the Topic to the Headline

In [27]:
# Transform our DTM so we get an array of shape (number_of_documents, number_of_topics):
# each row holds one headline's weight for every topic.
topic_results = LDA.transform(dtm)

# Get the shape of the topic results
topic_results.shape

(23377, 7)

In [28]:
# Get the first headline's topic probability distribution rounded to 6 decimal places.
# Position i in the printed array corresponds to topic i+1 in the 1-based numbering used here.
print(topic_results[0].round(6))

[0.035997 0.494534 0.035804 0.035731 0.035714 0.035751 0.326469]


In [29]:
# Topic indices for the first headline, ordered from most to least probable.
first_headline_probs = topic_results[0]
sorted_indices = (-first_headline_probs).argsort()
# Print the ranking of topics for the headline (ranks and topic numbers are 1-based).
print("Ranking of topics for the first headline:")
for rank, topic_index in enumerate(sorted_indices, start=1):
    print(f"   Rank {rank}: Topic {topic_index+1}, Probability: {first_headline_probs[topic_index]:.6f}")

Ranking of topics for the first headline:
   Rank 1: Topic 2, Probability: 0.494534
   Rank 2: Topic 7, Probability: 0.326469
   Rank 3: Topic 1, Probability: 0.035997
   Rank 4: Topic 3, Probability: 0.035804
   Rank 5: Topic 6, Probability: 0.035751
   Rank 6: Topic 4, Probability: 0.035731
   Rank 7: Topic 5, Probability: 0.035714


In [30]:
# Get the topic with the highest probability for the first headline.
# argmax() is 0-based, so 1 is added to match the 1-based topic numbering.
topic_results[0].argmax()+1

2

This means the model assigns the first headline to topic 2, the topic with the highest probability for that headline.

In [31]:
# Read in our original news headlines again, because the first DataFrame was overwritten
# with the cleaned text.
news_articles_df_2 = pd.read_csv('Resources/news_articles.csv')
# Combine the original data with the topic label: for every row take the most probable
# topic (argmax is 0-based, so add 1 for 1-based topic numbers). Labels are matched to
# headlines purely by row position.
news_articles_df_2['topic'] = topic_results.argmax(axis=1) + 1

In [32]:
# Get the first 10 rows.
news_articles_df_2.head(10)

Unnamed: 0,headline,topic
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates,2
1,The Only Shopping Guide For Cyber Monday You Need,2
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video,6
3,How To Say 'Cheers' In 20 Languages (AUDIO),4
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection,7
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies,6
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates,5
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts,2
8,The Major Problem With Electric Cars | TIME.com,5
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?,2


In [34]:
# Get the last 10 rows to spot-check the topic labels at the end of the data.
news_articles_df_2.tail(10)

Unnamed: 0,headline,topic
23367,"These Are 33 Of The Best, Most Iconic American Foods",3
23368,Does Your Marketing Plan Need an Exit Strategy?,1
23369,"Summer Fancy Food Show, Part I",3
23370,7 Reasons to Include Galapagos Islands on Your Bucket List,1
23371,"Biden To Republicans Threatening To Challenge Vaccine, Testing Mandates: ‘Have At It’",4
23372,Biden's Health Agenda Starts With Reversing Everything Trump Did In The Last 4 Years,4
23373,You Know Where You Are From the Very First Bite,1
23374,"9 Cheeses We Would Happily Marry, If That Was Allowed",5
23375,Donald Trump Has A Surprising Response To Golfer Rory McIlroy's Criticism,6
23376,Fast Food Strikes Hit Cities Throughout The Country,3
