In [1]:
# Import dependencies 
import pandas as pd
import numpy as np
import random
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Widen the column display so long headlines are not truncated in DataFrame output.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load the news_articles.csv into a DataFrame.
news_articles_df = pd.read_csv('Resources/news_articles.csv')
# Display the first 10 headlines.
news_articles_df.head(10)

Unnamed: 0,headline
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video
3,How To Say 'Cheers' In 20 Languages (AUDIO)
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts
8,The Major Problem With Electric Cars | TIME.com
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?


## Preprocess the Text

In [3]:
# Get the info on the DataFrame: row count, column dtypes, and non-null counts
# (the non-null count shows whether any headlines are missing).
news_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23377 entries, 0 to 23376
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   headline  23376 non-null  object
dtypes: object(1)
memory usage: 182.8+ KB


In [4]:
# Remove digits, punctuation, and every other non-alphabetic character, keeping only
# letters and whitespace.
# Missing headlines are replaced with an empty string first: calling str() on NaN would
# otherwise inject the literal text "nan" into the corpus as if it were a real word.
# NOTE: hyphens are removed rather than replaced by a space, so hyphenated words are joined
# (e.g. "Year-Old" becomes "YearOld").
news_articles_df['headline'] = (
    news_articles_df['headline']
    .fillna('')
    .astype(str)
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)
news_articles_df.head(10)

Unnamed: 0,headline
0,Is Too Young To Marry A YearOld The Bachelor Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New Delicate Video
3,How To Say Cheers In Languages AUDIO
4,Welcome To Hell Rio Police Warn They Cant Promise Olympic Protection
5,Conservative Pundit Points Out Where Real Blame For GOPs Descent Into Madness Lies
6,We Asked The American Public To Settle Of The Internets Dumbest Debates
7,Teen Mom OGs Catelynn Lowell Heads To Treatment Over Suicidal Thoughts
8,The Major Problem With Electric Cars TIMEcom
9,Why Is NobelWinning Economist Richard Thaler So Jovial


## Process the Text to Tokens and Counts.

In [5]:
# Create an instance of the CountVectorizer that uses the "english" stopwords.
# max_df=0.95 ignores terms that appear in more than 95% of the headlines;
# min_df=10 ignores terms that appear in fewer than 10 headlines.
cv = CountVectorizer(max_df=0.95,min_df=10, stop_words='english')
cv

In [6]:
# Get the cleaned headlines as a Series; this is the corpus handed to the vectorizer.
headlines= news_articles_df['headline']
print(headlines)

0                                Is  Too Young To Marry A YearOld The Bachelor Investigates
1                                         The Only Shopping Guide For Cyber Monday You Need
2                         Taylor Swift Dances When No One Can See Her In New Delicate Video
3                                                     How To Say Cheers In  Languages AUDIO
4                      Welcome To Hell Rio Police Warn They Cant Promise Olympic Protection
                                                ...                                        
23372    Bidens Health Agenda Starts With Reversing Everything Trump Did In The Last  Years
23373                                       You Know Where You Are From the Very First Bite
23374                                    Cheeses We Would Happily Marry If That Was Allowed
23375              Donald Trump Has A Surprising Response To Golfer Rory McIlroys Criticism
23376                                   Fast Food Strikes Hit Cities Throughout 

In [7]:
# Learn the vocabulary from the headlines and transform each row of the Series into a
# document-term matrix (DTM) of word counts.
dtm = cv.fit_transform(headlines)

# Get the shape of the DTM: (number of documents, number of vocabulary terms).
print(dtm.shape)

(23377, 3149)


In [8]:
# Get the length of the vocabulary (this should equal the number of columns in the DTM).
len(cv.get_feature_names_out())

3149

In [9]:
# Look at the first 100 words in the vocabulary (a positional slice, not a random sample).
print(cv.get_feature_names_out()[:100])

['aaron' 'abandoned' 'ability' 'able' 'abortion' 'abroad' 'absolutely'
 'abuse' 'abused' 'access' 'accident' 'accidental' 'accidentally'
 'according' 'account' 'accounts' 'accusations' 'accused' 'accuser'
 'accusers' 'accuses' 'act' 'acting' 'action' 'activists' 'actor'
 'actress' 'actually' 'ad' 'adam' 'add' 'added' 'adding' 'address' 'adds'
 'adele' 'administration' 'admits' 'adorable' 'ads' 'adults' 'adventure'
 'adventures' 'advice' 'adviser' 'advocates' 'affleck' 'afford'
 'affordable' 'afghanistan' 'africa' 'age' 'agency' 'agenda' 'agent' 'ago'
 'agree' 'agrees' 'ahead' 'aid' 'aide' 'aides' 'aim' 'aims' 'air' 'airbnb'
 'airline' 'airlines' 'airplane' 'airport' 'airports' 'al' 'alabama'
 'alaska' 'album' 'alec' 'alex' 'alexandria' 'ali' 'alive' 'allegations'
 'alleged' 'allegedly' 'allen' 'allies' 'allow' 'allowed' 'allstar'
 'alternative' 'amazing' 'amazon' 'amazons' 'ambassador' 'amber' 'america'
 'american' 'americans' 'americas' 'amid' 'amy']


In [10]:
# Print the first 500 elements (word counts) from the 1st row, i.e., document.
# Slice the sparse matrix first so only this 1 x 500 piece is converted to a dense array,
# instead of densifying the entire DTM just to read part of one row.
print(dtm[0, :500].toarray().ravel())

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [11]:
# Get the feature names (words) from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Get the word counts of the first row (document) as a dense 1-D array. Only this single
# row is converted, instead of densifying the entire DTM to read one document.
# NOTE: despite its name, this array holds every count in the row, zeros included.
non_zero_elements = dtm[0].toarray().ravel()

# Get the indices for each non-zero element.
non_zero_indices = non_zero_elements.nonzero()[0]

# Print out the word and the number of times the word is in the row.
for idx in non_zero_indices:
    print(f"Word: {feature_names[idx]} | Word index {idx} | Count = {non_zero_elements[idx]}")

Word: bachelor | Word index 183 | Count = 1
Word: yearold | Word index 3131 | Count = 1
Word: young | Word index 3138 | Count = 1


In [11]:
# Build a DataFrame from the DTM with one column per vocabulary word.
# NOTE: toarray() materializes the full dense matrix, which can be large for a big corpus.
dtm_df = pd.DataFrame(
    data=dtm.toarray(),
    columns=cv.get_feature_names_out(),
)

# Display 15 consecutive columns (positions 180-194) and the first 10 rows of the DataFrame.
dtm_df.iloc[:, 180:195].head(10)

Unnamed: 0,awesome,awkward,baby,bachelor,backlash,backs,bacon,bad,bag,baking,baldwin,ball,balls,ban,banana
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## LDA

In [12]:
# Create an instance of the LatentDirichletAllocation() class with 7 topics.
# random_state is fixed so the fitted topics are reproducible between runs.
LDA = LatentDirichletAllocation(n_components=7, random_state=42)


# Fit the model with our DTM data. This may take a while if you have a large number of documents.
# fit() returns the fitted estimator itself, so LDA_data refers to the same object as LDA.
LDA_data = LDA.fit(dtm)

In [14]:
# Intentionally disabled: the model was already fitted on the sparse `dtm`, so refitting on
# the dense `dtm_df` is not needed.
# LDA.fit(dtm_df)

In [15]:
# Get the topic-word weights: one row per topic, one column per vocabulary word.
# These are unnormalized weights, not probabilities (individual values can exceed 1).
topic_word_distributions = LDA.components_
print(topic_word_distributions)

[[ 0.14321209  0.14294539  0.14305075 ...  0.14288243  0.14290029
   0.14292816]
 [ 0.14333838  0.14308867  0.14287735 ...  0.16051576  0.1428573
   0.14291071]
 [ 0.14292884 15.1399795   0.14285738 ...  7.12553361 10.1427071
   0.14305028]
 ...
 [22.65736546  0.14299106  0.14302794 ...  0.14296461  0.14293316
  11.02329474]
 [ 0.62705853  0.14307883 10.14179871 ...  0.14305528  0.14288757
  14.26188787]
 [ 0.14296621  0.14505918  0.14304493 ...  0.14296954  0.1428573
   0.14288724]]


In [16]:
# Shape is (number of topics, number of vocabulary words).
topic_word_distributions.shape

(7, 3149)

In [17]:
# Confirm that every topic's weight array has one entry per vocabulary word.
for topic_weights in LDA.components_:
    print(len(topic_weights))

3149
3149
3149
3149
3149
3149
3149


In [24]:
# Optional check, left disabled: the total weight assigned to each topic (one sum per row).
# topic_word_distributions.sum(axis=1)

In [25]:
# Get the array of the first topic 
first_topic = LDA.components_[0]


# These are the weights of each word for this topic (weights, not rank positions).
# Lower values have less impact than higher values.
first_topic

array([0.14321209, 0.14294539, 0.14305075, ..., 0.14288243, 0.14290029,
       0.14292816])

In [26]:
# One weight per vocabulary word.
first_topic.shape

(3149,)

In [28]:
# Positions of the first topic's weights, ordered from the largest weight to the smallest.
sorted_first_topic_indices = (-first_topic).argsort()

# Reorder the weights with those positions so the values run from greatest to least.
sorted_first_topic_values = first_topic[sorted_first_topic_indices]

# Print every weight on its own line (this emits one line per vocabulary word).
for weight in sorted_first_topic_values:
    print(weight)

386.8946721456929
303.8724611834139
239.8867757811359
202.98456444306802
196.60982818851144
179.1763202843372
176.20552862733817
144.33338539023154
136.06560230840088
129.74678659811823
126.81867004037085
124.05126006706051
102.14204684517696
102.12488814292759
93.56699528792718
92.8891360842591
90.29553589123128
89.29738642533368
88.15565926932933
87.30612192464483
86.4913700850266
86.33677637387935
84.40872607882477
82.56347769399977
81.38158465336727
80.57818773849942
80.52972098529152
79.29385515136156
79.11614819598432
76.25157499323097
74.19715849295667
71.14202433209842
67.9134509381027
64.62417970088529
63.35410326469403
63.237365932625046
61.51219006568088
61.30446953698129
60.89796909780544
60.59811863083581
59.142446005972545
58.22426756107582
56.61997089344968
56.02333389832764
55.782699678394216
54.84830404590975
54.1315834444488
53.14743293665913
53.142097708187066
52.14221566765927
51.93067904319547
51.352302331563
51.194632786996436
50.84039768235432
50.65370209351697
5

## Using `argsort()`
---
- `argsort()` returns the index positions that would sort the values from least to greatest; it does not return the sorted values themselves.

In [18]:
# Small worked example: index 0 holds 10, index 1 holds 200, index 2 holds 1.
arr = np.array([10, 200, 1])
# argsort() gives the positions that would order the values ascending, i.e., 1, 10, 200.
ascending_order = arr.argsort()
# Negating the values flips the ordering, giving the positions from greatest to least.
descending_order = (-arr).argsort()
print(f"The indices the the array, '10, 200, 1' from least to greatest: {ascending_order}")
print(f"The indices the the array, '10, 200, 1' from greatest to least: {descending_order}")

The indices the the array, '10, 200, 1' from least to greatest: [2 0 1]
The indices the the array, '10, 200, 1' from greatest to least: [1 0 2]


In [29]:
# Get the indices that would sort the first topic's weights from least to greatest
# (argsort returns positions, not the sorted values themselves).
first_topic.argsort()

array([1716,  999, 1098, ..., 2801, 2869, 1688])

In [30]:
# Derive the extreme positions from the sort order instead of hard-coding indices copied
# from an earlier output, so this cell stays correct if the model or vocabulary changes.
ascending_indices = first_topic.argsort()
least_representative_index = ascending_indices[0]
most_representative_index = ascending_indices[-1]

# Get the value of the word that is least representative of this topic
print(f"The value of the word that is least representative of this topic is: {first_topic[least_representative_index]}")
# Get the value of the word that is most representative of this topic
print(f"The value of the word that is most representative of this topic is: {first_topic[most_representative_index]}")

The value of the word that is least representative of this topic is: 0.14285722574347814
The value of the word that is most representative of this topic is: 386.8946721456929


In [33]:
# Indices of the ten highest-weighted words for the first topic (topic 0), strongest first:
# reverse the ascending sort order, then keep the leading ten positions.
top_word_indices = np.argsort(first_topic)[::-1][:10]
print(top_word_indices)

[1688 2869 2801 3042  247 1544 3041  250 2830 2233]


In [34]:
# Get the top ten words from the indices.
# Fetch the vocabulary once instead of rebuilding it on every loop iteration.
vocabulary = cv.get_feature_names_out()
for index in top_word_indices:
    print(vocabulary[index])

make
travel
things
ways
best
know
way
better
tips
reasons


In [35]:
# Get the bottom ten words from the indices: take the ten lowest-weighted positions and
# reverse them so the very lowest-weighted word is printed last.
bottom_word_indices = first_topic.argsort()[:10][::-1]
# Fetch the vocabulary once instead of rebuilding it on every loop iteration.
vocabulary = cv.get_feature_names_out()
for index in bottom_word_indices:
    print(vocabulary[index])

carreys
wells
manziel
mitch
marie
mcconnell
greene
franco
fargo
marjorie


In [36]:
# Print the top 20 words for each topic
# Fetch the vocabulary once; looking it up inside the comprehension would rebuild it for
# every word of every topic.
vocabulary = cv.get_feature_names_out()
for index, topic in enumerate(LDA.components_):
    # Topics are numbered from 1 in the printed heading.
    print(f"The Top 20 Words For Topic #{index+1}")
    # Take the 20 highest-weighted positions and reverse them so the strongest word is first.
    print([vocabulary[i] for i in topic.argsort()[-20:][::-1]])
    print('\n')

The Top 20 Words For Topic #1
['make', 'travel', 'things', 'ways', 'best', 'know', 'way', 'better', 'tips', 'reasons', 'dont', 'world', 'vacation', 'tom', 'perfect', 'free', 'just', 'didnt', 'work', 'home']


The Top 20 Words For Topic #2
['like', 'nfl', 'new', 'national', 'player', 'state', 'great', 'looks', 'google', 'pay', 'football', 'guide', 'soccer', 'dies', 'players', 'billion', 'police', 'deal', 'sports', 'womens']


The Top 20 Words For Topic #3
['photos', 'food', 'best', 'recipes', 'need', 'dead', 'want', 'recipe', 'places', 'eat', 'worlds', 'foods', 'america', 'right', 'life', 'visit', 'youll', 'new', 'day', 'high']


The Top 20 Words For Topic #4
['trump', 'says', 'game', 'biden', 'donald', 'james', 'gop', 'sexual', 'john', 'joe', 'report', 'twitter', 'president', 'election', 'rep', 'million', 'love', 'house', 'just', 'lebron']


The Top 20 Words For Topic #5
['business', 'women', 'video', 'world', 'facebook', 'best', 'ceo', 'house', 'watch', 'qa', 'white', 'black', 'good',

### Taking our best guess at the topics.
---
- TOPIC 1: **Travel**
- TOPIC 2: **Sports**
- TOPIC 3: **Food**
- TOPIC 4: **Politics**
- TOPIC 5: **Business**
- TOPIC 6: **Entertainment**
- TOPIC 7: **Technology**

### Assigning the Topic to the Headline

In [38]:
# Display the sparse DTM's summary (shape, dtype, and number of stored elements).
dtm

<23377x3149 sparse matrix of type '<class 'numpy.int64'>'
	with 107082 stored elements in Compressed Sparse Row format>

In [39]:
# Transform our DTM to get each document's topic distribution: an array of shape
# (number_of_documents, number_of_topics).
topic_results = LDA.transform(dtm)

# Get the shape of the topic results
topic_results.shape

(23377, 7)

In [44]:
# Show the first (cleaned) headline, the document whose topic distribution is inspected next.
news_articles_df.iloc[0]

headline    Is  Too Young To Marry A YearOld The Bachelor Investigates
Name: 0, dtype: object

In [42]:
# Get the first headline's topic probability distribution (one value per topic; the values
# are shown as-is, without rounding).
topic_results[0]

array([0.5786167 , 0.24236212, 0.03575719, 0.03574884, 0.0357143 ,
       0.03577465, 0.0360262 ])

In [45]:
# Order the topic indices of the first headline from most to least probable.
sorted_indices = (-topic_results[0]).argsort()
# Report each topic's rank; both ranks and topic numbers are shown 1-based.
print("Ranking of topics for the first headline:")
for rank, topic_index in enumerate(sorted_indices, start=1):
    print(f"   Rank {rank}: Topic {topic_index+1}, Probability: {topic_results[0, topic_index]:.6f}")

Ranking of topics for the first headline:
   Rank 1: Topic 1, Probability: 0.578617
   Rank 2: Topic 2, Probability: 0.242362
   Rank 3: Topic 7, Probability: 0.036026
   Rank 4: Topic 6, Probability: 0.035775
   Rank 5: Topic 3, Probability: 0.035757
   Rank 6: Topic 4, Probability: 0.035749
   Rank 7: Topic 5, Probability: 0.035714


In [46]:
# Get the topic with the highest probability. argmax() is 0-based, so 1 is added to match
# the 1-based topic numbers used when the topics were printed.
topic_results[0].argmax()+1

1

This means that our model thinks that the first headline most likely belongs to topic "1".

In [47]:
# Read in our original news headlines. 
news_articles_df_2 = pd.read_csv('Resources/news_articles.csv')

# Combine the original data with the topic label. 
news_articles_df_2['topic'] = (topic_results.argmax(axis=1)+1)

In [49]:
# Get the first 20 rows of the original headlines together with their assigned topic.
news_articles_df_2.head(20)

Unnamed: 0,headline,topic
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates,1
1,The Only Shopping Guide For Cyber Monday You Need,2
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video,6
3,How To Say 'Cheers' In 20 Languages (AUDIO),4
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection,7
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies,6
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates,5
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts,2
8,The Major Problem With Electric Cars | TIME.com,5
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?,2


In [50]:
# Get the last 20 rows of the original headlines together with their assigned topic.
news_articles_df_2.tail(20)

Unnamed: 0,headline,topic
23357,'Once Upon a Time' Ginnifer Goodwin's Favorite Healthy Recipe,3
23358,Aaron Paul And Lauren Parsekian Welcome First Child Together,5
23359,Photographer Anthony Tortoriello Captures Pipeline From Above,5
23360,Princess Beatrice Marries Edoardo Mapelli Mozzi In Front Of The Queen,2
23361,Hollywood Celebrities React To Bill Cosby Guilty Verdict,7
23362,Aly Raisman Just Accepted A Date With An Oakland Raider,7
23363,Hidden-Camera Video Reveals Chicken McNuggets' ‘Disturbing Secret’,5
23364,Rep. Madison Cawthorn Was Stopped Trying To Bring Gun On Plane In February,7
23365,"Dear Millennials, From Baby Boomers (And Vice Versa)",1
23366,9 Party Beaches You'll Want To Celebrate Life At This Summer,3
