<a href="https://colab.research.google.com/github/sowmyarshetty/NNClass/blob/main/Non_Negative_Matrix_Factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import the dependencies
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Show up to 200 characters per column so long headlines are not truncated in DataFrame output.
# The fully qualified option name is used rather than the abbreviated 'max_colwidth' pattern.
pd.set_option('display.max_colwidth', 200)

In [2]:
import gdown
# Mount Google Drive (For Colab Users)
from google.colab import drive
drive.mount('/content/drive')


# Google Drive link of the news headlines CSV.
url_lc = 'https://drive.google.com/uc?id=1dCjysGH7CjseeifERUin9gaoKchtcGnh'
# Local file the CSV is downloaded to (relative to the working directory).
NEWS_CSV_PATH = 'news_article.csv'
gdown.download(url_lc, NEWS_CSV_PATH, quiet=False)


# Read the headlines from the local copy instead of fetching the URL a second time.
news_articles_df = pd.read_csv(NEWS_CSV_PATH)
# Display the first five headlines.
news_articles_df.head()

Mounted at /content/drive


Downloading...
From: https://drive.google.com/uc?id=1dCjysGH7CjseeifERUin9gaoKchtcGnh
To: /content/news_article.csv
100%|██████████| 1.43M/1.43M [00:00<00:00, 12.6MB/s]


Unnamed: 0,headline
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates
1,The Only Shopping Guide For Cyber Monday You Need
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video
3,How To Say 'Cheers' In 20 Languages (AUDIO)
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection


## Preprocess the Text

In [4]:
# Strip everything except ASCII letters and whitespace from each headline.
# Every value is passed through str() first, so non-string entries are cleaned as text too.
non_letter_pattern = re.compile(r"[^a-zA-Z\s ]")
news_articles_df["headline"] = news_articles_df["headline"].map(
    lambda raw_headline: non_letter_pattern.sub("", str(raw_headline))
)

## Create a TF-IDF matrix from our documents.

In [5]:
# Build the TF-IDF vectorizer with English stop words ignored, max_df set to 0.95
# and min_df set to 10.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.95,
)

In [6]:
# Fit the vectorizer on the cleaned headlines and build the document-term matrix (DTM).
headlines = news_articles_df["headline"]
dtm = vectorizer.fit_transform(headlines)
# Show the dimensions of the DTM.
dtm.shape

(23377, 3149)

In [None]:
# Print the sparse matrix of the transformed data.
# We have 23,377 documents, the first number in the tuple represents the document number.
# The second number in the tuple represents the index of the word in the vocabulary created by fit_transform.
# The last number represents the value of the TF-IDF score for the vocabulary word.
print(dtm)


  (0, 183)	0.6345657383532866
  (0, 3131)	0.5447013164784799
  (0, 3138)	0.5482944460185232
  (1, 1881)	0.35546248241584566
  (1, 1831)	0.4901265408973161
  (1, 677)	0.46003283951735585
  (1, 1220)	0.4113174150555592
  (1, 2510)	0.5026033904983209
  (2, 2981)	0.4263827647977379
  (2, 1892)	0.3375155487496725
  (2, 2725)	0.6290303789178769
  (2, 2749)	0.5555193737702372
  (3, 168)	0.8128170960317599
  (3, 2414)	0.5825189854403862
  (4, 1949)	0.3359899169063007
  (4, 2158)	0.4724418120156065
  (4, 2081)	0.38124866781045463
  (4, 2327)	0.3818919620518235
  (4, 1281)	0.4127191813776179
  (4, 3060)	0.4498670580656673
  (5, 1618)	0.35810060060128407
  (5, 1681)	0.3894063836492395
  (5, 1182)	0.4127062165437718
  (5, 277)	0.4026181555145373
  (5, 2228)	0.30472503847033977
  :	:
  (23371, 1702)	0.4830356653587415
  (23371, 2781)	0.4443808362877062
  (23371, 2277)	0.3677085237137301
  (23371, 2963)	0.4044727605382417
  (23371, 255)	0.3064420691643532
  (23372, 2630)	0.452129735249142
  (23372, 

In [7]:
# Get the feature names (words) from the TfidfVectorizer.
feature_names = vectorizer.get_feature_names_out()

# Densify only the first row of the DTM. Calling dtm.toarray()[0] would first convert
# the entire matrix to a dense array just to read a single row.
# Despite its name, this array holds one value per vocabulary word (zeros included).
non_zero_elements = dtm[0].toarray().ravel()

# Get the indices for each non-zero element.
non_zero_indices = non_zero_elements.nonzero()[0]

# Print each word, its vocabulary index and its value in the first row.
# The label reads "count", but dtm was produced by a TfidfVectorizer, so the value
# is a TF-IDF weight rather than an occurrence count.
for i in non_zero_indices:
    print(f"The word : {feature_names[i]} - index : {i} - count : {non_zero_elements[i]}")


The word : bachelor - index : 183 - count : 0.6345657383532867
The word : yearold - index : 3131 - count : 0.5447013164784797
The word : young - index : 3138 - count : 0.5482944460185232


## Applying NMF

In [8]:
# Create the NMF model with 7 topics and a fixed random_state.
NMF_model = NMF(
    random_state=45,
    n_components=7,
)

# Fit the model with our DTM data. This may take a while with a large number of documents.
# NOTE(review): scikit-learn's fit() returns the estimator itself, so NMF_data should be
# the same object as NMF_model — confirm against the NMF documentation.
NMF_data = NMF_model.fit(dtm)


In [9]:
# Get the length of the array of each topic. It should be the same as the vocabulary.
# Iterate over the rows directly instead of enumerating and re-indexing components_.
for topic in NMF_data.components_:
    print(len(topic))

3149
3149
3149
3149
3149
3149
3149


In [11]:
# Take the first row of the fitted components: the word weights of the first topic.
topic_word_weights = NMF_data.components_
first_topic = topic_word_weights[0]

# Each entry is the weight of one vocabulary word in this topic.
# Lower values have less impact than higher values.
print(first_topic)

[0.00013266 0.00022071 0.         ... 0.00100991 0.         0.        ]


In [14]:
# Sort the vocabulary indices of the first topic by descending weight (sorting the
# negated weights ascending), then keep the first ten. The original sorted the array
# twice and never used the first result.
sorted_first_topic = np.argsort(-first_topic)
top_word = sorted_first_topic[:10]
print(top_word)

[ 247 3114   94 1079   97 1336 1960 3115 2295  210]


In [16]:
# Get the top ten words from the indices.
# Fetch the vocabulary once instead of rebuilding it on every loop iteration.
feature_names = vectorizer.get_feature_names_out()
for i in top_word:
    print(feature_names[i])

best
world
america
food
americas
hotels
order
worlds
restaurants
bars


In [17]:
# Print the top 30 words for each topic
# Fetch the vocabulary once; it does not change between topics or words.
feature_names = vectorizer.get_feature_names_out()
for i, topic in enumerate(NMF_data.components_):
    print(f"topic {i+1} top 30 words " )
    # argsort is ascending: take the last 30 indices and reverse them for highest-first order.
    top_indices = topic.argsort()[-30:][::-1]
    print([feature_names[j] for j in top_indices])

topic 1 top 30 words 
['best', 'world', 'america', 'food', 'americas', 'hotels', 'order', 'worlds', 'restaurants', 'bars', 'cities', 'places', 'worst', 'time', 'taste', 'deals', 'beaches', 'according', 'deathmatch', 'eat', 'test', 'black', 'ways', 'foods', 'huffpost', 'cheese', 'friday', 'hotel', 'things', 'way']
topic 2 top 30 words 
['new', 'york', 'watch', 'week', 'apple', 'city', 'thats', 'video', 'iphone', 'years', 'videos', 'youtube', 'just', 'world', 'amazon', 'like', 'orleans', 'hulu', 'game', 'year', 'prime', 'netflix', 'heres', 'love', 'shows', 'ad', 'time', 'rumors', 'tech', 'trailer']
topic 3 top 30 words 
['photos', 'recipes', 'make', 'food', 'like', 'dessert', 'love', 'cheese', 'cook', 'eat', 'better', 'order', 'favorite', 'need', 'cream', 'ways', 'things', 'summer', 'worst', 'ice', 'easy', 'time', 'dinner', 'instagram', 'italian', 'coffee', 'taste', 'fall', 'chocolate', 'breakfast']
topic 4 top 30 words 
['trump', 'donald', 'says', 'twitter', 'house', 'trumps', 'biden', 

### Taking our best guess at the topics.
---
- TOPIC 1:
- TOPIC 2:
- TOPIC 3:
- TOPIC 4:
- TOPIC 5:
- TOPIC 6:
- TOPIC 7:

## Assigning the Topic to the Headline

In [None]:
# Transform our DTM so we get an array with the (number_of_documents, number_of_topics).
topic_results = NMF_model.transform(dtm)

# Get the shape of the topic results
topic_results.shape


(23377, 7)

In [None]:
# Get the sorted indices for each topic in the first headline.
# Row 0 of the transformed DTM holds the first headline's weight for every topic;
# argsort is ascending, so reverse it to rank the strongest topic first.
first_headline_weights = NMF_model.transform(dtm)[0]
sorted_indices = first_headline_weights.argsort()[::-1]

# Print the ranking of topics for the headline
# Topics are reported 1-based. The values are NMF weights, labelled "Probability"
# to match the recorded output.
print("Ranking of topics for the first headline:")
for rank, topic_index in enumerate(sorted_indices, start=1):
    print(f"   Rank {rank}: Topic {topic_index + 1}, Probability: {first_headline_weights[topic_index]:.6f}")


Ranking of topics for the first headline:
   Rank 1: Topic 7, Probability: 0.002277
   Rank 2: Topic 4, Probability: 0.001495
   Rank 3: Topic 2, Probability: 0.001041
   Rank 4: Topic 5, Probability: 0.000894
   Rank 5: Topic 3, Probability: 0.000472
   Rank 6: Topic 1, Probability: 0.000284
   Rank 7: Topic 6, Probability: 0.000208


In [None]:
# Read in our original news headlines.
# Reload the downloaded CSV, because the in-memory headlines were stripped of
# digits and punctuation during preprocessing.
news_articles_df = pd.read_csv('news_article.csv')

# Combine the original data with the topic label.
# Each headline gets the topic with the largest NMF weight, numbered from 1.
news_articles_df['topic'] = NMF_model.transform(dtm).argmax(axis=1) + 1


In [None]:
# Get the first 10 rows.
news_articles_df.head(10)


Unnamed: 0,headline,topic
0,Is 22 Too Young To Marry A 36-Year-Old? 'The Bachelor' Investigates,7
1,The Only Shopping Guide For Cyber Monday You Need,7
2,Taylor Swift Dances When No One Can See Her In New 'Delicate' Video,2
3,How To Say 'Cheers' In 20 Languages (AUDIO),4
4,'Welcome To Hell': Rio Police Warn They Can't Promise Olympic Protection,4
5,Conservative Pundit Points Out Where Real Blame For GOP’s ‘Descent Into Madness’ Lies,4
6,We Asked The American Public To Settle 5 Of The Internet's Dumbest Debates,4
7,'Teen Mom OG's' Catelynn Lowell Heads To Treatment Over Suicidal Thoughts,4
8,The Major Problem With Electric Cars | TIME.com,4
9,Why Is Nobel-Winning Economist Richard Thaler So Jovial?,4


In [None]:
# Get the last 10 rows.
news_articles_df.tail(10)


Unnamed: 0,headline,topic
23367,"These Are 33 Of The Best, Most Iconic American Foods",1
23368,Does Your Marketing Plan Need an Exit Strategy?,7
23369,"Summer Fancy Food Show, Part I",3
23370,7 Reasons to Include Galapagos Islands on Your Bucket List,7
23371,"Biden To Republicans Threatening To Challenge Vaccine, Testing Mandates: ‘Have At It’",4
23372,Biden's Health Agenda Starts With Reversing Everything Trump Did In The Last 4 Years,4
23373,You Know Where You Are From the Very First Bite,7
23374,"9 Cheeses We Would Happily Marry, If That Was Allowed",6
23375,Donald Trump Has A Surprising Response To Golfer Rory McIlroy's Criticism,4
23376,Fast Food Strikes Hit Cities Throughout The Country,3
