### Reference

[Tutorial: Extracting Keywords with TF-IDF and Python’s Scikit-Learn](https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X8Ysl1lKhNg) &mdash; Kavita Ganesan

In [1]:
import feedparser, time, datetime

In [76]:
import pandas as pd

In [141]:
from sklearn.feature_extraction.text import CountVectorizer
import re
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
 
import re
def pre_process(text):
    # lowercase
    text=text.lower()
    #remove tags
    text=re.sub("","",text)
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    # remove numbers
    text = re.sub(r'[0-9\,$]+', '', text)
    return text

def tolist(text):
    return text.split()

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [122]:
hdlines_df = pd.read_csv('2020-headlines.csv') 
hdlines_df.dropna(inplace=True)
hdlines_df.drop(columns=['SNO', 'Website'], inplace=True)
hdlines_df
df_idf=hdlines_df
df_idf

Unnamed: 0,News
0,French daily coronavirus death toll beyond the...
1,3 charts that show the U.S. restaurant industr...
2,Farmer at centre of COVID-19 outbreak spent $7...
3,Video calls connect anxious parents to hospita...
4,Coronavirus: London key workers to star on cov...
...,...
141203,Chinese doctors wear hazmat suits to treat cor...
141204,Dow Jones falls 150 points due to US Chinese c...
141205,CDC Confirms Second Case Of Coronavirus In The...
141206,Queues to buy face masks in China as Wuhan cor...


In [123]:
df_idf['text'] = df_idf['News']
df_idf['text'] = df_idf['text'].apply(lambda x:pre_process(x))
df_idf['text']
docs = df_idf['text']
docs

0         french daily coronavirus death toll beyond the...
1          charts that show the u s restaurant industry ...
2         farmer at centre of covid outbreak spent to ho...
3         video calls connect anxious parents to hospita...
4         coronavirus london key workers to star on cove...
                                ...                        
141203    chinese doctors wear hazmat suits to treat cor...
141204    dow jones falls points due to us chinese coron...
141205    cdc confirms second case of coronavirus in the...
141206    queues to buy face masks in china as wuhan cor...
141207    parents abandon their two children at chinese ...
Name: text, Length: 141208, dtype: object

In [160]:
from collections import defaultdict

wordsFreq = defaultdict(int)
lines = docs.tolist()
for line in lines:
    words = line.split()
    for word in words:
        if len(word) > 3:
            wordsFreq[word] += 1


sorted(wordsFreq.items(), key=lambda k_v: k_v[1], reverse=True)[:20]

[('coronavirus', 92608),
 ('covid', 13557),
 ('from', 9965),
 ('says', 9774),
 ('with', 9085),
 ('trump', 8986),
 ('lockdown', 8684),
 ('after', 8381),
 ('cases', 7467),
 ('china', 7318),
 ('pandemic', 6873),
 ('amid', 6648),
 ('over', 6409),
 ('virus', 5119),
 ('will', 4982),
 ('outbreak', 4712),
 ('death', 4610),
 ('more', 4467),
 ('during', 4276),
 ('could', 3994)]

In [125]:
#instantiate CountVectorizer() 
cv=CountVectorizer(max_df=0.85,stop_words='english')
word_count_vector=cv.fit_transform(df_idf['News'])
word_count_vector.shape
# word_count_vector

word_count_vector=cv.fit_transform(docs)

In [126]:
word_count_vector.shape

(141208, 33003)

In [131]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [132]:
# you only needs to do this once, this is a mapping of index to 
feature_names=cv.get_feature_names()

In [145]:
class Feed:
    name = ''
    url = None
    max_delay = None
    def __init__(self, name, url, max_delay):
        self.name = name
        self.url  = url
        self.max_delay = max_delay
        
    def __str__(self):
        return '%s: %s' % (self.name, self.url)

    def getHeadline(self):
        d = feedparser.parse (self.url)
        for post in d.entries:
            # time.sleep(0.125)
            ret = (datetime.datetime.now().time(), self.name, post.title, post.link)
            yield ret

#

feeds = (
    Feed('nyt-home', 'https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml', 1),
    Feed('nyt-sun', 'https://rss.nytimes.com/services/xml/rss/nyt/sunday-review.xml', 1),
    Feed('nyt-hlth', 'https://rss.nytimes.com/services/xml/rss/nyt/Health.xml', 1),
    Feed('nyt-wrld', 'https://www.nytimes.com/section/world/rss.xml', 1),
    Feed('nyt-bsns', 'http://feeds.nytimes.com/nyt/rss/Business', 1),
    Feed('nyt-tech', 'http://feeds.nytimes.com/nyt/rss/Technology', 1),
    Feed('nyt-sprt', 'https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml', 1),
    Feed('nyt-scnc', 'http://www.nytimes.com/services/xml/rss/nyt/Science.xml', 1),
    Feed('nyt-arts', 'https://rss.nytimes.com/services/xml/rss/nyt/Arts.xml', 1),
    Feed('nyt-trvl', 'https://rss.nytimes.com/services/xml/rss/nyt/Travel.xml', 1),
    Feed('nyt-usa',  'http://www.nytimes.com/services/xml/rss/nyt/US.xml', 1),
    Feed('bbc-hlth', 'http://feeds.bbci.co.uk/news/health/rss.xml', 1),
    Feed('bbc-brkn', 'https://bbcbreakingnews.com/feed', 1),
    Feed('bbc-bsns', 'http://feeds.bbci.co.uk/news/business/rss.xml', 1),
    Feed('bbc-pltc', 'http://feeds.bbci.co.uk/news/politics/rss.xml', 1),
    Feed('bbc-educ', 'http://feeds.bbci.co.uk/news/education/rss.xml', 1),
    Feed('bbc-scnc', 'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml', 1),
    Feed('bbc-tech', 'http://feeds.bbci.co.uk/news/technology/rss.xml', 1),
    Feed('bbc-arts', 'http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml', 1),
    Feed('aljz-ra', 'http://www.aljazeera.com/xml/rss/all.xml', 1),
    Feed('aljz-ra', 'http://www.aljazeera.com/xml/rss/all.xml', 1),
)

titles = []
for feed in feeds:
    for (tt, name, title, link) in feed.getHeadline():
        print(title)
        print(pre_process(title))
        test_doc = pre_process(title)
        doc = tolist(test_doc)
        #generate tf-idf for the given document
        tf_idf_vector = tfidf_transformer.transform(cv.transform(doc))
        
        #sort the tf-idf vectors by descending order of scores
        sorted_items=sort_coo(tf_idf_vector.tocoo())

        #extract only the top n; n here is 10
        keywords=extract_topn_from_vector(feature_names,sorted_items,5)

        # now print the results
        print("\n=====Doc=====")
        print(doc)
        print("\n===Keywords===")
        for k in keywords:
            print(k,keywords[k])
        
        # titles.append(title)
        # print (tt, name, title, link)
        break

Vaccines Are Coming, but Pandemic Experts Expect a 'Horrible' Winter
vaccines are coming but pandemic experts expect a horrible winter

=====Doc=====
['vaccines', 'are', 'coming', 'but', 'pandemic', 'experts', 'expect', 'a', 'horrible', 'winter']

===Keywords===
winter 1.0
vaccines 1.0
pandemic 1.0
horrible 1.0
experts 1.0
Pandemic-Proof Your Habits
pandemic proof your habits

=====Doc=====
['pandemic', 'proof', 'your', 'habits']

===Keywords===
proof 1.0
pandemic 1.0
habits 1.0
Vaccines Are Coming, but Pandemic Experts Expect a 'Horrible' Winter
vaccines are coming but pandemic experts expect a horrible winter

=====Doc=====
['vaccines', 'are', 'coming', 'but', 'pandemic', 'experts', 'expect', 'a', 'horrible', 'winter']

===Keywords===
winter 1.0
vaccines 1.0
pandemic 1.0
horrible 1.0
experts 1.0
‘Bleak Friday’ for Stores as Pandemic Pushes Holiday Shopping Online
 bleak friday for stores as pandemic pushes holiday shopping online

=====Doc=====
['bleak', 'friday', 'for', 'stores', 'a

In [None]:
docs_test=df_test['text'].tolist()

In [130]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(), columns=["idf_weights"]) 
 
# sort ascending 
# df_idf2 = df_idf[df_idf['idf_weights'] > 3.5]
df_idf2.sort_values(by=['idf_weights'])

AttributeError: 'TfidfTransformer' object has no attribute '_idf_diag'

In [118]:
len(cv.get_feature_names())

33283

In [69]:
# count matrix 
count_vector=cv.transform(docs) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [147]:
n = 0
feature_names = cv.get_feature_names() 
 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[n] 
print (docs[n])
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
df2 = df[df['tfidf'] > 0]
df2.sort_values(by=["tfidf"],ascending=False)

french daily coronavirus death toll beyond the mark


Unnamed: 0,tfidf
saudi,1.0


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer 
 
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)