In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk import sent_tokenize

In [3]:
text = open('news.txt','r').read()
sentences = sent_tokenize(text)
total_documents = len(sentences)

In [4]:
vectorizer = TfidfVectorizer(stop_words="english")
transformed = vectorizer.fit_transform(sentences)

In [5]:
print(vectorizer.vocabulary_)

{'british': 20, 'airways': 10, 'budget': 21, 'rival': 80, 'ryanair': 81, 'cancelled': 23, 'hundreds': 54, 'flights': 47, 'demand': 35, 'travel': 91, 'drops': 38, 'amid': 11, 'fears': 43, 'spread': 88, 'coronavirus': 28, 'ba': 14, 'cancelling': 24, '216': 3, '16': 0, '28': 5, 'march': 63, 'london': 61, 'destinations': 36, 'including': 55, 'new': 67, 'york': 96, 'italy': 57, 'france': 51, 'austria': 13, 'belgium': 15, 'germany': 52, 'ireland': 56, 'cut': 31, '25': 4, '17': 1, 'april': 12, 'tourists': 90, 'business': 22, 'people': 73, 'cutting': 32, 'foreign': 49, 'significant': 87, 'expansion': 42, 'number': 69, 'cases': 26, 'uk': 92, 'prime': 74, 'minister': 66, 'boris': 18, 'johnson': 58, 'warned': 93, 'boss': 19, 'michael': 64, 'leary': 60, 'said': 82, 'focus': 48, 'time': 89, 'minimising': 65, 'risk': 79, 'passengers': 72, 'heavily': 53, 'booked': 16, 'weeks': 94, 'notable': 68, 'drop': 37, 'forward': 50, 'bookings': 17, 'end': 40, 'early': 39, 'makes': 62, 'sense': 86, 'selectively'

In [6]:
list(vectorizer.vocabulary_.keys())[:5]

['british', 'airways', 'budget', 'rival', 'ryanair']

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
 
tfidf_transformer=TfidfTransformer(use_idf=True)
tfidf_transformer.fit(transformed)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [8]:
# you only needs to do this once, this is a mapping of index to 
feature_names=vectorizer.get_feature_names()
feature_names[10:20]

['airways',
 'amid',
 'april',
 'austria',
 'ba',
 'belgium',
 'booked',
 'bookings',
 'boris',
 'boss']

In [9]:
def sort(matrix):
    tuples = zip(matrix.col, matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [13]:
sent=sentences[0]
 
#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(vectorizer.transform([sent]))
 
#sort the tf-idf vectors by descending order of scores
sorted_items = sort(tf_idf_vector.tocoo())
 
#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)
 
# now print the results
print("sentence")
print(sent)
print("Keywords")
for k in keywords:
    print(k,keywords[k])

sentence
British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.
Keywords
spread 0.304
rival 0.304
hundreds 0.304
fears 0.304
drops 0.304
demand 0.304
budget 0.304
amid 0.304
coronavirus 0.222
cancelled 0.222
