# importing Data

In [2]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import gensim
from gensim.utils import simple_preprocess
from nltk import PorterStemmer
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import re
from pprint import pprint
from gensim import corpora, models
import nltk
import numpy as np
stemmer=SnowballStemmer("english")
np.random.seed(2018)
data=pd.read_csv("abcnews-date-text.csv")
data.info()
data.head(5)




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226258 entries, 0 to 1226257
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1226258 non-null  int64 
 1   headline_text  1226258 non-null  object
dtypes: int64(1), object(1)
memory usage: 18.7+ MB


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# pre processing functions


In [3]:

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) >= 3:
            result.append(lemmatize_stemming(token))
    return result



In [4]:
ready_data=data["headline_text"].map(preprocess)
ready_data

0                 [aba, decid, communiti, broadcast, licenc]
1                                    [act, wit, awar, defam]
2                     [call, infrastructur, protect, summit]
3                      [air, staff, aust, strike, pay, rise]
4                  [air, strike, affect, australian, travel]
                                 ...                        
1226253                     [abc, reader, learn, look, year]
1226254                     [south, african, variant, covid]
1226255    [victoria, coronavirus, restrict, mean, new, y...
1226256          [what, life, like, american, doctor, covid]
1226257    [women, shed, canberra, reskil, unemploy, pandem]
Name: headline_text, Length: 1226258, dtype: object

# dictionary

In [5]:
dictionary = gensim.corpora.Dictionary(ready_data)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [6]:
bow_corpus = [dictionary.doc2bow(doc) for doc in ready_data]


# lda model training without tfi-df algorithim

In [14]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [15]:
for idx, topic in lda_model.print_topics(num_words=4):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.041*"australia" + 0.020*"day" + 0.019*"win" + 0.018*"world"
Topic: 1 
Words: 0.042*"trump" + 0.025*"china" + 0.023*"donald" + 0.022*"south"
Topic: 2 
Words: 0.023*"tasmania" + 0.016*"minist" + 0.015*"say" + 0.014*"drum"
Topic: 3 
Words: 0.033*"case" + 0.024*"court" + 0.018*"face" + 0.017*"afl"
Topic: 4 
Words: 0.079*"coronavirus" + 0.035*"covid" + 0.029*"queensland" + 0.028*"nsw"
Topic: 5 
Words: 0.037*"say" + 0.034*"elect" + 0.019*"adelaid" + 0.013*"abc"
Topic: 6 
Words: 0.045*"polic" + 0.036*"man" + 0.030*"sydney" + 0.024*"death"
Topic: 7 
Words: 0.018*"school" + 0.014*"plan" + 0.013*"new" + 0.012*"fund"
Topic: 8 
Words: 0.021*"market" + 0.015*"rise" + 0.014*"chang" + 0.014*"record"
Topic: 9 
Words: 0.041*"govern" + 0.021*"restrict" + 0.015*"protest" + 0.015*"power"


# lda model training with tfi-df

In [16]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    pprint(doc)
    break


[(0, 0.5961221629567331),
 (1, 0.4691066873890528),
 (2, 0.31151361288073415),
 (3, 0.4021230894767581),
 (4, 0.4072266845116059)]


In [17]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(num_words=4):
    print('Topic: {} Word: {}'.format(idx, topic))
    

Topic: 0 Word: 0.028*"trump" + 0.016*"donald" + 0.009*"world" + 0.009*"morrison"
Topic: 1 Word: 0.016*"countri" + 0.012*"bushfir" + 0.011*"coast" + 0.011*"hour"
Topic: 2 Word: 0.023*"news" + 0.013*"rural" + 0.012*"abc" + 0.009*"friday"
Topic: 3 Word: 0.012*"elect" + 0.007*"royal" + 0.007*"stori" + 0.007*"commiss"
Topic: 4 Word: 0.012*"live" + 0.011*"coronavirus" + 0.008*"australia" + 0.007*"updat"
Topic: 5 Word: 0.025*"coronavirus" + 0.017*"covid" + 0.012*"govern" + 0.006*"new"
Topic: 6 Word: 0.019*"man" + 0.017*"polic" + 0.016*"charg" + 0.013*"murder"
Topic: 7 Word: 0.015*"drum" + 0.009*"tuesday" + 0.008*"turnbul" + 0.007*"korea"
Topic: 8 Word: 0.012*"restrict" + 0.010*"scott" + 0.010*"west" + 0.008*"coronavirus"
Topic: 9 Word: 0.016*"interview" + 0.012*"sentenc" + 0.010*"christma" + 0.008*"sexual"


# testing lda model

In [18]:
testing = ['The Indian women’s hockey team have scripted history! They have booked their spot in the semifinals of the Olympics for the very first time after beating Australia 1-0 in the quarterfinals. Rani Rampal and her team will face Argentina in the semis who beat Germany 3-0 in the previous quarterfinals match.',
          'Christianity began in the 1st century AD after Jesus died and was claimed to be resurrected. Starting as a small group of Jewish people in Judea, it spread quickly throughout the Roman Empire. Despite early persecution of Christians, it later became the state religion. In the Middle Ages it spread into Northern Europe and Russia. During the Age of Exploration, Christianity expanded throughout the world; it is currently the largest religion of the world.',
          'Oil prices fell on Monday as worries over Chinas economy resurfaced after a survey showing growth in factory activity slipped sharply in the worlds second-largest oil consumer, with concerns compounded by higher crude output from OPEC producers. Brent crude oil futures slid by 79 cents, or 1.06%, to $74.62 a barrel by 0945 GMT, having earlier touched a low of $74.10. U.S. West Texas Intermediate (WTI) crude futures dropped 88 cents, or 1.2%, to $73.07 after slipping to a session low of $72.77. China has been leading economic recovery in Asia and if the pullback deepens, concerns will grow that the global outlook will see a significant decline, said Edward Moya, senior analyst at OANDA. Chinas factory activity growth slipped sharply in July as demand contracted for the first time in more than a year, a survey showed on Monday.',
          'President Biden on Monday praised King Abdullah II of Jordan as a loyal and decent friend as the two leaders met at the White House, a critical visit for a Mideast leader who found himself side lined under former President Donald J. Trump. King Abdullah is the first Arab head of state to visit the White House since Mr. Biden took office, a sign that the United States wants to elevate Jordan once again to its traditional role as a regional peacemaker. We’ve been hanging out together for a long time, Mr. Biden said Monday, as the two sat in the Oval Office ahead of their bilateral meeting. It’s good to have him back in the White House.” Although Mr. Biden’s foreign policy priorities are heavily focused on China and Russia, the Middle East is a region that the new administration quickly learned it cannot afford to ignore',
          'Windows 11 has got its first beta for members of the Windows Insider Program. The new OS from Microsoft was announced last month after which the developer preview was released. Microsoft says this beta build is part of the beta cannel and it brings a host of features and improvements to the OS. While it is more stable than the previous developer preview, it still has several bugs so keep that in mind if you plan on using it on your primary PC.',
          'Gross domestic product grew at a 6.5% annual rate in the second quarter and the economy exceeded its pre-pandemic size, but slower growth is expected in coming months.',
          'Different religions god may or may not contain various elements ranging from the divine,[4] sacred things,[5] faith,[6] a supernatural being or supernatural beings[7] or "some sort of ultimacy and transcendence that will provide norms and power for the rest of life".',
          'A variety of methods are deployed in politics, which include promoting ones own political views among people, negotiation with other political subjects, making laws, and exercising force, including warfare against adversaries',
          'Mutaz Essa Barshim from Qatar and Gianmarco Tamberi from Italy were the last men standing in the final of the mens high jump event on Sunday. Both had successfully cleared the 2.37 metres mark and both also couldnt clear 2.39 metres, using up all three attempts. Which served up a conundrum. Who wins? Officials offered Barshim and Tamberi two options. They could take part in jump-off, to decide a winner, or they could share the gold medal.',
          'The latest Android 12 beta is here. This is an exclusive first look at Googles all-new design with big buttons, new widgets, and more — plus all the new features including improved privacy options and an Android TV remote. Dieter Bohn walks through everything you need to know about Android 12 from Google I/O 2021.']

In [22]:
bow_vector = dictionary.doc2bow(preprocess(testing[1]))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.20355068147182465	 Topic: 0.079*"coronavirus" + 0.035*"covid" + 0.029*"queensland" + 0.028*"nsw" + 0.023*"victoria"
Score: 0.1697162240743637	 Topic: 0.041*"govern" + 0.021*"restrict" + 0.015*"protest" + 0.015*"power" + 0.014*"andrew"
Score: 0.16728363931179047	 Topic: 0.021*"market" + 0.015*"rise" + 0.014*"chang" + 0.014*"record" + 0.014*"australian"
Score: 0.1416424810886383	 Topic: 0.041*"australia" + 0.020*"day" + 0.019*"win" + 0.018*"world" + 0.018*"test"
Score: 0.09540995955467224	 Topic: 0.033*"case" + 0.024*"court" + 0.018*"face" + 0.017*"afl" + 0.016*"trial"
Score: 0.08948663622140884	 Topic: 0.042*"trump" + 0.025*"china" + 0.023*"donald" + 0.022*"south" + 0.019*"australia"
Score: 0.0662970095872879	 Topic: 0.037*"say" + 0.034*"elect" + 0.019*"adelaid" + 0.013*"abc" + 0.012*"tasmanian"
Score: 0.03331269323825836	 Topic: 0.023*"tasmania" + 0.016*"minist" + 0.015*"say" + 0.014*"drum" + 0.013*"amid"
Score: 0.030594298616051674	 Topic: 0.045*"polic" + 0.036*"man" + 0.030*

# testing lda model with tfi-df

In [23]:
bow_vector = dictionary.doc2bow(preprocess(testing[0]))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.6698287725448608	 Topic: 0.028*"trump" + 0.016*"donald" + 0.009*"world" + 0.009*"morrison" + 0.009*"cup"
Score: 0.12359201163053513	 Topic: 0.012*"live" + 0.011*"coronavirus" + 0.008*"australia" + 0.007*"updat" + 0.006*"day"
Score: 0.11957252770662308	 Topic: 0.015*"drum" + 0.009*"tuesday" + 0.008*"turnbul" + 0.007*"korea" + 0.007*"south"
Score: 0.058399710804224014	 Topic: 0.025*"coronavirus" + 0.017*"covid" + 0.012*"govern" + 0.006*"new" + 0.006*"nsw"
