In [22]:
import matplotlib.pyplot as plt
import gensim
import numpy as np
import spacy

from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import pyLDAvis.gensim
import wikipedia
import re

import warnings
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now
%matplotlib inline

In [10]:
# We'll first create a list of articles

In [25]:
taj=wikipedia.page('Taj Mahal').content
covid=wikipedia.page('COvid 19 pandemic').content
sports=wikipedia.page('Cricket').content
data=wikipedia.page('Data Science').content

In [26]:
text=[taj, covid, sports, data]

In [27]:
text

['The Taj Mahal (; lit.  \'Crown of the Palace\', [taːdʒ ˈmɛːɦ(ə)l]) is an ivory-white marble mausoleum on the southern bank of the river Yamuna  in the Indian city of Agra. It was commissioned in 1632 by the Mughal emperor Shah Jahan (reigned from 1628 to 1658) to house the tomb of his favourite wife, Mumtaz Mahal; it also houses the tomb of Shah Jahan himself. The tomb is the centrepiece of a 17-hectare (42-acre) complex, which includes a mosque and a guest house, and is set in formal gardens bounded on three sides by a crenellated wall.\nConstruction of the mausoleum was essentially completed in 1643, but work continued on other phases of the project for another 10 years. The Taj Mahal complex is believed to have been completed in its entirety in 1653 at a cost estimated at the time to be around 32 million rupees, which in 2020 would be approximately 70 billion rupees (about U.S. $916 million). The construction project employed some 20,000 artisans under the guidance of a board of a

In [28]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [32]:
def clean(sent):
    sent=decontracted(sent)
    sent=re.sub(r'https?://\S+', '', sent)
    sent=re.sub(r'[\w\.-]+@[\w\.-]+', '', sent)
    sent = re.sub("\S*\d\S*", "", sent)
    sent=re.sub(r'[^A-Za-z0-9]+', ' ', sent)
    sent=sent.lower()
    sent=sent.translate(str.maketrans('', '', '"#%&\'()*+,-./:;<=>@[\\]^_`{|}~'))
    sent = ' '.join(e for e in sent.split() if len(e)>1)

    return sent

In [71]:
cleaned_taj=clean(taj)

In [72]:
cleaned_taj

'the taj mahal lit crown of the palace ta is an ivory white marble mausoleum on the southern bank of the river yamuna in the indian city of agra it was commissioned in by the mughal emperor shah jahan reigned from to to house the tomb of his favourite wife mumtaz mahal it also houses the tomb of shah jahan himself the tomb is the centrepiece of complex which includes mosque and guest house and is set in formal gardens bounded on three sides by crenellated wall construction of the mausoleum was essentially completed in but work continued on other phases of the project for another years the taj mahal complex is believed to have been completed in its entirety in at cost estimated at the time to be around million rupees which in would be approximately billion rupees about million the construction project employed some artisans under the guidance of board of architects led by the court architect to the emperor ustad ahmad lahauri the taj mahal was designated as unesco world heritage site in

In [73]:
cleaned_covid=clean(covid)
cleaned_sports=clean(sports)
cleaned_data=clean(data)

In [74]:
# cleaned=cleaned_taj + cleaned_covid + cleaned_data + cleaned_sports

In [75]:
# cleaned

In [76]:
nlp = spacy.load("en")
doc_taj=nlp(cleaned_taj)

In [77]:
doc_taj

the taj mahal lit crown of the palace ta is an ivory white marble mausoleum on the southern bank of the river yamuna in the indian city of agra it was commissioned in by the mughal emperor shah jahan reigned from to to house the tomb of his favourite wife mumtaz mahal it also houses the tomb of shah jahan himself the tomb is the centrepiece of complex which includes mosque and guest house and is set in formal gardens bounded on three sides by crenellated wall construction of the mausoleum was essentially completed in but work continued on other phases of the project for another years the taj mahal complex is believed to have been completed in its entirety in at cost estimated at the time to be around million rupees which in would be approximately billion rupees about million the construction project employed some artisans under the guidance of board of architects led by the court architect to the emperor ustad ahmad lahauri the taj mahal was designated as unesco world heritage site in 

In [81]:
# nlp = spacy.load("en")
doc_covid=nlp(cleaned_covid)

nlp = spacy.load("en")
doc_sports=nlp(cleaned_sports)

nlp = spacy.load("en")
doc_data=nlp(cleaned_data)

In [82]:
article_taj = []
for w in doc_taj:
    # if it's not a stop word or punctuation mark, add it to our article
    if not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lematized version of the word
        article_taj.append(w.lemma_)
        
article_covid = []
for w in doc_covid:
    # if it's not a stop word or punctuation mark, add it to our article
    if not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lematized version of the word
        article_covid.append(w.lemma_)
        
        
article_data = []
for w in doc_data:
    # if it's not a stop word or punctuation mark, add it to our article
    if not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lematized version of the word
        article_data.append(w.lemma_)
        
        
article_sports = []
for w in doc_sports:
    # if it's not a stop word or punctuation mark, add it to our article
    if not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lematized version of the word
        article_sports.append(w.lemma_)

In [79]:
article_taj

['taj',
 'mahal',
 'light',
 'crown',
 'palace',
 'ta',
 'ivory',
 'white',
 'marble',
 'mausoleum',
 'southern',
 'bank',
 'river',
 'yamuna',
 'indian',
 'city',
 'agra',
 'commission',
 'mughal',
 'emperor',
 'shah',
 'jahan',
 'reign',
 'house',
 'tomb',
 'favourite',
 'wife',
 'mumtaz',
 'mahal',
 'house',
 'tomb',
 'shah',
 'jahan',
 'tomb',
 'centrepiece',
 'complex',
 'include',
 'mosque',
 'guest',
 'house',
 'set',
 'formal',
 'garden',
 'bound',
 'side',
 'crenellate',
 'wall',
 'construction',
 'mausoleum',
 'essentially',
 'complete',
 'work',
 'continue',
 'phase',
 'project',
 'year',
 'taj',
 'mahal',
 'complex',
 'believe',
 'complete',
 'entirety',
 'cost',
 'estimate',
 'time',
 'rupee',
 'approximately',
 'rupee',
 'construction',
 'project',
 'employ',
 'artisan',
 'guidance',
 'board',
 'architect',
 'lead',
 'court',
 'architect',
 'emperor',
 'ustad',
 'ahmad',
 'lahauri',
 'taj',
 'mahal',
 'designate',
 'unesco',
 'world',
 'heritage',
 'site',
 'jewel',
 'mus

In [83]:
article_covid

['pandemic',
 'know',
 'coronavirus',
 'pandemic',
 'ongoing',
 'pandemic',
 'coronavirus',
 'disease',
 'cause',
 'severe',
 'acute',
 'respiratory',
 'syndrome',
 'coronaviru',
 'disease',
 'identify',
 'december',
 'wuhan',
 'china',
 'world',
 'health',
 'organization',
 'declare',
 'outbreak',
 'public',
 'health',
 'emergency',
 'international',
 'concern',
 'january',
 'pandemic',
 'march',
 'september',
 'case',
 'report',
 'country',
 'territory',
 'result',
 'death',
 'people',
 'recover',
 'common',
 'symptom',
 'include',
 'fever',
 'cough',
 'fatigue',
 'shortness',
 'breath',
 'breathe',
 'difficulty',
 'loss',
 'smell',
 'complication',
 'include',
 'pneumonia',
 'acute',
 'respiratory',
 'distress',
 'syndrome',
 'incubation',
 'period',
 'typically',
 'day',
 'range',
 'day',
 'vaccine',
 'candidate',
 'development',
 'complete',
 'clinical',
 'trial',
 'prove',
 'safety',
 'efficacy',
 'know',
 'specific',
 'antiviral',
 'medication',
 'primary',
 'treatment',
 'curre

In [84]:
texts=[article_covid, article_data, article_sports, article_taj]

In [85]:
bigram = gensim.models.Phrases(texts)

In [86]:
texts = [bigram[line] for line in texts]

In [87]:
texts

[['pandemic',
  'know',
  'coronavirus',
  'pandemic',
  'ongoing',
  'pandemic',
  'coronavirus',
  'disease',
  'cause',
  'severe',
  'acute',
  'respiratory',
  'syndrome',
  'coronaviru',
  'disease',
  'identify',
  'december',
  'wuhan',
  'china',
  'world_health',
  'organization',
  'declare',
  'outbreak',
  'public_health',
  'emergency',
  'international',
  'concern',
  'january',
  'pandemic',
  'march',
  'september',
  'case',
  'report',
  'country',
  'territory',
  'result',
  'death',
  'people',
  'recover',
  'common',
  'symptom',
  'include',
  'fever',
  'cough',
  'fatigue',
  'shortness',
  'breath',
  'breathe',
  'difficulty',
  'loss',
  'smell',
  'complication',
  'include',
  'pneumonia',
  'acute',
  'respiratory',
  'distress',
  'syndrome',
  'incubation',
  'period',
  'typically',
  'day',
  'range',
  'day',
  'vaccine',
  'candidate',
  'development',
  'complete',
  'clinical',
  'trial',
  'prove',
  'safety',
  'efficacy',
  'know',
  'specif

In [88]:
texts[1][0:10]

['data_science',
 'inter',
 'disciplinary',
 'field',
 'use',
 'scientific',
 'method',
 'process',
 'algorithm',
 'system']

In [89]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [90]:
corpus

[[(0, 1),
  (1, 1),
  (2, 3),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 5),
  (10, 11),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 2),
  (15, 2),
  (16, 2),
  (17, 1),
  (18, 4),
  (19, 1),
  (20, 4),
  (21, 1),
  (22, 5),
  (23, 1),
  (24, 4),
  (25, 3),
  (26, 1),
  (27, 1),
  (28, 2),
  (29, 3),
  (30, 1),
  (31, 1),
  (32, 3),
  (33, 2),
  (34, 2),
  (35, 9),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 17),
  (40, 2),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 4),
  (47, 10),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 4),
  (54, 6),
  (55, 4),
  (56, 3),
  (57, 3),
  (58, 2),
  (59, 5),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 8),
  (65, 1),
  (66, 3),
  (67, 2),
  (68, 1),
  (69, 8),
  (70, 1),
  (71, 3),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 6),
  (77, 1),
  (78, 1),
  (79, 2),
  (80, 15),
  (81, 2),
  (82, 1),
  (83, 2),
  (84, 3),
  (85, 1),
  (86, 1),
  (87, 4),
  (88, 1),
  (89, 2),
  (90, 1),
  (91

In [97]:
ldamodel = LdaModel(corpus=corpus, num_topics=4, id2word=dictionary, passes=20)

In [98]:
topics=ldamodel.print_topics(num_words=10)
topics

[(0,
  '0.018*"datum" + 0.012*"data_science" + 0.010*"statistic" + 0.010*"data" + 0.010*"datum_science" + 0.007*"field" + 0.005*"analysis" + 0.005*"business" + 0.005*"science" + 0.005*"machine_learning"'),
 (1,
  '0.013*"taj_mahal" + 0.011*"tomb" + 0.008*"building" + 0.008*"garden" + 0.007*"shah_jahan" + 0.006*"design" + 0.006*"marble" + 0.006*"mughal" + 0.005*"complex" + 0.005*"dome"'),
 (2,
  '0.011*"cricket" + 0.008*"case" + 0.006*"country" + 0.006*"report" + 0.006*"batsman" + 0.005*"ball" + 0.005*"match" + 0.005*"people" + 0.005*"march" + 0.005*"outbreak"'),
 (3,
  '0.000*"cricket" + 0.000*"play" + 0.000*"case" + 0.000*"batsman" + 0.000*"match" + 0.000*"ball" + 0.000*"team" + 0.000*"include" + 0.000*"wicket" + 0.000*"bowler"')]

In [115]:
sent='pretty Great Structures were built in the times of Mughals. They were very beautiful'
cleaned_sent=clean(sent)
doc_sent=nlp(cleaned_sent)
article_sent = []
for w in doc_sent:
    # if it's not a stop word or punctuation mark, add it to our article
    if not w.is_stop and not w.is_punct and not w.like_num:
        # we add the lematized version of the word
        article_sent.append(w.lemma_)
text_sent=article_sent

In [116]:
# text_sent = [bigram[line] for line in text_sent]
text_sent=dictionary.doc2bow(text_sent)

In [117]:
text_sent

[(214, 1), (796, 1), (1911, 1), (3056, 1), (3559, 1)]

In [118]:
ldamodel.get_document_topics(text_sent)

[(0, 0.04189828), (1, 0.8679399), (2, 0.04843729), (3, 0.04172455)]