<a href="https://colab.research.google.com/github/rohanath123/Latent-Dirichlet-Allocation/blob/master/Wiki_LDA_Rocky.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import bs4 as bs

In [0]:
import urllib.request

In [0]:
from pprint import pprint

In [0]:
import re

In [0]:
source = urllib.request.urlopen('https://en.wikipedia.org/wiki/ASAP_Rocky').read()

In [0]:
soup = bs.BeautifulSoup(source, 'html5lib')

In [0]:
data = []
for paragraph in soup.find_all('p'):
  data.append(paragraph.text)

In [0]:
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]

In [0]:
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

In [0]:
def sent_to_words(sentences):
  for sent in sentences:
    yield(gensim.utils.simple_preprocess(str(sent), deacc = True))

In [0]:
data_words = list(sent_to_words(data))

In [0]:
import spacy
import nltk
from nltk.corpus import stopwords

In [16]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
def remove_stopwords(texts):
  x = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
  return(x)

In [0]:
data_words_nonstop = remove_stopwords(data_words)

In [0]:
def lemma(texts, allowed_postags = ['ADV', 'VERB', 'NOUN', 'ADJ']):
  texts_out = []
  for sent in texts:
    doc = nlp(" ".join(sent))
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
  return texts_out

In [20]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])

In [0]:
bigrams = gensim.models.Phrases(data_words, min_count = 5, threshold = 100)

In [0]:
bigram_mod = gensim.models.phrases.Phraser(bigrams)

In [0]:
def make_bigrams(texts):
  return([bigram_mod[doc] for doc in texts])

In [0]:
data_words_bigrams = make_bigrams(data_words_nonstop)

In [0]:
data_lem = lemma(data_words_bigrams)

In [0]:
id2word = corpora.Dictionary(data_lem)

In [0]:
texts = data_lem

In [0]:
corpus = [id2word.doc2bow(text) for text in texts]

In [0]:
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, num_topics = 10, id2word = id2word, alpha = 'auto', per_word_topics=True)

In [0]:
from gensim.models import CoherenceModel

In [0]:
c = CoherenceModel(model = lda_model, corpus = corpus, dictionary= id2word, texts = data_lem)

In [40]:
c.get_coherence()

0.3580659045104916

In [42]:
scores = []
for i in range(1, 20):
  print(i)
  lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, num_topics = 2*i, id2word = id2word, alpha = 'auto', per_word_topics = True)
  c = CoherenceModel(model = lda_model, corpus = corpus, dictionary= id2word, texts = data_lem)  
  scores.append(c.get_coherence())

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [45]:
(len(scores))

19

In [0]:
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, num_topics = 15, id2word = id2word, alpha = 'symmetric', per_word_topics = True) 

In [58]:
c = CoherenceModel(model = lda_model, corpus = corpus, dictionary= id2word, texts = data_lem)  
c.get_coherence()

0.4499497290996848

In [59]:
pprint(lda_model.print_topics())

[(0,
  '0.027*"mayer" + 0.019*"rocky" + 0.013*"video" + 0.013*"music" + '
  '0.012*"record" + 0.010*"release" + 0.008*"october" + 0.007*"single" + '
  '0.007*"premiere" + 0.007*"hold"'),
 (1,
  '0.018*"mayer" + 0.010*"rocky" + 0.008*"video" + 0.008*"release" + '
  '0.008*"album" + 0.008*"instrumental" + 0.007*"sweden" + 0.007*"jafari" + '
  '0.007*"beauty" + 0.007*"swedish"'),
 (2,
  '0.037*"rocky" + 0.016*"release" + 0.012*"june" + 0.012*"single" + '
  '0.012*"yam" + 0.012*"rapper" + 0.011*"music" + 0.009*"mob" + 0.009*"late" + '
  '0.009*"deal"'),
 (3,
  '0.018*"mayer" + 0.016*"release" + 0.014*"sweden" + 0.012*"swedish" + '
  '0.012*"rocky" + 0.009*"record" + 0.008*"deal" + 0.007*"sign" + '
  '0.007*"october" + 0.006*"ground"'),
 (4,
  '0.002*"mayer" + 0.002*"video" + 0.002*"jafari" + 0.001*"bodyguard" + '
  '0.001*"see" + 0.001*"cut" + 0.001*"attack" + 0.001*"footage" + 0.001*"get" '
  '+ 0.001*"go"'),
 (5,
  '0.049*"album" + 0.023*"rocky" + 0.022*"release" + 0.016*"long" + '
  '0.