In [1]:
import nltk

In [2]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [5]:
# Toy corpus: five short passages about Rahul Dravid. Each string is handled as
# one document by the tokenizing / bag-of-words steps further down.
data = [
        "Rahul Sharad Dravid born 11 January 1973) is a former Indian cricketer and captain of the Indian national team, currently serving as its head coach. Prior to his appointment to the senior men's national team, Dravid was the Head of Cricket at the National Cricket Academy (NCA)",
        "Under his tutelage, the under-19 team finished runners up at the 2016 U-19 Cricket World Cup and won the 2018 U-19 Cricket World Cup. Known for his sound batting technique",
        "Born in a Marathi family and raised in Bangalore, he started playing cricket at the age of 12 and later represented Karnataka at the under-15, under-17 and under-19 levels",
        "Hailed as The Wall, Dravid was named one of the best five cricketers of the year by Wisden Cricketers' Almanack in 2000 and received the Player of the Year and the Test Player of the Year awards at the inaugural ICC awards ceremony in 2004",
        "As of December 2016, Dravid is the fourth-highest run scorer in Test cricket, after Sachin Tendulkar, Ricky Ponting and Jacques Kallis"
]

In [6]:
from nltk.corpus import stopwords

In [7]:
import spacy

In [8]:
# English stop-word list from NLTK; remove_stop_words() filters tokens against it.
stop_words = stopwords.words('english')

In [9]:
import re

In [10]:
# Strip apostrophes (e.g. "men's" -> "mens") from every document before tokenizing.
data = [sent.replace("'", "") for sent in data]

In [11]:
def sent_to_words(texts):
  """Lazily tokenize each item of `texts`.

  Every item is converted with str() and passed to
  gensim.utils.simple_preprocess with deacc=True; one token list is
  yielded per input item, in order.
  """
  tokenize = gensim.utils.simple_preprocess
  for raw_sentence in texts:
    tokens = tokenize(str(raw_sentence), deacc=True)
    yield tokens

In [12]:
# Materialize the generator so the tokenized documents can be indexed and reused.
data_words = list(sent_to_words(data))

In [14]:
# Preview the token list of the first document.
data_words[:1]

[['rahul',
  'sharad',
  'dravid',
  'born',
  'january',
  'is',
  'former',
  'indian',
  'cricketer',
  'and',
  'captain',
  'of',
  'the',
  'indian',
  'national',
  'team',
  'currently',
  'serving',
  'as',
  'its',
  'head',
  'coach',
  'prior',
  'to',
  'his',
  'appointment',
  'to',
  'the',
  'senior',
  'mens',
  'national',
  'team',
  'dravid',
  'was',
  'the',
  'head',
  'of',
  'cricket',
  'at',
  'the',
  'national',
  'cricket',
  'academy',
  'nca']]

In [15]:
# Phrase detectors: `bigram` is trained on the tokenized corpus, `trigram` on
# the bigram-transformed corpus. min_count / threshold are gensim Phrases
# settings that control how readily adjacent tokens are merged.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)



In [16]:
# Wrap each trained Phrases model in a Phraser, which is then applied to token
# lists by indexing (e.g. trigram_mod[tokens]).
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [23]:
def remove_stop_words(texts):
  """Return one token list per document with stop words filtered out.

  Each document is stringified with str(), re-tokenized by
  simple_preprocess, and every token found in the notebook-level
  `stop_words` list is dropped.
  """
  filtered_docs = []
  for doc in texts:
    kept = []
    for word in simple_preprocess(str(doc)):
      if word in stop_words:
        continue
      kept.append(word)
    filtered_docs.append(kept)
  return filtered_docs

In [18]:
def make_bigrams(texts):
  """Apply the bigram phrase model to every tokenized document.

  Uses the frozen `bigram_mod` Phraser built from `bigram`, so the helper
  agrees with the rest of the notebook, which creates the *_mod objects
  for applying phrase models.

  texts: iterable of token lists.
  Returns a list with one transformed token list per document.
  """
  return [bigram_mod[doc] for doc in texts]

In [19]:
def make_trigrams(texts):
  """Apply the bigram and then the trigram phrase model to every document.

  Uses the frozen `bigram_mod` / `trigram_mod` Phraser objects rather than
  the trainable Phrases models, matching how the notebook applies
  `trigram_mod` elsewhere.

  texts: iterable of token lists.
  Returns a list with one transformed token list per document.
  """
  return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [20]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
  """Lemmatize tokenized documents, keeping only selected parts of speech.

  texts: iterable of token lists; each list is joined with spaces and run
    through the notebook-level spaCy pipeline `nlp`.
  allowed_postags: collection of `token.pos_` values to keep. The default is
    an immutable tuple so the shared default cannot be mutated between calls.

  Returns a list with one list of `token.lemma_` strings per document.
  """
  texts_out = []
  for sentence in texts:
    doc = nlp(" ".join(sentence))
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
  return texts_out

In [21]:
# Show the first document after the bigram and trigram phrasers are applied.
# Both steps use the frozen *_mod phrasers instead of mixing in the trainable `bigram`.
print(trigram_mod[bigram_mod[data_words[0]]])

['rahul', 'sharad', 'dravid', 'born', 'january', 'is', 'former', 'indian', 'cricketer', 'and', 'captain', 'of', 'the', 'indian', 'national', 'team', 'currently', 'serving', 'as', 'its', 'head', 'coach', 'prior', 'to', 'his', 'appointment', 'to', 'the', 'senior', 'mens', 'national', 'team', 'dravid', 'was', 'the', 'head', 'of', 'cricket', 'at', 'the', 'national', 'cricket', 'academy', 'nca']




In [24]:
# Drop stop words from every tokenized document.
data_words_no_stop_words = remove_stop_words(data_words)

In [25]:
# Run the stop-word-free documents through the bigram phrase model.
data_words_bigrams = make_bigrams(data_words_no_stop_words)



In [26]:
# Load the small English pipeline by package name: the 'en' shortcut link was
# removed in spaCy v3, so spacy.load('en') fails on current releases.
# Requires the en_core_web_sm model package to be installed.
# The parser and NER components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [27]:
# Keep only the lemmas of tokens tagged NOUN or ADJ in each document.
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ'])

In [28]:
# Build the gensim Dictionary (vocabulary) from the lemmatized documents.
id2word = corpora.Dictionary(data_lemmatized)

In [29]:
# Alias for the lemmatized documents; the cells shown here keep using
# data_lemmatized directly.
texts = data_lemmatized

In [30]:
# Convert every lemmatized document to its bag-of-words form via the dictionary.
corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

In [31]:
# Fit the LDA topic model. LDA training is stochastic, so random_state is
# pinned to make the printed topics, perplexity and coherence reproducible
# across kernel restarts.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [32]:
# Display the weighted keywords of each fitted topic.
lda_model.print_topics()

[(0,
  '0.134*"run" + 0.134*"high" + 0.134*"fourth" + 0.134*"scorer" + 0.134*"test" + 0.012*"team" + 0.012*"year" + 0.012*"sound" + 0.012*"award" + 0.012*"marathi"'),
 (1,
  '0.031*"team" + 0.031*"test" + 0.031*"tutelage" + 0.031*"family" + 0.031*"level" + 0.031*"award" + 0.031*"year" + 0.031*"run" + 0.031*"high" + 0.031*"cricket"'),
 (2,
  '0.031*"team" + 0.031*"level" + 0.031*"test" + 0.031*"marathi" + 0.031*"age" + 0.031*"cricket" + 0.031*"family" + 0.031*"tutelage" + 0.031*"high" + 0.031*"fourth"'),
 (3,
  '0.031*"team" + 0.031*"test" + 0.031*"runner" + 0.031*"cricket" + 0.031*"marathi" + 0.031*"level" + 0.031*"age" + 0.031*"run" + 0.031*"fourth" + 0.031*"award"'),
 (4,
  '0.134*"age" + 0.134*"cricket" + 0.134*"marathi" + 0.134*"level" + 0.134*"family" + 0.012*"team" + 0.012*"test" + 0.012*"cricketer" + 0.012*"runner" + 0.012*"scorer"'),
 (5,
  '0.159*"team" + 0.083*"coach" + 0.083*"appointment" + 0.083*"former" + 0.083*"head" + 0.083*"indian" + 0.083*"men" + 0.083*"national" + 0.0

In [34]:
# Log-perplexity of the model, evaluated on the same corpus it was trained on.
lda_model.log_perplexity(corpus)

-4.210741095989943

In [35]:
# Set up c_v coherence scoring for the fitted LDA model.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)

In [36]:
# Compute and display the coherence score.
coherence_model.get_coherence()

0.4847760273631875

In [41]:
# NOTE(review): presumably the location of a MALLET binary; no other cell shown
# here references this variable — confirm whether it is still needed.
mallet_path = "mallet"

In [44]:
def compute_coherence(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
  """Train one LDA model per topic count and score each with c_v coherence.

  dictionary: id-to-word mapping given to both LdaModel and CoherenceModel.
  corpus: bag-of-words corpus the LDA models are trained on.
  texts: tokenized documents given to CoherenceModel.
  limit: exclusive upper bound on the number of topics.
  start: first number of topics tried.
  step: increment between successive topic counts.
  random_state: forwarded to LdaModel; pass an int for repeatable sweeps.

  Returns (model_list, coherence_values): two parallel lists with one entry
  for each value in range(start, limit, step).
  """
  model_list = []
  coherence_values = []
  for num_topics in range(start, limit, step):
    # Use the `dictionary` argument rather than the notebook-level `id2word`,
    # so the models are built on the vocabulary the caller actually passed.
    model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        random_state=random_state,
    )
    model_list.append(model)
    scorer = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_values.append(scorer.get_coherence())

  return model_list, coherence_values

In [46]:
# Sweep the number of topics over 2, 8, 14, ..., 38 and collect the coherence scores.
model_list, coherence_values = compute_coherence(
    dictionary=id2word,
    corpus=corpus,
    texts=data_lemmatized,
    limit=40,
    start=2,
    step=6,
)

In [47]:
# Coherence score for each topic count tried in the sweep.
coherence_values

[0.40925728895845725,
 0.48129684642084186,
 0.49780220405308945,
 0.4194268993408129,
 0.4191061264721526,
 0.4198493576617343,
 0.42035788426513243]