In [1]:
import spacy
#spacy.load('en')
from spacy.lang.en import English
# Shared spaCy English pipeline object used by tokenize() below; it is built
# directly from the language class instead of going through spacy.load.
parser = English()
def tokenize(text):
    """Split ``text`` into normalised tokens suitable for LDA.

    Whitespace-only tokens are discarded, URL-like tokens are replaced by
    the placeholder ``'URL'``, tokens beginning with ``@`` are replaced by
    ``'SCREEN_NAME'``, and every other token is lower-cased.

    Returns:
        A list of strings, in the order the tokens appear in ``text``.
    """
    def _normalise(tok):
        # The URL test takes precedence over the @-handle test.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [2]:
import nltk
# Make sure the WordNet corpus is available locally before it is imported
# and queried below; nltk reports "already up-to-date" when it is present.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word`` as given by ``wn.morphy``.

    When ``morphy`` returns ``None`` the input ``word`` is returned
    unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` using a newly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Joshf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Download the stop-word corpus first; the lookup on the next line reads it.
nltk.download('stopwords')
# English stop words held in a set so the membership test in
# prepare_text_for_lda is a hash lookup rather than a list scan.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Joshf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text):
    """Convert raw ``text`` into the list of lemmas used for topic modelling.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``en_stop`` are dropped; each remaining token
    is mapped through ``get_lemma``.

    Returns:
        A list of strings in their original order.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        # Both filters look at the raw token, before it is lemmatized.
        if len(tok) > 4 and tok not in en_stop
    ]

In [5]:
import random
# Build the training documents from a random sample of the lines in
# dataset.csv. A dedicated, seeded generator makes the sample (and so the
# dictionary and corpus built from it) the same on every Restart & Run All,
# instead of changing with each execution.
SAMPLE_SEED = 42
SAMPLE_THRESHOLD = .99  # a line is kept when the draw exceeds this (~1% of lines)
sample_rng = random.Random(SAMPLE_SEED)

text_data = []
with open('dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        # Printing and appending share the sampling branch, so text_data
        # holds exactly the sampled documents echoed in the cell output.
        # NOTE(review): confirm that training on a ~1% sample is intended.
        if sample_rng.random() > SAMPLE_THRESHOLD:
            print(tokens)
            text_data.append(tokens)

['privacy', 'nudge', 'social', 'medium', 'exploratory', 'facebook', 'study']
['characterize', 'geospatial', 'dynamics', 'application', 'usage', 'cellular', 'network']
['empirical', 'comparison', 'database', 'concurrency', 'scheme']
['unify', 'energy', 'efficient', 'route', 'multi', 'wireless', 'network']
['reduction', 'resonant', 'clock', 'distribution', 'network']
['broadband', 'beamfoming', 'using', 'nest', 'planar', 'array', 'frustum', 'filter']
['efficient', 'skyline', 'query', 'variable', 'preference', 'nominal', 'attribute']
['search', 'event', 'blogosphere']
['enloc', 'energy', 'efficient', 'localization', 'mobile', 'phone']
['impact', 'power', 'control', 'performance', 'wireless', 'network']
['speed', 'random', 'number', 'generator', 'base', 'phase', 'noise', 'oscillator']
['universal', 'embed', 'compression', 'engine', 'system', 'expansion', 'progressive', 'wavelet', 'coding']
['experimental', 'result', 'wideband', 'spectrum', 'sensing', 'using', 'random', 'sampling']
['improv

In [6]:
from gensim import corpora
import pickle

# Map every distinct token to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts so later cells can reload them from disk.
# The context manager closes corpus.pkl as soon as the dump finishes instead
# of leaving the handle open until garbage collection.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')



In [7]:
import gensim
NUM_TOPICS = 5
# Fit a 5-topic LDA model. random_state pins the model's random number
# generator so that, for the same corpus, the fitted topics are repeatable
# across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')

# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.047*"efficient" + 0.026*"query" + 0.026*"energy" + 0.026*"noise"')
(1, '0.034*"database" + 0.034*"design" + 0.019*"wavelet" + 0.019*"compression"')
(2, '0.039*"scalable" + 0.022*"modulation" + 0.022*"experimental" + 0.022*"spectrum"')
(3, '0.042*"using" + 0.023*"algorithm" + 0.023*"network" + 0.023*"filter"')
(4, '0.049*"system" + 0.033*"network" + 0.033*"wireless" + 0.033*"multi"')


In [8]:
# Run an unseen title through the same preprocessing as the training data,
# encode it with the training dictionary, and print both the bag-of-words
# vector and the topic distribution the current model assigns to it.
new_doc = prepare_text_for_lda(
    'Practical Bayesian Optimization of Machine Learning Algorithms'
)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(84, 1)]
[(0, 0.10002575), (1, 0.103318416), (2, 0.100021556), (3, 0.596616), (4, 0.100018255)]


In [9]:
# Fit a 3-topic LDA model on the same corpus. random_state pins the model's
# random number generator so that, for the same corpus, the fitted topics
# are repeatable across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')

# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.046*"efficient" + 0.036*"using" + 0.025*"wideband" + 0.025*"localization"')
(1, '0.032*"network" + 0.031*"wireless" + 0.023*"system" + 0.023*"scalable"')
(2, '0.015*"multi" + 0.015*"application" + 0.015*"modulation" + 0.015*"noise"')


In [10]:
# Fit a 10-topic LDA model on the same corpus. random_state pins the model's
# random number generator so that, for the same corpus, the fitted topics
# are repeatable across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')

# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.033*"noise" + 0.033*"base" + 0.033*"empirical" + 0.033*"comparison"')
(1, '0.045*"system" + 0.045*"application" + 0.023*"coding" + 0.023*"modal"')
(2, '0.061*"database" + 0.061*"design" + 0.061*"toolkit" + 0.061*"facekit"')
(3, '0.081*"scalable" + 0.042*"modulation" + 0.042*"enloc" + 0.042*"mobile"')
(4, '0.008*"network" + 0.008*"database" + 0.008*"algorithm" + 0.008*"efficient"')
(5, '0.055*"network" + 0.055*"efficient" + 0.055*"wireless" + 0.055*"multi"')
(6, '0.030*"control" + 0.030*"power" + 0.030*"exploratory" + 0.030*"impact"')
(7, '0.034*"using" + 0.034*"system" + 0.034*"query" + 0.034*"filter"')
(8, '0.042*"using" + 0.042*"algorithm" + 0.042*"random" + 0.042*"result"')
(9, '0.032*"reduction" + 0.032*"distribution" + 0.032*"clock" + 0.032*"resonant"')


In [11]:
# Reload the dictionary, corpus and 5-topic model from the files written by
# the earlier cells.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# pickle.load can execute arbitrary code from the file, so only load a
# corpus.pkl that this notebook produced. The context manager closes the
# handle as soon as the load finishes.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Last expression of the cell, so the notebook renders its return value.
pyLDAvis.display(lda_display)

  """
  """
  """
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [12]:
# Visualise the 3-topic model saved as model3.gensim, reusing the corpus and
# dictionary that are already loaded.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
# NOTE(review): sort_topics=False presumably keeps the topic numbering aligned
# with the model's own topic ids — confirm against the pyLDAvis docs.
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
# Last expression of the cell, so the notebook renders its return value.
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [13]:
# Visualise the 10-topic model saved as model10.gensim, reusing the corpus
# and dictionary that are already loaded.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
# NOTE(review): sort_topics=False presumably keeps the topic numbering aligned
# with the model's own topic ids — confirm against the pyLDAvis docs.
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
# Last expression of the cell, so the notebook renders its return value.
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
