In [1]:
import spacy
from spacy.lang.en import English

# Only tokenization is needed here, so a blank English pipeline is sufficient.
# The earlier `spacy.load('en')` call was dropped: its return value was
# discarded (so the loaded model was never used), and the 'en' shortcut name
# is not accepted by newer spaCy releases.
parser = English()
def tokenize(text):
    """Split ``text`` into normalized tokens using the module-level ``parser``.

    Whitespace-only tokens are dropped, URL-like tokens become the
    placeholder ``'URL'``, tokens starting with ``'@'`` become
    ``'SCREEN_NAME'``, and every other token is returned lower-cased.

    Returns:
        list of str: the normalized tokens, in document order.
    """
    def _normalise(tok):
        # Returns None for tokens that should be dropped entirely.
        surface = tok.orth_
        if surface.isspace():
            return None
        if tok.like_url:
            return 'URL'
        if surface.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    normalised = (_normalise(tok) for tok in parser(text))
    return [item for item in normalised if item is not None]

In [2]:
import nltk
# Make sure the WordNet corpus is available locally before it is used for
# lemmatization below (the recorded output shows it reports "already
# up-to-date" when the package is present).
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the base form of ``word`` according to ``wn.morphy``.

    When ``wn.morphy`` returns ``None``, the input word is passed through
    unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(word)
    return lemma

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rupamacharyya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Make sure NLTK's stopword lists are available locally, then keep the English
# list as a set so the `token not in en_stop` checks are constant-time lookups.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rupamacharyya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text):
    """Turn raw text into the list of lemmatized tokens used for topic modelling.

    Tokens produced by ``tokenize`` are kept only when they are longer than
    four characters and are not in ``en_stop``; each surviving token is then
    mapped through ``get_lemma``.

    Returns:
        list: the filtered, lemmatized tokens in their original order.
    """
    return [
        get_lemma(candidate)
        for candidate in tokenize(text)
        if len(candidate) > 4 and candidate not in en_stop
    ]

In [5]:
import random

# Build the training sample: each line of dataset.csv is kept with a
# probability of roughly 1%, and only the kept lines are tokenized.
# NOTE(review): no random seed is set, so the sample differs between runs.
text_data = []
with open('dataset.csv') as f:
    for line in f:
        # Draw first so the lines that are not sampled are never tokenized or
        # lemmatized.  Exactly one draw per line is still made, so for a given
        # RNG state the same lines are selected as before.
        if random.random() <= .99:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['physically', 'base', 'approach', 'shape', 'blending']
['incremental', 'maintenance', 'distributive', 'aggregate', 'function']
['asynchronous', 'spike', 'event', 'coding', 'scheme', 'programmable', 'analog', 'array']
['voltage', 'syllabic', 'companding', 'domain', 'filter']
['reduce', 'complexity', 'space', 'frequency', 'model', 'multi', 'channel', 'application']
['cognitive', 'mobile', 'virtual', 'network', 'operator', 'investment', 'pricing', 'supply', 'uncertainty']
['predict', 'click', 'estimate', 'click']
['supporting', 'list', 'model', 'timely', 'approach']
['programming']
['resilient', 'right', 'protection', 'sensor', 'stream']
['novel', 'hybrid', 'neuro', 'wavelet', 'system', 'robust', 'speech', 'recognition']
['image', 'sensor', 'using', 'variable', 'reference', 'domain', 'encoding']
['power', 'minimization', '433-mhz', 'implantable', 'neural', 'recording', 'system']
['distribute', 'exponentially', 'weight', 'split']
['towards', 'legged']
['maximum', 'coverage', 'minimum', 'm

In [7]:
from gensim import corpora
# Map every distinct token in the sample to an integer id.
dictionary = corpora.Dictionary(text_data)
# Convert each tokenized document to its bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in text_data]
import pickle
# Persist both artifacts for later reuse.  The pickle file is opened in a
# `with` block so the handle is flushed and closed even if dumping fails
# (previously the file object was never closed explicitly).
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [10]:
# Show the bag-of-words corpus: one list of (id, count) pairs per sampled
# document.  NOTE(review): this prints the entire corpus; consider a slice
# such as corpus[:5] if the output becomes unwieldy.
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1), (8, 1), (9, 1)], [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)], [(18, 1), (19, 1), (20, 1), (21, 1), (22, 1)], [(23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)], [(31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)], [(40, 2), (41, 1), (42, 1)], [(0, 1), (27, 1), (43, 1), (44, 1), (45, 1)], [(46, 1)], [(47, 1), (48, 1), (49, 1), (50, 1), (51, 1)], [(52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1)], [(19, 1), (50, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1)], [(58, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1)], [(71, 1), (72, 1), (73, 1), (74, 1)], [(75, 1), (76, 1)], [(19, 1), (28, 1), (34, 1), (77, 1), (78, 1), (79, 1)], [(51, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1)], [(69, 1), (86, 1), (87, 1), (88, 1)], [(89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1)], [(28, 1), (93, 1),