In [1]:
import spacy
from spacy.lang.en import English

# Only tokenisation is needed by tokenize(), so a blank English pipeline is
# sufficient; no trained model has to be loaded (or installed) for it.
parser = English()

def tokenize(text):
    """Split ``text`` into normalised tokens for LDA.

    The module-level ``parser`` produces the raw tokens. Whitespace-only
    tokens are skipped, URL-like tokens are replaced by ``'URL'``, tokens
    beginning with ``'@'`` are replaced by ``'SCREEN_NAME'`` and every other
    token is emitted in lower case.

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def _normalise(tok):
        # URL detection takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [2]:
import nltk
# Make sure the WordNet corpus is available locally before it is imported;
# get_lemma() looks words up through `wn`.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return the WordNet base form of ``word``.

    ``wn.morphy`` is asked for the base form; when it returns ``None`` the
    input ``word`` is handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sataylor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Fetch NLTK's stop-word lists, then keep the English list as a set so the
# `token not in en_stop` filter in prepare_text_for_lda() is a hash lookup.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sataylor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text):
    """Turn raw ``text`` into the list of terms fed to the LDA model.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer are discarded, as are tokens found in ``en_stop``; each surviving
    token is then passed through ``get_lemma2``.

    Returns:
        list of lemmatized tokens, in their original order.
    """
    return [
        get_lemma2(term)
        for term in tokenize(text)
        if len(term) > 4 and term not in en_stop
    ]

In [5]:
import random  # NOTE(review): not used in this cell — confirm whether a later cell still needs it
# Build the corpus: every physical line of the file becomes one document,
# stored as the token list returned by prepare_text_for_lda().
text_data = []
# NOTE(review): the file is read line by line rather than parsed as CSV, so a
# header row (if any) is tokenized like a document and a quoted field spanning
# several lines would be split into separate documents — confirm the layout.
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open('data/jobs/topic_modeling/all_course_descriptions.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        text_data.append(tokens)

In [6]:
from gensim import corpora
import pickle

# Map every distinct token in the prepared documents to an integer id.
dictionary = corpora.Dictionary(text_data)
# Convert each document into its bag-of-words representation via the dictionary.
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts so later sessions can reuse them without re-tokenizing.
# The `with` block guarantees the pickle file is flushed and closed.
with open('data/jobs/topic_modeling/corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('data/jobs/topic_modeling/dictionary.gensim')

In [7]:
import gensim

NUM_TOPICS = 18
# LDA training is stochastic; a fixed seed makes repeated runs of this cell
# start from the same random state instead of producing different topics.
RANDOM_SEED = 42

# Fit the topic model on the bag-of-words corpus (10 passes over the data).
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=10,
    random_state=RANDOM_SEED,
)
ldamodel.save('model5.gensim')

# Show the five highest-weighted words of each reported topic.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.048*"design" + 0.035*"specialization" + 0.026*"project" + 0.011*"idea" + 0.009*"creative"')
(1, '0.029*"module" + 0.022*"material" + 0.021*"complete" + 0.019*"credit" + 0.017*"academic"')
(2, '0.022*"image" + 0.019*"color" + 0.018*"photo" + 0.016*"photoshop" + 0.011*"adobe"')
(3, '0.019*"institution" + 0.016*"cours" + 0.008*"overview" + 0.007*"assumption" + 0.006*"annual"')
(4, '0.010*"patient" + 0.007*"azure" + 0.007*"paris" + 0.006*"russian" + 0.005*"governance"')
(5, '0.023*"health" + 0.017*"global" + 0.014*"change" + 0.013*"international" + 0.012*"cultural"')
(6, '0.029*"business" + 0.019*"financial" + 0.018*"social" + 0.017*"marketing" + 0.014*"market"')
(7, '0.016*"programming" + 0.012*"application" + 0.012*"design" + 0.012*"build" + 0.011*"using"')
(8, '0.025*"sale" + 0.021*"product" + 0.021*"customer" + 0.013*"process" + 0.013*"service"')
(9, '0.019*"effect" + 0.014*"motion" + 0.012*"animation" + 0.011*"material" + 0.009*"technique"')
(10, '0.034*"system" + 0.020*"securi