In [132]:
import re
import numpy as np
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


In [78]:
#read annual plans
#plans_og = pd.read_json('annualPlans1.json')

#read in proposals, will train model on this larger set
proposals = pd.read_csv("proposals.csv", encoding = 'cp1252')

#will run it later on this target set
plans = pd.read_csv("all_plans.csv",  encoding = 'cp1252')


In [90]:
len(proposals)

4695

In [79]:
p_long = proposals.melt()
p_long.columns = ['type', 'text']


#will use later
plans['full_text']= plans['description'] + plans['response_1'] + plans['response_2'] + plans['response_3']
plan_text = plans['full_text']

In [82]:
len(p_long)

23475

### clean up text with simple_preprocess

In [83]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(p_long.text))
data_test = list(sent_to_words(plan_text))
print(data_words[:1])

[['this', 'regional', 'collaborative', 'proposes', 'an', 'evidence', 'based', 'system', 'for', 'increasing', 'stem', 'and', 'cte', 'teacher', 'preparation', 'to', 'meet', 'the', 'in', 'demand', 'jobs', 'and', 'address', 'shortages', 'in', 'the', 'field', 'of', 'education']]


### create bigrams and trigrams

In [84]:
# # Build the bigram and trigram models
# bigram = gensim.models.Phrases(data_words, min_count=3, threshold=50) # higher threshold fewer phrases.
# trigram = gensim.models.Phrases(bigram[data_words], threshold=10)  

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

# # See trigram example
# #print(trigram_mod[bigram_mod[data_words[0]]])

In [85]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
new_stop = {'consortium', 'college', 'district', 'county','member', 
            'members', 'regions','plans', 'also', 'region', 'regional', 'desert',
             'institution', 'north', 'west', 'south', 'east', 'valley', 'palo', 
            'effort', 'bakersfield', 'use', 'glendale', 'plan', 'would', 'add',
            'must', 'different', 'extremely', 'year', 'edu', 'http', 'edu', 'ne',
            'college', 'state', 'use', 'allow', 'take', 'could', 'look', 'consortium', 'college', 
            'district', 'county','member', 'members', 'regions','plans', 
            'also', 'region', 'regional', 'desert','institution', 'north', 'west', 'south', 'east', 
            'valley', 'palo', 'effort','bakersfield', 'use', 'glendale', 'plan', 'would',
            'add', 'must', 'different', 'extremely','year', 'edu', 'http', 'edu', 'ne', 
            'college', 'state', 'use', 'allow', 'take', 'could', 'look',
            'plan', 'joshua', 'tree', 'pearson', 'vue', 'west', 'end', 'corridor',
            'santa', 'clarita', 'valley','palo', 'verde','south', 'orange', 'county',
            'salano', 'december', 'stanislaus', 'counties', 'marin', 's',"'s'",'marin',
            'luis', 'obispo', 'education', 'butte'}

stop_words = stop_words.union(new_stop)


In [86]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [87]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
data_test_nostops = remove_stopwords(data_test)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
data_test_bigrams = make_bigrams(data_test_nostops)

In [88]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("C:/Users/Sarah Robinson/Miniconda3/lib/site-packages/spacy/data/en/en_core_web_sm-2.0.0") #didn't work... wouldn't load?

#trying nltk tagger instead
import nltk


In [89]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, 
                                allowed_postags=['NOUN','ADJ', 'VERB'])

test_lemmatized = lemmatization(data_test_bigrams, allowed_postags = ['NOUN', 'ADJ','VERB'])
print(data_lemmatized[:2])

[['collaborative', 'propos', 'evidence_based', 'system', 'increase', 'stem', 'cte', 'teacher_preparation', 'meet', 'demand', 'job', 'address', 'shortage', 'field'], ['proposal', 'create', 'dynamic', 'entrepreneurial', 'certificate', 'program', 'partnership', 'entrepreneurial', 'organization', 'partner', 'college', 'accord', 'northern_california', 'small_business', 'development', 'center', 'small_business', 'start_up', 'area', 'placer', 'expect', 'increase', 'period', 'exist', 'small_business', 'project', 'expand', 'need', 'qualified_employee', 'period', 'accord', 'burning_glass', 'labor_market', 'study', 'demand', 'student', 'small_business', 'management', 'skill', 'increase', 'placer', 'next_year', 'period', 'minority', 'business', 'expect', 'increase', 'placer', 'proposal', 'prepare', 'student', 'business', 'development', 'opportunity']]


### create the dictionary and corpus needed for topic modeling

In [91]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
i2w_test = corpora.Dictionary(test_lemmatized)

# Create Corpus
texts = data_lemmatized
test_texts = test_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
test_corpus = [i2w_test.doc2bow(test) for test in test_texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]]


In [92]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('address', 1),
  ('collaborative', 1),
  ('cte', 1),
  ('demand', 1),
  ('evidence_based', 1),
  ('field', 1),
  ('increase', 1),
  ('job', 1),
  ('meet', 1),
  ('propos', 1),
  ('shortage', 1),
  ('stem', 1),
  ('system', 1),
  ('teacher_preparation', 1)]]

### Build LDA model

In [93]:
len(corpus)

23475

In [116]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=25, 
                                           random_state=100,
                                           update_every=2,
                                           chunksize=100,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


In [117]:
from gensim.test.utils import datapath

#save model to disk
model_file = datapath("model")
lda_model.save(model_file)

In [118]:
# Print the Keyword in the 20 topics
print(lda_model.print_topics())

[(22, '0.010*"transform_communitie" + 0.008*"peralta" + 0.008*"sophisticated" + 0.007*"instrumental" + 0.007*"readily_available" + 0.006*"registration" + 0.006*"dept" + 0.005*"engine" + 0.005*"hartnell" + 0.005*"would"'), (24, '0.026*"marketing" + 0.024*"cccco" + 0.024*"priority_sector" + 0.018*"outreach" + 0.017*"brand" + 0.014*"campaign" + 0.011*"element" + 0.010*"crc" + 0.010*"deploy" + 0.010*"canada"'), (8, '0.022*"care" + 0.017*"mental_health" + 0.011*"projection" + 0.010*"tactic" + 0.010*"competition" + 0.009*"ineffective" + 0.009*"medium" + 0.008*"drive" + 0.007*"vacancy" + 0.007*"specialty"'), (20, '0.017*"english" + 0.015*"biotechnology" + 0.013*"hospitality" + 0.010*"agriculture_water" + 0.010*"pathways_trust" + 0.009*"accreditation" + 0.008*"multiple_measure" + 0.008*"distribute" + 0.007*"cooperative" + 0.007*"diversify"'), (5, '0.037*"cluster" + 0.033*"list" + 0.032*"automotive" + 0.016*"vehicle" + 0.014*"control" + 0.010*"search" + 0.009*"mechanism" + 0.008*"san_bernardino

## evaluate: coherence score and visuals

In [120]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -7.899176897818209


In [121]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.46251934446489257


### vizualize the topics

In [137]:
# Visualize the topics
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os 
pyLDAvis.enable_notebook()
pd.options.display.max_colwidth = 10000
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds = 'tsne' )
pyLDAvis.display(vis)
pyLDAvis.save_html(vis, 'lda_vis.html')
# lda_html = pyLDAvis.prepared_data_to_html(vis, template_type='notebook')
