In [133]:
import re
import numpy as np
import pandas as pd
import string
import nltk
import spacy

In [89]:
import gensim

In [90]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import pdist, squareform

In [91]:
import os
import codecs


In [92]:
# nltk.download('stopwords')

# pd.get_option('display.max_colwidth') #50
# pd.set_option('display.max_colwidth',60)

### Import annual plan descriptions & the name of the institutions

In [93]:
institutions_df = pd.read_csv('institutions_table.csv')


In [151]:
plans = pd.read_csv('all_plans.csv', encoding = 'cp1252')

plans['full_text']= plans['description'] + plans['response_1'] + plans['response_2'] + plans['response_3']

In [95]:
plans.head()

Unnamed: 0,institution_id,name,proposal_id,fund_id,year_id,description,response_1,response_2,response_3,full_text
0,341,Desert,4069,3,2019,The Desert Regional Consortium has made a conc...,All AEBG allocations are approved unanimously ...,All AEBG funds from present and prior years ar...,The Desert Regional Consortium will have calen...,The Desert Regional Consortium has made a conc...
1,364,North Orange,4092,3,2019,Working together with all eight NOCRC Members ...,NOCR followed the established practices when a...,The carry-over funds will help support with th...,NOCRC will engage all eight consortium members...,Working together with all eight NOCRC Members ...
2,342,South Bay (El Camino),4070,3,2019,MISSION\nThe South Bay Adult Education Consort...,The uncertainty of the future funding makes it...,The South Bay Adult Education Consortium remai...,The SBAEC consortium has begun a strategic pla...,MISSION\nThe South Bay Adult Education Consort...
3,390,South Bay (Southwestern),4118,3,2019,Vision: The South Bay Adult Education Consorti...,"The planned allocations for CUSD, SWC, and SUH...",We will focus remaining carryover funds at the...,A consortium-wide summit was held in Spring 20...,Vision: The South Bay Adult Education Consorti...
4,374,San Bernardino,4102,3,2019,In keeping with the Collective Impact approach...,"IAEC Board Members engage in ongoing data, pro...",IAEC Member Districts participate in regularly...,The IAEC Executive Board holds regular meeting...,In keeping with the Collective Impact approach...


### Remove stop words from the descriptions and use a stemmer (eg: Snowball) to reduce words to stems 

In [157]:
stop = set(stopwords.words('english'))
new_stop = {'consortium', 'college', 'district', 'county','member', 'members', 'regions','plans', 
            'also', 'region', 'regional', 'desert','institution', 'north', 'west', 'south', 'east', 
            'valley', 'palo', 'effort','bakersfield', 'use', 'glendale', 'plan', 'would',
            'add', 'must', 'different', 'extremely','year', 'edu', 'http', 'edu', 'ne', 
            'college', 'state', 'use', 'allow', 'take', 'could', 'look', 'three', 'budget',
            'plan', 'joshua', 'tree', 'pearson', 'vue', 'west', 'end', 'corridor',
            'santa', 'clarita', 'valley','palo', 'verde','south', 'orange', 'county',
            'salano', 'december', 'stanislaus', 'counties', 'marin', 's',"'s'",'marin',
            'luis', 'obispo', 'education', 'butte'}
stop = stop.union(new_stop)

stemmer = SnowballStemmer("english")

# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [158]:
descriptions = plans['full_text']
#nltk.download('punkt')
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in descriptions:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'descriptions', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)



In [159]:
#create a pandas DataFrame with the stemmed vocabulary as the index and the tokenized words as the column

vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, 
                           index = totalvocab_stemmed)
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 52223 items in vocab_frame


### Use NLTK collocations package to find phrases

In [160]:
from nltk.collocations import *


token_text = [token for token in totalvocab_tokenized if token not in stop]
bigramFinder = BigramCollocationFinder.from_words(token_text)

bigrams = nltk.collocations.BigramAssocMeasures()
trigrams = nltk.collocations.TrigramAssocMeasures()

trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words(token_text)

In [161]:
bigramFinder.apply_freq_filter(3)
bigramFinder.nbest(bigrams.pmi, 10)

[('criminal', 'justice'),
 ('dsusd', 'psusd'),
 ('love', 'logic'),
 ('therapy', 'aide'),
 ('chamber', 'commerce'),
 ('mendocino', 'lake'),
 ('environmental', 'scan'),
 ('capital', 'outlay'),
 ('task', 'forces'),
 ('mother', 'lode')]

In [162]:
trigramFinder.apply_freq_filter(3)
trigramFinder.nbest(trigrams.pmi, 20)

[('long-term', 'trends', 'high-growth'),
 ('manage', 'spend-down', 'expenditure'),
 ('trends', 'high-growth', 'sectors'),
 ('innovation', 'opportunity', 'act'),
 ('foundational', 'toward', 'realization'),
 ('human', 'centered', 'design'),
 ('continuous', 'improvement', 'among'),
 ('spend-down', 'expenditure', 'reporting'),
 ('communities', 'blythe', 'needles'),
 ('serves', 'foundational', 'toward'),
 ('wioa', 'title', 'ii'),
 ('ensure', 'continuous', 'improvement'),
 ('opportunity', 'act', 'wioa'),
 ('create', 'stability', 'among'),
 ('exploration', 'job', 'search'),
 ('together', 'create', 'stability'),
 ('fund', 'due', 'expire'),
 ('analysis', 'ensure', 'continuous'),
 ('improvement', 'among', 'decisions'),
 ('decision', 'making', 'comprehensive')]

#### filter out ngrams based on word type

In [163]:
#bigrams
bigram_freq = bigramFinder.ngram_fd.items()
bigramFreqTable = pd.DataFrame(list(bigram_freq), columns=['bigram','freq']).sort_values(by='freq', ascending=False)
#trigrams
trigram_freq = trigramFinder.ngram_fd.items()
trigramFreqTable = pd.DataFrame(list(trigram_freq), columns=['trigram','freq']).sort_values(by='freq', ascending=False)

In [164]:

#function to filter for ADJ/NN bigrams and filter out stopwords
def rightTypes(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in stop or word.isspace():
            return False
    acceptable_types = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    second_type = ('NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in acceptable_types and tags[1][1] in second_type:
        return True
    else:
        return False
#filter bigrams
filtered_bi = bigramFreqTable[bigramFreqTable.bigram.map(lambda x: rightTypes(x))]
#function to filter for trigrams
def rightTypesTri(ngram):
    if '-pron-' in ngram or 't' in ngram:
        return False
    for word in ngram:
        if word in en_stopwords or word.isspace():
            return False
    first_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    third_type = ('JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS')
    tags = nltk.pos_tag(ngram)
    if tags[0][1] in first_type and tags[2][1] in third_type:
        return True
    else:
        return False
#filter trigrams
filtered_tri = trigramFreqTable[trigramFreqTable.trigram.map(lambda x: rightTypesTri(x))]

In [165]:
#filter for only those with more than 20 occurences
bigramFinder.apply_freq_filter(7)
trigramFinder.apply_freq_filter(7)

bigramPMITable = pd.DataFrame(list(bigramFinder.score_ngrams(bigrams.pmi)), 
                              columns=['bigram','PMI']).sort_values(by='PMI', ascending=False)
trigramPMITable = pd.DataFrame(list(trigramFinder.score_ngrams(trigrams.pmi)), 
                               columns=['trigram','PMI']).sort_values(by='PMI', ascending=False)

In [166]:
trigramPMITable.head()

Unnamed: 0,trigram,PMI
0,"(english, second, language)",19.080425
1,"(wioa, strong, workforce)",15.479315
2,"(planned, allocations, consistent)",15.374052
3,"(high, school, diploma)",14.908368
4,"(carry-over, funds, prior)",13.564471


In [167]:
bigramPMITable.head()

Unnamed: 0,bigram,PMI
0,"(mother, lode)",12.027215
1,"(ml, ace)",11.83457
2,"(los, angeles)",11.512642
3,"(mini, grant)",10.375138
4,"(due, expire)",10.249607


### Use SciKit-Learn's TFIDF Vectorizer to obtain a weighted list of term occurrences for each plan
Also optionally use CountVectorizer to help see what's going on

In [168]:
nlp = spacy.load("C:/Users/Sarah Robinson/Miniconda3/lib/site-packages/spacy/data/en/en_core_web_sm-2.0.0")

In [169]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    doc = nlp(" ".join(texts)) 
    texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [170]:
test_words = ['test', 'friend', 'very', 'good', 'at', 'this', 'nlp', 'shit', 
              'i', 'am', 'so', 'friggen', 'tyty']

test_lemons = lemmatization(test_words, allowed_postags = ['NOUN'])
print(test_lemons)

[['test', 'friend', 'nlp', 'shit', 'friggen']]


In [171]:
def token_stem_lem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    #filter all but noun, adjectives, and verbs
    lemmons = lemmatization(tokens, allowed_postags = ['NOUN', 'VERB','ADJ'])
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in lemmons:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
 

In [172]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, max_features=200,
                                 min_df=0.001, stop_words= stop,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,4))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions) #fit the vectorizer to descriptions

print(tfidf_matrix.shape)

  sorted(inconsistent))


Wall time: 1.56 s
(71, 200)


In [173]:
#get vocabulary list
terms = tfidf_vectorizer.get_feature_names()

### Use scikit learn to get the cosine distance between descriptions  

In [174]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [175]:
tfidf_matrix

<71x200 sparse matrix of type '<class 'numpy.float64'>'
	with 5280 stored elements in Compressed Sparse Row format>

# K-means clustering

In [176]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

Wall time: 205 ms


In [177]:
institution_ids = plans.name
plans_dict = {'institution': institution_ids, 'description': descriptions, 'cluster':clusters}
df = pd.DataFrame.from_dict(plans_dict)


In [178]:
df.head()

Unnamed: 0,institution,description,cluster
0,Desert,The Desert Regional Consortium has made a conc...,4
1,North Orange,Working together with all eight NOCRC Members ...,4
2,South Bay (El Camino),MISSION\nThe South Bay Adult Education Consort...,0
3,South Bay (Southwestern),Vision: The South Bay Adult Education Consorti...,0
4,San Bernardino,In keeping with the Collective Impact approach...,1


In [179]:
print(df['cluster'].value_counts())
df.set_index('cluster', inplace = True)

2    26
4    23
0    12
1     8
3     2
Name: cluster, dtype: int64


In [180]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :20]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d instituions:" % i, end='')
    for ids in df.loc[i]['institution']:
        print(' %s,' % ids, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: b'classes', b'sites', b'adults', b'team', b'well', b'instruction', b'building', b'activities', b'last', b'disabilities', b'help', b'exploration', b'noncredit', b'progress', b'market', b'project', b'original', b'basic', b'groups', b'consist',

Cluster 0 instituions: South Bay (El Camino), South Bay (Southwestern), Butte-Glenn, Mid Alameda County (Chabot-Las Positas), Southeast Los Angeles, College of the Canyons, San Francisco, Monterey, Southern Alameda County (Ohlone), Salinas Valley, South Bay (San Jose Evergreen), Morongo Basin,

Cluster 1 words: b'aep', b'three-year', b'fiscally', b'board', b'ensure', b'engage', b'program', b'county', b"'s", b'across', b'approved', b'academic', b'gaps', b'managed', b'comprehensive', b'noncredit', b'adults', b'make', b'approaches', b'ongoing',

Cluster 1 instituions: San Bernardino, Victor Valley, South Orange, Ventura County, Feather River, Contra Costa, Santa Monica, Barstow,

Cluster 2 words: b'county', b'