# Topic Modelling (LDA)
https://stackoverflow.com/questions/20349958/understanding-lda-implementation-using-gensim - tutorial
https://stackoverflow.com/questions/19197757/remove-words-lines-between-matching-delimeters - regular expressions

In [2]:
import os
import re
from gensim.utils import smart_open, simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
from gensim.corpora import Dictionary
import gensim
import glob
import nltk
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/arisilburt/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
dir = 'readme_out'
Documents = [x.strip() for x in open('%s/CORPUS.txt'%dir, 'r').readlines()]

# stem words
# sno = nltk.stem.SnowballStemmer('english')
# for i in range(len(Documents)):
#     text = Documents[i].split()
#     for j in range(len(text)):
#         text[j] = sno.stem(text[j])
#     Documents[i] = ' '.join(e for e in text)

vectorizer = CountVectorizer(min_df = 10)
A = vectorizer.fit_transform(Documents)
terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(terms))

Vocabulary has 2830 distinct terms


In [88]:
Documents[100]

'energy load time license open source python library analyze plot energy related kind data strong focus related energy electricity heat demand included carefully selected context generalized working different aim provide higher level commonly scientific analysis energy load statistical analysis convenience wrapper common statistical include analysis overview descriptive statistics reshape load duration curve extract daily plot plot generate generate daily monthly generate sinusoidal function sample given load duration curve given add noise noise correlated load fit analytical load duration statistics feature extraction quick overview load curve useful coupled machine learning library regression prediction numerous relevant around run following explore code import import load random rand create random vector convenience wrapper mean load duration nice useful dispatch daily notebook documentation construction overview available latest stable version install pip install aware library conc

In [90]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=3, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [91]:
lda.fit(A)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_components=3, n_jobs=1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [92]:
def print_top_words(model, feature_names, n_top_words=20):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
print("\nTopics in LDA model:")
print_top_words(lda, vectorizer.get_feature_names())


Topics in LDA model:
Topic #0: data function use number value file type set example default output new object time input size array graph color return
Topic #1: learning model data machine training deep neural network python train use test implementation loss image linear classification feature regression analysis
Topic #2: install python use run data code project build file version available axe image need package source following create pip library



# Sklearn 
https://github.com/derekgreene/topic-model-tutorial/blob/master/1%20-%20Text%20Preprocessing.ipynb

In [61]:
EngWords = set(nltk.corpus.words.words())

def clean_readme(readme):
    preproc_text = open(readme, 'r', encoding='utf-8').read()
    preproc_text.replace('\n', '')
    preproc_text = re.sub(r'<.*?>\s*', '', preproc_text, flags=re.DOTALL)
    preproc_text = re.sub(r'\[.*?\]\s*', '', preproc_text, flags=re.DOTALL)
    preproc_text = re.sub(r'\(.*?\)\s*', '', preproc_text, flags=re.DOTALL)
    text = re.sub('[^A-Za-z0-9 /.]+', '', preproc_text).lower()
    cleaned_text = [token for token in simple_preprocess(remove_stopwords(text)) if len(token) > 2 and token in EngWords]
    return " ".join(cleaned_text)

def create_corpus(dir, topics):
    f = open('%sCORPUS.txt'%dir, 'w')
    Corpus = []
    for t in topics:
        path = '%s%s'%(dir, t)
        readmes = glob.glob('%s/*.txt'%path)
        for r in readmes:
            try:
                Corpus.append(clean_readme(r))
            except:
                print('couldnt process %s'%r)

    for doc in Corpus:
        f.write(doc)
        f.write('\n')
    f.close()
    return Corpus

In [93]:
dir = 'readme_out/'
#topics = ['visualization', 'statistics', 'programming', 'machine-learning']
topics = ['visualization', 'statistics', 'machine-learning']
Documents = create_corpus(dir, topics)

In [63]:
Documents[11]

'multiple view interface proof concept prototype personal agency brushing linking giving people flexibility configure source link target multiple brushes information tutorial video supplemental getting try local dont install node install run install root directory project serve index file run application open browser way install install simple command line root directory project tell address open application browser tested chrome browser touch disable swipe browser history disable history navigation chrome'

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 20, max_df=0.3)
#Documents = open('%s/CORPUS.txt'%dir, 'r').readlines()
A = vectorizer.fit_transform(Documents)
terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(terms))

Vocabulary has 2088 distinct terms


In [243]:
# save model
from sklearn.externals import joblib
joblib.dump(vectorizer, 'vectorizer.pkl') 
v = joblib.load('vectorizer.pkl') 

['vectorizer.pkl']

In [66]:
import operator
def rank_terms( A, terms ):
    # get the sums over each column
    sums = A.sum(axis=0)
    # map weights to the terms
    weights = {}
    for col, term in enumerate(terms):
        weights[term] = sums[0,col]
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

In [67]:
ranking = rank_terms( A, terms )
for i, pair in enumerate( ranking[0:20] ):
    print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )

01. python (3563.00)
02. learning (3271.00)
03. model (3214.00)
04. run (3055.00)
05. file (2719.00)
06. new (2242.00)
07. number (2127.00)
08. project (2092.00)
09. set (1967.00)
10. example (1948.00)
11. machine (1930.00)
12. build (1906.00)
13. function (1906.00)
14. image (1906.00)
15. program (1791.00)
16. version (1747.00)
17. time (1726.00)
18. write (1718.00)
19. create (1715.00)
20. need (1712.00)


In [68]:
from sklearn import decomposition
k = 4
model = decomposition.NMF(init="nndsvd", n_components=k ) 
# apply the model and extract the two factor matrices
W = model.fit_transform(A).round(3)
H = model.components_
W.shape

(3522, 4)

In [69]:
term_index = terms.index('time')
# round to 2 decimal places for display purposes
H[:,term_index].round(2)

array([0.79, 2.58, 0.01, 1.79])

In [70]:
import numpy as np
def get_descriptor( terms, H, topic_index, top ):
    # reverse sort the values to sort the indices
    top_indices = np.argsort( H[topic_index,:] )[::-1]
    # now get the terms corresponding to the top-ranked indices
    top_terms = []
    for term_index in top_indices[0:top]:
        top_terms.append( terms[term_index] )
    return top_terms

In [71]:
descriptors = []
for topic_index in range(k):
    descriptors.append( get_descriptor( terms, H, topic_index, 10 ) )
    str_descriptor = ", ".join( descriptors[topic_index] )
    print("Topic %02d: %s" % ( topic_index+1, str_descriptor ) )

Topic 01: array, function, object, return, value, string, reduce, number, element, argument
Topic 02: best, docker, list, git, star, web, free, security, build, source
Topic 03: program, write, number, array, string, element, sum, decimal, integer, letter
Topic 04: model, learning, training, set, machine, deep, neural, python, run, example


In [247]:
model.transform(vectorizer.transform([token for token in simple_preprocess(text) if token not in STOPWORDS and token in EngWords]))

array([[6.58187758e-04, 1.49582631e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.18141303e-03, 6.36874055e-04, 0.00000000e+00],
       [1.06355310e-03, 4.58930790e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.18141303e-03, 6.36874055e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.16696832e-04, 1.46538317e-04, 1.79728463e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [6.58187758e-04, 1.49582631e-04, 0.00000000e+00],
       [0.00000000e+00, 6.64365160e-04, 0.00000000e+00],
       [1.55401805e-04, 3.05490558e-04, 0.00000000e+00],
       [3.60335247e-03, 8.90080897e-04, 1.12211936e-04],
       [1.15610671e-03, 1.98196358e-04, 0.00000000e+00],
       [4.40025189e-04, 1.38665584e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000