## Model the NPR articles with Latent Dirichlet Allocation

  1. Run the LDA model with scikit-learn (https://scikit-learn.org/stable/modules/decomposition.html#latentdirichletallocation)
  2. Visualize it with pyLDAvis (https://pyldavis.readthedocs.io/en/latest)

In [20]:
# USE Python3
import numpy as np
import pandas as pd
import warnings, re, string
warnings.filterwarnings('ignore', category=DeprecationWarning, module='.*/IPython/.*')
warnings.filterwarnings('ignore', category=DeprecationWarning, module='pyLDAvis')

import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Switch pyLDAvis into notebook mode so prepared visualisations display in-cell.
pyLDAvis.enable_notebook()

# Read the pipe-separated descriptions file into a DataFrame.
df = pd.read_csv('desc_1000.csv', sep='|')

In [21]:
def rem_html(desc):
    """Return ``desc`` with every ``<...>`` tag-like span removed."""
    tag_pattern = re.compile('<[^<]+?>')
    return tag_pattern.sub('', desc)

def rem_punc(desc):
    """Return ``desc`` with every character in ``string.punctuation`` deleted."""
    # The third argument of str.maketrans lists the characters to drop.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return desc.translate(strip_punctuation)

def cleaner(desc):
    """Normalise one description: drop tags, drop punctuation, lower-case."""
    without_tags = rem_html(desc)
    without_punctuation = rem_punc(without_tags)
    return without_punctuation.lower()

In [22]:
# Clean every description with a pandas element-wise map rather than
# np.vectorize: np.vectorize (without otypes) raises on a zero-row column and
# offers no speed-up over a plain Python-level loop.
# Missing descriptions (NaN after read_csv) are treated as empty text so the
# regex/translate helpers always receive a str.
df['desc_clean'] = df['desc'].fillna('').map(cleaner)

In [23]:
# Plain Python list of cleaned descriptions, one entry per row of df.
text = df['desc_clean'].tolist()

## Vectorize the words

Create a numeric representation of the descriptions: a document-term matrix in which each entry is the TF-IDF-weighted frequency of a word in a description.

In [24]:
# Upper bound on the number of vocabulary terms the vectorizer keeps.
max_features = 1000

# Skip terms that appear in more than 95% of the documents or in fewer than
# two documents, and drop English stop words.
# NOTE(review): LDA is usually fit on raw term counts (CountVectorizer);
# TF-IDF weights are used here -- confirm this is intended.
tf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=max_features,
    min_df=2,
    max_df=0.95,
)
tf = tf_vectorizer.fit_transform(text)
print("ready")

ready


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


## Run LDA

In [28]:
# Number of topics to fit.
n_topics = 4

# `n_components` is the current name of the topic-count argument; the older
# `n_topics` keyword was deprecated in scikit-learn 0.19 and later removed,
# so passing it fails on current releases.
# random_state is fixed so the online fit is repeatable.
lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0)

lda_model.fit(tf)
# Last expression of the cell: the prepared pyLDAvis object is what gets
# displayed. R is passed through to pyLDAvis (presumably the number of terms
# listed per topic -- confirm against the pyLDAvis docs).
pyLDAvis.sklearn.prepare(lda_model, tf, tf_vectorizer, R=20)

In [26]:
def get_top_words(model, feature_names, n_top_words):
    """Map each topic index (as a string) to its highest-weighted feature names.

    Each row of ``model.components_`` is ranked by weight, largest first, and
    the names of the first ``n_top_words`` features are kept.

    Returns a dict ``{"0": [...], "1": [...], ...}`` with one entry per row.
    """
    return {
        str(topic_number): [
            feature_names[j] for j in weights.argsort()[::-1][:n_top_words]
        ]
        for topic_number, weights in enumerate(model.components_)
    }

In [27]:
## get the token to topic matrix
# Take the matrix straight from the fitted model (transposed to
# tokens x topics) instead of pre-allocating (max_features, n_topics):
# the vectorizer may keep fewer than max_features terms, and n_topics may
# have been edited since the model was last fit, either of which made the
# old column-by-column fill fail. copy() keeps the model's own array untouched.
word_topic = lda_model.components_.T.copy()
print(n_topics)
print("token-topic matrix",word_topic.shape)

## create a matrix of the top words used to define each topic
# Separate name for the count so `top_words` only ever holds the dict below.
n_top_words = 15
# get_feature_names() was removed from newer scikit-learn releases; prefer
# get_feature_names_out() and fall back only when it does not exist.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = np.array(list(tf_vectorizer.get_feature_names_out()))
else:
    tf_feature_names = np.array(tf_vectorizer.get_feature_names())
top_words = get_top_words(lda_model, tf_feature_names, n_top_words)
# Union of every topic's top words; sorted so the order (and therefore
# top_word_inds) is the same on every run instead of following set order.
all_top_words = np.array(sorted(set().union(*top_words.values())))

for key,vals in top_words.items():
    print(key," ".join(vals))
print("total words: %s"%len(all_top_words))

# Row index in the vocabulary (and in word_topic) of each selected top word.
top_word_inds = [np.where(tf_feature_names == tw)[0][0] for tw in all_top_words]

5
token-topic matrix (1000, 5)
0 permit daily person private nature sent access requires later away years isnt day climbing climbers
1 left right climb crack start route face bolt crux good climbing arete bolts holds fun
2 pay roof permadraw permadraws penelope considered says rounded extension pump check start good clipping tight
3 quickly lunge climb start doesnt adds technique length surprisingly offwidth difficulty left starts crux good
4 ice beginner add standing page link useful use start glacier snow park route left climb
total words: 63
