# Topic Modeling

#### Ryan Bales (@ryanbales)<br>ryan@balesofdata.com

***

### Import Packages

In [1]:
import json
import gensim

import spacy
nlp = spacy.load('en', disable=['parser', 'ner']) 

# Plotting Tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

#### Define Constants

In [2]:
data_path = "data/2016_debates/"
transcription_data_path = "{}{}".format(data_path, "transcripts/")
topic_count = 5
words_per_topic = 10

### Define Helper Functions

In [3]:
def get_text(file_list):
    data = []
    
    for file_path in file_list:
        with open(file_path, "r") as f:
            transcript = json.load(f)
            data.append(transcript["results"]["transcripts"][0]["transcript"])
    
    return data

#### We're using spacy for Lemmatization <br/> You'll need to run the following command to install the enlish package for spacy: python -m spacy download en

In [4]:
nlp = spacy.load("en")

def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

### Load Transcription Data

In [5]:
files = [
    "{}{}".format(transcription_data_path, "debate_1.mp3.json"),
    "{}{}".format(transcription_data_path, "debate_2.mp3.json"),
    "{}{}".format(transcription_data_path, "debate_3.mp3.json"),
    "{}{}".format(transcription_data_path, "vp_debate.mp3.json")
]
                       
docs = get_text(files)

### Preprocess Documents and Remove Stop Words

In [6]:
clean_docs_no_stops = []

for doc in docs:
    clean_doc = []
    for word in gensim.utils.simple_preprocess(str(doc)):
        if word not in nlp.Defaults.stop_words:
            clean_doc.append(word)
            
    clean_docs_no_stops.append(clean_doc)

### Do Lemmatization

In [8]:
text_data_clean = lemmatization(clean_docs_no_stops)

### Create Dictionary and Corpus for Modeling

In [9]:
dictionary = gensim.corpora.Dictionary(text_data_clean)
corpus = [dictionary.doc2bow(text) for text in text_data_clean]

### We're Ready to Build the LDA Model

In [10]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_count, random_state=100, update_every=1, 
                                            chunksize=100, passes=10, alpha='auto', per_word_topics=True)

### Show Topic Results

In [11]:
lda_model.print_topics(num_topics=topic_count, num_words=words_per_topic)

[(0,
  '0.001*"say" + 0.001*"people" + 0.001*"want" + 0.001*"go" + 0.001*"country" + 0.001*"know" + 0.001*"ve" + 0.001*"think" + 0.001*"trump" + 0.001*"look"'),
 (1,
  '0.001*"go" + 0.001*"say" + 0.001*"people" + 0.001*"know" + 0.001*"trump" + 0.001*"country" + 0.001*"question" + 0.001*"thing" + 0.001*"look" + 0.001*"ve"'),
 (2,
  '0.016*"go" + 0.013*"say" + 0.012*"people" + 0.011*"want" + 0.010*"think" + 0.010*"country" + 0.009*"trump" + 0.009*"know" + 0.009*"ve" + 0.007*"look"'),
 (3,
  '0.001*"go" + 0.001*"say" + 0.001*"want" + 0.001*"people" + 0.001*"country" + 0.001*"know" + 0.001*"trump" + 0.001*"ve" + 0.001*"think" + 0.001*"work"'),
 (4,
  '0.001*"say" + 0.001*"go" + 0.001*"know" + 0.001*"people" + 0.001*"ve" + 0.001*"want" + 0.001*"trump" + 0.001*"thing" + 0.001*"american" + 0.001*"think"')]

### Visualize the Topic Model

In [14]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
127.0.0.1 - - [07/Jan/2019 02:14:03] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [07/Jan/2019 02:14:03] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [07/Jan/2019 02:14:03] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [07/Jan/2019 02:14:03] "GET /LDAvis.js HTTP/1.1" 200 -



stopping Server...
