In [1]:
# Topic Modeling

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

import numpy as np
import pandas as pd

Using Theano backend.


In [14]:
####

In [148]:
class LDA:
    '''A class that takes a list or pandas Series of strings as input and outputs a trained LDA model'''
    # Credit to Matt Brems' LDA lecture for the LDA basics
    
    def __init__(self, num_topics=5, passes=20):
        # Number of topics to find
        self.num_topics = num_topics
        
        # Number of passes over the data to make. More passes will ensure the convergence on the 'correct' 
        #  latent distribution of topics across documents and words across topics.
        self.passes = passes
        
        # Initialize the tokenizer object
        self.tokenizer = RegexpTokenizer(r'\w+')

        # Fetch an English stop words list from the NLTK package
        self.en_stop = get_stop_words('en')

        # Initialize a 'stemmer' object which will reduce words to 'stems'
        self.stemmer = PorterStemmer()    

    def transform(self, text_series):
        '''Transforms a series of texts into a dictionary and a corpus, both saved as attributes of the object'''
        self.text_series = text_series
        
        # Initialize empty list to contain tokenized strings
        tokenized_text = []
        
        # Loop through text_series
        for text in text_series:

            # Turn each string into a series of lowercase words
            raw = text.lower()
            tokens = self.tokenizer.tokenize(raw)

            # Remove stop words
            tokens = [text for text in tokens if not text in self.en_stop]

            # Turn words into 'stems,' to reduce the total number of unique words
            tokens = [self.stemmer.stem(text) for text in tokens]

            # Remove strings shorter than 4 elements
            tokens = [text for text in tokens if len(text) > 3]

            # Add tokens to list
            tokenized_text.append(tokens)

        # Create a id:term dictionary from our tokenized series of strings
        self.dictionary = corpora.Dictionary(tokenized_text)

        # Create a document-term matrix from our tokenized series of strings
        self.corpus = [self.dictionary.doc2bow(text) for text in tokenized_text]   
        
     
    def train_model(self):
        '''Train the model. Uses Gensims multiple core version of the LDA model.''' 
        self.model = gensim.models.ldamulticore.LdaMulticore(self.corpus, num_topics=self.num_topics, id2word = self.dictionary, passes=self.passes)

In [175]:
def fetch_topic_string(topic, n_words=5, join=True):
    '''Return a list of words charcterizing each topic'''
    topic_words = [LDA.model.show_topic(topic)[i][0] for i in range(n_words)]
    if join:
        topic_words = ' '.join(topic_words)
    return(topic_words)

def fetch_doc_topic(document, n_words=5, num_topics=5):
    '''Return the topic most represented by a text. Minimum string length (for error handling) is 5.'''
    if type(document) != str:
        return('')
    if len(document) < 5:
        return('')
    probs = LDA.model[LDA.dictionary.doc2bow(document.split())]
    probs = [probs[i][1] for i in range(num_topics)]
    topic = np.argmax(probs)
    return(fetch_topic_string(topic, n_words=n_words))

In [161]:
# Read in the data. Be careful with encoding! There are strang characters.
facebook = pd.read_csv('assets/facebook_data.csv', encoding='ISO-8859-1', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [162]:
def safe_string_add(*args):
    '''Safely adds multiple strings, ignores non-string inputs.'''
    string = ''
    for arg in args:
        if type(arg) == str:
            string += ' ' + arg
    return(string)        
    
# Create a column of all the text from each FB post
facebook['Text'] = [safe_string_add(facebook['media_title'][i],
         facebook['message'][i]) for i in range(facebook.shape[0])]


In [163]:
# Identify the unique brands represented
brands = facebook['brand_name'].unique()

In [164]:
# Initiate new dataframe to store data.
facebook_topics = pd.DataFrame(columns=list(facebook.columns) + ['Topic'])

In [165]:
# For each brand, train an LDA model and assign each observation to one of 5 topics. Append to pre-existing dataframe.
for brand in brands:
    try:
        brand_data = facebook[facebook['brand_name'] == brand]

        LDA = LDA(num_topics=5, passes=20)
        LDA.transform(brand_data['Text'])
        LDA.train_model()
        print(brand + ' analyzed.')
        brand_data['Topic'] = [fetch_doc_topic(text, num_topics=5) for text in brand_data['Text']]
        facebook_topics = facebook_topics.append(brand_data)
        facebook_topics.to_csv('assets/fb_w_topics.csv')
    except:
        pass

Glamor analyzed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Teen_Vogue analyzed.
Vanity_Fair analyzed.
W_Magazine analyzed.
Vogue analyzed.
Onself analyzed.
Conde_Naste_Traveler analyzed.
Clever analyzed.
Allure analyzed.


In [None]:
brand_data = facebook[facebook['brand_name'] == 'Glamor']

LDA = LDA(num_topics=5, passes=100)
LDA.transform(brand_data['Text'])
LDA.train_model()
print(brand + ' analyzed.')

In [194]:
fetch_topic_string(0, n_words = 10)

'women trump glamour photo http glmr just self magazin woman'

In [None]:
brand_data['Topic'] = 

In [181]:
[fetch_doc_topic(text, num_topics=5) for text in brand_data['Text']]

IndexError: list index out of range

In [179]:
fetch_doc_topic('trump clinton', num_topics=5)

'women trump glamour photo http'

In [170]:
pyLDAvis.display(vis)