In [148]:
# Topic Modeling on Track Maven Social Media Data
#
# Ben Shaver
# December 2017

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import pyLDAvis.gensim

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Topic modeling is a form of unsupervised learning. Like other unsupervised learning techniques, it learns patterns in a set of unlabelled 
# data, or data without a target variable to be predicted. In this case, the patterns learned are latent 'topics' that appear in a set of 
# texts, or documents. 
# Latent Dirichlet Allocation, or LDA, is a form of topic modeling that assumes documents are just bags of words, ignoring syntax and grammar.
# LDA assumes documents are a mix of topics. Each word in a document belongs to each topic with a fixed probability unique to that document, and
# each topic in turn returns a given word with a set of probabilities unique to that topic.
# The purpose of LDA is to approximate the assumed 'latent' distribution which represents the mix of topics across documents AND the mix of
# words across topics. Once the LDA model is trained, it can be used to compute the mix of topics for a particular document, and to compute
# the mix of words per topic. Note that each word is not unique to a topic, but merely more likely to appear for a given topic.
# 
# Below, I wrap the Python package Gensim's functionality into a class and some helper functions in order to train a model on social media
# data from Track Maven. For each brand within the Conde Nast umbrella, LDA trains a model on the unique corpus of its facebook posts
# (combining the different text fields first). Then a topic is assigned to each observation according to the topic most highly represented
# by that post. Finally, a CSV file is saved which is identical to the file read in except 'text' and 'topic' columns have been added.

In [19]:
class LDA:
    '''Turn a list or pandas Series of strings into a dictionary/corpus and train a Gensim LDA model.

    Typical use: ``lda = LDA(); lda.transform(texts); lda.train_model()``.
    ``transform`` sets ``text_series``, ``dictionary`` and ``corpus``;
    ``train_model`` sets ``model``.
    '''
    # Credit to Matt Brems' LDA lecture for the LDA basics

    def __init__(self, num_topics=5, passes=20, random_state=None):
        '''Store the hyper-parameters and build the text-preprocessing helpers.

        num_topics   -- number of topics to find
        passes       -- number of passes over the data during training
        random_state -- optional seed forwarded to Gensim so that training is
                        reproducible; None (the default) keeps Gensim's default behaviour
        '''
        # Number of topics to find
        self.num_topics = num_topics

        # Number of passes over the data to make. More passes will ensure the convergence on the 'correct'
        #  latent distribution of topics across documents and words across topics.
        self.passes = passes

        # Seed for the model's random number generator (None = unseeded)
        self.random_state = random_state

        # Initialize the tokenizer object (splits on runs of word characters)
        self.tokenizer = RegexpTokenizer(r'\w+')

        # Fetch an English stop words list from the stop_words package
        self.en_stop = get_stop_words('en')

        # Initialize a 'stemmer' object which will reduce words to 'stems'
        self.stemmer = PorterStemmer()

    def tokenize(self, text):
        '''Return the list of stemmed tokens for a single document.

        This is the exact preprocessing used to build the dictionary, so it should
        also be applied to any document that is later passed to the trained model.
        Non-string input (e.g. NaN from a missing value) yields an empty list
        instead of raising.
        '''
        if not isinstance(text, str):
            return []

        # Lowercase, then split into word tokens
        words = self.tokenizer.tokenize(text.lower())

        # Remove stop words
        words = [word for word in words if word not in self.en_stop]

        # Turn words into 'stems,' to reduce the total number of unique words
        stems = [self.stemmer.stem(word) for word in words]

        # Remove strings shorter than 4 characters (checked after stemming)
        return [stem for stem in stems if len(stem) > 3]

    def transform(self, text_series):
        '''Transforms a series of texts into a dictionary and a corpus, both saved as attributes of the object'''
        self.text_series = text_series

        # One token list per input text; unusable inputs become empty documents so
        # that corpus positions keep lining up with text_series positions.
        tokenized_text = [self.tokenize(text) for text in text_series]

        # Create a id:term dictionary from our tokenized series of strings
        self.dictionary = corpora.Dictionary(tokenized_text)

        # Create a document-term matrix (bag-of-words per document) from our tokenized series of strings
        self.corpus = [self.dictionary.doc2bow(tokens) for tokens in tokenized_text]

    def train_model(self):
        '''Train the model. Uses Gensims multiple core implementation of the LDA model.

        Requires transform() to have been called first, since it reads
        self.corpus and self.dictionary.
        '''
        self.model = gensim.models.ldamulticore.LdaMulticore(
            self.corpus,
            num_topics=self.num_topics,
            id2word=self.dictionary,
            passes=self.passes,
            random_state=self.random_state,
        )

In [31]:
def fetch_topic_string(topic, n_words=5, join=True):
    '''Return the top words characterizing one topic of the global ``lda`` model.

    topic   -- id of the topic to describe
    n_words -- how many of the topic's top words to return
    join    -- if True return the words as one space-separated string,
               otherwise return them as a list
    '''
    # Ask for exactly n_words in a single call. Indexing the default result once
    # per word repeated the lookup and failed for n_words > 10, because
    # show_topic only returns 10 words unless topn is given.
    topic_words = [word for word, _ in lda.model.show_topic(topic, topn=n_words)]
    if join:
        topic_words = ' '.join(topic_words)
    return topic_words

def fetch_doc_topic(document, n_words=5, num_topics=5):
    '''Return the top words of the topic most represented by a text.

    Uses the global ``lda`` object. Returns '' for non-string input and for
    strings shorter than 5 characters (error handling).

    document   -- raw text of the post
    n_words    -- number of topic words in the returned string
    num_topics -- unused; kept so existing calls keep working
    '''
    if not isinstance(document, str) or len(document) < 5:
        return ''

    # Preprocess exactly like LDA.transform does (lowercase, tokenize, drop stop
    # words, stem, drop stems shorter than 4 characters). A plain
    # document.split() produces raw words that mostly miss the stemmed dictionary.
    words = [word for word in lda.tokenizer.tokenize(document.lower())
             if word not in lda.en_stop]
    stems = [stem for stem in map(lda.stemmer.stem, words) if len(stem) > 3]

    # List of (topic_id, probability) pairs; low-probability topics may be absent
    topic_probs = lda.model[lda.dictionary.doc2bow(stems)]
    if not topic_probs:
        return ''

    # Take the topic id of the most probable pair. The position of the maximum in
    # the list is not a topic id when some topics have been filtered out.
    best_topic = max(topic_probs, key=lambda pair: pair[1])[0]
    return fetch_topic_string(best_topic, n_words=n_words)

In [6]:
# Read in the data. Be careful with encoding! There are strange characters.
# The file is decoded as ISO-8859-1 and its first column is used as the row index.
facebook = pd.read_csv('assets/facebook_data.csv', encoding='ISO-8859-1', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
def safe_string_add(*args):
    '''Concatenate every str argument, each preceded by one space; anything that is not a str is skipped.'''
    return ''.join(' ' + piece for piece in args if type(piece) is str)
    
# Create a column of all the text from each FB post.
# The two columns are walked in row order with zip. Looking values up with
# facebook[col][i] for i in range(n) is a *label* lookup, which only matches the
# row position when the index loaded from the CSV happens to be exactly 0..n-1.
facebook['Text'] = [safe_string_add(title, message)
                    for title, message in zip(facebook['media_title'], facebook['message'])]

# If you'd like to do a similar analysis for Instagram data, 
# simply import the IG data and combine all text fields in a 'Text' column.


In [8]:
# Identify the unique brands represented (distinct values of the 'brand_name' column)
brands = facebook['brand_name'].unique()

# Replace 'facebook' with 'instagram' here and below...

In [9]:
# Initiate new dataframe to store data.
# It starts with no rows and has the columns of `facebook` plus an extra 'Topic' column.
facebook_topics = pd.DataFrame(columns=list(facebook.columns) + ['Topic'])

# For each brand, train an LDA model and assign each observation to one of 5 topics. Append to pre-existing dataframe.
# This will take a while.
# NOTE(review): the loop below is commented out, so running this cell only creates the empty frame above.
# If it is re-enabled, note that the bare `except: pass` silently skips any brand that fails.

# for brand in brands:
#     try:
#         brand_data = facebook[facebook['brand_name'] == brand]

#         lda = LDA(num_topics=5, passes=20)
#         lda.transform(brand_data['Text'])
#         lda.train_model()
#         print(brand + ' analyzed.')
#         brand_data['Topic'] = [fetch_doc_topic(text, num_topics=5) for text in brand_data['Text']]
#         facebook_topics = facebook_topics.append(brand_data)
#         facebook_topics.to_csv('assets/fb_w_topics.csv')
#     except:
#         pass

In [140]:
# Teen Vogue has the most FB posts, by a significant margin. Let's focus on Teen Vogue, and try and determine how to
# develop distinct topics (not too many, not too few) and a sensible name for a topic.
brand = 'Teen_Vogue'
# Keep only the rows belonging to this brand
brand_data = facebook[facebook['brand_name'] == brand]

# Number of topics to fit; reused by later cells when grouping text and naming topics
num_topics = 5

# `lda` is a module-level name: the fetch_* helper functions read it as a global,
# so they always use whichever model was trained most recently.
lda = LDA(num_topics=num_topics, passes=30)
lda.transform(brand_data['Text'])
lda.train_model()

print(brand + ' analyzed.')

Teen_Vogue analyzed.


In [141]:
def fetch_doc_topics(document, n_words=5, num_topics=5):
    '''Return the probability of every topic for a text, as a list indexed by topic id.

    Uses the global ``lda`` object. For non-string input and for strings shorter
    than 5 characters (error handling) a uniform distribution over ``num_topics``
    topics is returned. Otherwise the list has one entry per topic of the trained
    model, so position i always refers to topic i.

    document   -- raw text of the post
    n_words    -- unused; kept so existing calls keep working
    num_topics -- number of entries in the uniform fallback
    '''
    # If the document is not a string, or is fewer than 5 characters, say it
    # could be from any topic: uniform likelihood across all topics
    if not isinstance(document, str) or len(document) < 5:
        return [1.0 / num_topics] * num_topics

    # Preprocess exactly like LDA.transform does (lowercase, tokenize, drop stop
    # words, stem, drop stems shorter than 4 characters) so the tokens match the
    # dictionary the model was trained on.
    words = [word for word in lda.tokenizer.tokenize(document.lower())
             if word not in lda.en_stop]
    stems = [stem for stem in map(lda.stemmer.stem, words) if len(stem) > 3]
    bow = lda.dictionary.doc2bow(stems)

    # Start from zeros and fill in by topic id. The model omits topics whose
    # probability falls below its threshold, so just collecting the returned
    # probabilities gives a shorter list whose positions no longer match topic ids.
    probs = [0.0] * lda.model.num_topics
    for topic_id, prob in lda.model.get_document_topics(bow, minimum_probability=0.0):
        probs[topic_id] = prob

    return probs

# Construct data frame of probabilities corresponding to each topic:
# one row per document, one integer-labelled column per position of the list returned by fetch_doc_topics.
foo = pd.DataFrame([fetch_doc_topics(doc) for doc in brand_data['Text']]) 
# Add a column identifying the most likely topic for each document
# (the label of the column holding the row's largest value).
foo['Main_topic'] = foo.idxmax(axis=1)

In [142]:
# Move the original index into a regular column so the rows are numbered 0..n-1
# like `foo`, then glue the topic columns on column-wise.
brand_data = pd.concat([brand_data.reset_index(), foo], axis=1)

In [143]:
# One long string per topic id: the text of every post whose main topic is that id, joined with ';'
raw_text = [
    brand_data['Text'][brand_data['Main_topic'].eq(topic_id)].str.cat(sep=';')
    for topic_id in range(num_topics)
]

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [145]:
# Initialize TfIdf vectorizer, with encoding to handle strange values.
# We're going to look at the most 'distinctive' 2-, 3-, or 4-word phrases across each of the raw strings with all the text from each topic
# So, for example, we can discover that the two word phrase 'Selena Gomez' is very distinctive and unique to only one of the topics.
# max_features limits the vocabulary to num_topics*3 phrases in total.
tf = TfidfVectorizer(encoding='ISO-8859-1', ngram_range=(2,4), max_features = num_topics*3)

In [182]:
# Returns the most 'distinctive' 2-, 3-, or 4- word phrase for each topic, and sets as column name
# Fit once; each row of the matrix holds the TF-IDF weights of one topic's combined text.
tfidf_matrix = tf.fit_transform(raw_text).toarray()
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use the new method when it exists and fall back to the old one otherwise.
_get_feature_names = getattr(tf, 'get_feature_names_out', None) or tf.get_feature_names
# Look the phrase list up a single time instead of once per topic
feature_names = list(_get_feature_names())
# For every topic pick the phrase with the highest weight
topic_names = [feature_names[np.argmax(row)] for row in tfidf_matrix]

In [195]:
# Assign generated topic names:
# map each numeric topic column label to its generated phrase, then relabel those columns.
topic_dict = {topic_id: topic_names[topic_id] for topic_id in range(num_topics)}

brand_data = brand_data.rename(columns=topic_dict)

In [198]:
# Save the brand's posts, including the topic probability columns and 'Main_topic', to CSV
brand_data.to_csv('assets/TeenVogueFB_wTopics.csv')

In [196]:
#### To be carried on in separate 'modeling' script:

In [178]:
from xgboost import XGBClassifier

In [186]:
# Show the 30th and 70th percentiles of the 'impact' column
# NOTE(review): presumably these are candidate cut points for the target variable below - confirm
brand_data['impact'].quantile([.3,.7])

# Build target variable here.

0.3    0.098330
0.7    0.447361
Name: impact, dtype: float64