In [None]:
"""
In this notebook we discuss the LDA (Latent Dirichlet Allocation) algorithm and 
how it can be used to identify topics within documents. 
In particular, we apply LDA for topic analysis 
to a YouTube spam-classification comments data set and 
find that LDA discovered topic 1 ("spam") and 
topic 2 ("legitimate") on its own, without being provided labels. 
We conclude this because comments that were over 90% topic 1 
seemed to be spam, and comments that were over 90% topic 2 
seemed to be legitimate. However, the 10 most common words 
in the two topics seem to overlap; this may be improved by applying 
TF-IDF, in which frequently appearing words receive a smaller weight.
"""

In [None]:
# For matrix operations
import numpy as np
# For data processing
import pandas as pd
# For text processing
import nltk 
# For regular expressions
import re
# For dividing numbers
#from __future__ import division

# Fix the seed of NumPy's global random generator so that the random
# topic allocation is identical on every run of the notebook
SEED = 2017
np.random.seed(SEED)

In [None]:
# Load text data (CSV file is expected in the current working directory)
comments = pd.read_csv("YoutubeCommentsSpam.csv")

# Let's take a look at the first few rows.
# print is called as a function so this cell runs under both Python 2 and
# Python 3, consistent with every other print in this notebook.
print(comments.head())

In [None]:
# Load tokenizer function
from nltk.tokenize import RegexpTokenizer

# A token is a maximal run of "word" characters (regular expression \w+)
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize the first observation.
# .iloc[0] selects the first row by position; the label-based lookup [1]
# used previously returned the second row, contradicting the message below.
tokenize_obs = tokenizer.tokenize(comments["commentText"].iloc[0])

# Example of tokenizing first observation
print('Tokenize first observation: \n%s' % tokenize_obs)

# Load list of common stop words
from stop_words import get_stop_words

# Create English stop words list (each entry converted with str)
eng_stop = [str(word) for word in get_stop_words('english')]

# Print a few stop words
print('Stop words in english: \n%s' % eng_stop[1:10])

In [None]:
# Import function for stemming text
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# Set of stop words: membership tests are O(1) instead of scanning the list
eng_stop_set = set(eng_stop)

# Cleaned text data: one list of stemmed words per comment.
# Every row of `comments` produces exactly one entry (possibly an empty list),
# so position i in text_data always corresponds to row i of `comments`.
# Previously empty comments were filtered out before a loop over
# range(len(comments)), which raised IndexError (and shifted the
# document <-> comment alignment) as soon as one row was dropped.
text_data = []

# Convert each comment into a list of words after stop-word removal and stemming
for raw_comment in comments["commentText"]:

    # Treat missing values (NaN) as empty text instead of failing on .lower()
    if pd.isnull(raw_comment):
        raw_comment = ''

    # Convert comment all to lower case
    raw_lower = raw_comment.lower()

    # Tokenize comment
    line_token = tokenizer.tokenize(raw_lower)

    # Only keep letters in comments
    clean_token = [re.sub(r'[^a-zA-Z]', '', word) for word in line_token]

    # Take out stop words and tokens that became empty after cleaning
    stop_token = [word for word in clean_token
                  if word != '' and word not in eng_stop_set]

    # Reduce each remaining word to its stem
    stem_token = [str(p_stemmer.stem(word)) for word in stop_token]

    # Store the stemmed words; the un-stemmed stop_token list was stored
    # before, so the stemming step had no effect on the data
    text_data.append(stem_token)

In [None]:
# Flatten the per-comment word lists into one long list of words
words_list = []
for comment_words in text_data:
    words_list.extend(comment_words)

# The vocabulary is the collection of distinct words in the corpus
vocab_total = set(words_list)

# Peek at a handful of vocabulary entries
vocab_sample = list(vocab_total)[1:7]
print('Few words from vocabulary list: \n%s' % vocab_sample)

# Report how many distinct words there are
vocab_size = len(vocab_total)
print('Number of unique words in data: \n%s' % vocab_size)

In [None]:
# Convert each comment into a vector by replacing the words by their unique ID.
# A word's ID is its position in list(vocab_total). The list is materialised
# once and turned into a word -> ID lookup table; rebuilding the list and
# scanning it with .index() for every single token (as done previously) gives
# the same IDs but costs O(vocabulary size) per token.
vocab_list = list(vocab_total)
word_to_id = dict((vocab_word, word_id)
                  for word_id, vocab_word in enumerate(vocab_list))

# One list of word IDs per cleaned comment, in the same order as text_data
text_ID = [[word_to_id[words] for words in comment_words]
           for comment_words in text_data]

# Let's check the first comment
print("The first comment (after processing) is: \n%s" % text_data[0])
print('First comment as a vector of word IDs is: \n%s' % text_ID[0])

In [None]:
# LDA hyperparameters and bookkeeping structures

# Dirichlet prior on the topic mixture of each document
alpha = 0.2

# Dirichlet prior on the word distribution of each topic
beta = 0.001

# Number of full passes over the text corpus
corpus_itter = 200

# Number of topics
K = 2

# Corpus dimensions: V = vocabulary size, D = number of documents
V = len(vocab_total)
D = len(text_ID)

# Three count structures are maintained during sampling:
#   word_topic_count : K x V matrix, times each word is assigned to each topic
#   topic_doc_assign : per document, the topic currently assigned to each word
#   doc_topic_count  : D x K matrix, words of each document assigned to each topic
word_topic_count = np.zeros(shape=(K, V))

topic_doc_assign = []
for doc_word_ids in text_ID:
    # One slot per word of the document, all starting at 0
    topic_doc_assign.append(np.zeros(len(doc_word_ids)))

doc_topic_count = np.zeros(shape=(D, K))

In [None]:
# Generate word-topic count matrix with randomly assigned topics

# Start from zero counts so that re-running this cell does not add a second
# round of counts on top of the previous one
word_topic_count.fill(0)

# Loop over documents
for doc in range(D):

    # Loop over words in given document
    for word in range(len(text_ID[doc])):

        # Step 1: Randomly assign a topic to each word in the document.
        # np.random.choice(K) returns a single integer in {0,...,K-1}; asking
        # for size 1 returned a 1-element array, and storing an array into a
        # scalar slot relies on array-to-scalar conversion that NumPy deprecates.
        word_topic = int(np.random.choice(K))
        topic_doc_assign[doc][word] = word_topic

        # ID of the word at this position
        word_doc_ID = text_ID[doc][word]

        # Increment word-topic count matrix
        word_topic_count[word_topic][word_doc_ID] += 1

# Print word-topic matrix
print('Word-topic count matrix with random topic assignment: \n%s' % word_topic_count)

In [None]:
# Fill the document-topic count matrix from the current topic assignments

# Visit every document (D of them)
for doc in range(D):

    # Topic assigned to each word of this document
    doc_assignments = topic_doc_assign[doc]

    # For every topic (K of them) count the words currently assigned to it
    for topic in range(K):
        doc_topic_count[doc, topic] = np.count_nonzero(doc_assignments == topic)

# Show the first few rows of the document-topic matrix
doc_topic_preview = doc_topic_count[:5]
print('Subset of document-topic count matrix with random topic assignment: \n%s' % doc_topic_preview)

In [None]:
# Main part of LDA algorithm: collapsed Gibbs sampling (takes a few minutes to run)
#
# For every word of every document the current topic is removed from the count
# matrices, a new topic is drawn from the conditional distribution given all
# other assignments, and the counts are updated with the NEW topic.

# Number of words currently assigned to each topic. Kept in sync with
# word_topic_count below so the K x V row sums are not recomputed per word.
topic_word_totals = np.sum(word_topic_count, axis=1)

# Run through text corpus multiple times
for itter in range(corpus_itter):

    # Loop over all documents
    for doc in range(D):

        # Loop over words in given document
        for word in range(len(text_ID[doc])):

            # Topic currently assigned to this word
            init_topic_assign = int(topic_doc_assign[doc][word])

            # ID of the word at this position
            word_id = text_ID[doc][word]

            # Before finding posterior probabilities, remove current word from counts
            doc_topic_count[doc][init_topic_assign] -= 1
            word_topic_count[init_topic_assign][word_id] -= 1
            topic_word_totals[init_topic_assign] -= 1

            # Denominator in first term (numb. of words in doc + numb. topics * alpha)
            denom1 = np.sum(doc_topic_count[doc]) + K*alpha

            # Denominator in second term (numb. of words in topic + numb. words in vocab * beta)
            denom2 = topic_word_totals + V*beta

            # Numerators, number of words assigned to a topic + prior dirichlet param
            numerator1 = doc_topic_count[doc] + alpha
            numerator2 = word_topic_count[:, word_id] + beta

            # Conditional probability of each topic for this word, normalized to sum to 1
            prob_topics = (numerator1/denom1)*(numerator2/denom2)
            prob_topics = prob_topics/np.sum(prob_topics)

            # Draw the new topic with the probabilities found above.
            # The probabilities must be passed as the keyword p: the third
            # positional parameter of np.random.choice is `replace`, so passing
            # them positionally made every draw uniform.
            update_topic_assign = int(np.random.choice(K, p=prob_topics))
            topic_doc_assign[doc][word] = update_topic_assign

            # Add the word back into the counts under its NEW topic. Adding it
            # back under the old topic (as before) left the counts unchanged,
            # so the sampler never learned anything.
            doc_topic_count[doc][update_topic_assign] += 1
            word_topic_count[update_topic_assign][word_id] += 1
            topic_word_totals[update_topic_assign] += 1

In [None]:
# Posterior mean of the document-topic distribution:
# smooth the counts with alpha, then normalize every row to sum to 1
theta = doc_topic_count + alpha
theta_row_sum = theta.sum(axis=1)
theta = theta / theta_row_sum[:, np.newaxis]

# Show the topic mixture of the first few documents
theta_preview = theta[:3]
print('Subset of document-topic mixture matrix: \n%s' % theta_preview)

# Spam comment: topic mixture of document 10 followed by its raw text
example_doc = 10
print('Comment is 95 perc. topic 1, and 5 perc. topic 2: \n%s' % theta[example_doc])
print('Comment looks like its spam: \n%s' % comments["commentText"][example_doc])

In [None]:
# Spam comment: topic mixture of document 4 followed by its raw text
example_doc = 4
example_mixture = theta[example_doc]
example_text = comments["commentText"][example_doc]
print('Comment is 92 perc. topic 1, and 8 perc. topic 2: \n%s' % example_mixture)
print('Comment seems to be spam: \n%s' % example_text)

In [None]:
# Non-spam comment: topic mixture of document 11 followed by its raw text
example_doc = 11
example_mixture = theta[example_doc]
example_text = comments["commentText"][example_doc]
print('Comment is 8 perc. topic 1 and 92 perc. topic 2: \n%s' % example_mixture)
print('Comment seems ligitimate: \n%s' % example_text)

In [None]:
# Non-spam comment: topic mixture of document 18 followed by its raw text
example_doc = 18
example_mixture = theta[example_doc]
example_text = comments["commentText"][example_doc]
print('Comment is is 5 perc. topic 1 and 95 perc. topic 2: \n%s' % example_mixture)
print('Comment seems to be about video, non-spam: \n%s' % example_text)

In [None]:
# Posterior mean of the topic-word distribution:
# smooth the counts with beta, then normalize every topic (row) to sum to 1
phi = word_topic_count + beta
phi_row_sum = phi.sum(axis=1)
phi = phi / phi_row_sum[:, np.newaxis]

# Show the topic-word mixture
print('Topic-word mixture matrix: \n%s' % phi)

In [None]:
# Explore the top words that make up each topic

# Materialise the vocabulary once: position i holds the word with ID i.
# Calling list(vocab_total) for every (topic, word) pair, as done previously,
# rebuilt the whole list V times per topic for the same result.
vocab_words = list(vocab_total)

# One {word: probability} dictionary per topic, in topic order
list_dict_topics = []

# Loop over topics
for topic in range(K):

    # Pair every vocabulary word with its probability under this topic
    mydict = dict(zip(vocab_words, phi[topic]))

    # Create list of dictionaries
    list_dict_topics.append(mydict)

In [None]:
# Topic 1: words ranked by probability, highest first.
# The 10 most probable words are skipped because they mostly overlap with
# topic 2; ranks 11 to 30 are shown.
topic_1_ranked = [(prob, vocab_word) for vocab_word, prob in list_dict_topics[0].items()]
topic_1_ranked.sort(reverse=True)
topic_1_ranked[10:30]

In [None]:
# Topic 2: words ranked by probability, highest first.
# The 10 most probable words are skipped because they mostly overlap with
# topic 1; ranks 11 to 30 are shown.
topic_2_ranked = [(prob, vocab_word) for vocab_word, prob in list_dict_topics[1].items()]
topic_2_ranked.sort(reverse=True)
topic_2_ranked[10:30]