<a href="https://colab.research.google.com/github/somilasthana/MachineLearningSkills/blob/master/Scratch_Linear_Dirichlet_Allocation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
"""
Generative model

Theory:
LDA is based on the idea that words often have strong semantic relationships to 
certain topics, and so topics in a given document will consist of a group of similar words.

LDA requires us to pick the number of topics for it to discover, 
LDA assumes that a document is a mixture of a set of latent (unknown) topics, 
and each topic is another mixture of words.

Outputs the words in the text corpus (a set of documents) that frequently occur 
together within the topic.

LDA assumes the documents are generated through some statistical process.
Given a document d is a text corpus D, d is generated as

1. Number of words in document d, represented by N_d, is drawn from poisson distribution.
2. The mixture of topics in document d, represented by theta_d, is drawn from dirichlet distribution.
3. Assign each word w_i a topic z_i, in a way so that it is consistent with the document-topic distribution in 2)
4. Now that we know the topic z_i of each word w_i. from the topic-word distribution

LDA assumes a document is a mixture of topics, where the topics are drawn from 
the topic-document distribution, and topics consist of words, where the words 
are drawn from the topic-word distribution.


Interested in finding the distribution of topics for each document, and the 
distribution of words for each topic.


"""

In [0]:
import numpy as np
import nltk
import pandas as pd
import re
from __future__ import division
np.random.seed(42)

In [0]:
!wget https://raw.githubusercontent.com/hammadshaikhha/Math-of-Machine-Learning-Course-by-Siraj/master/Latent%20Dirichlet%20Allocation/YoutubeCommentsSpam.csv

In [84]:
comments = pd.read_csv("/content/YoutubeCommentsSpam.csv")
comments.head()

Unnamed: 0,commentText
0,+447935454150 lovely girl talk to me xxx
1,I always end up coming back to this song<br />
2,"my sister just received over 6,500 new <a rel=..."
3,Cool
4,Hello I am from Palastine


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [85]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
eng_stop = [str(word) for word in stopwords.words('english')]
eng_stop[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
from nltk.stem.porter import PorterStemmer

p_stemmer = PorterStemmer()

# Text data to itterate over
text_data = [line for line in comments["commentText"] if line != '']

for line in range(len(text_data)):
  raw_lower = text_data[line].lower()
  line_token = tokenizer.tokenize(raw_lower)
  clean_token = [re.sub(r'[^a-zA-Z]','', word) for word in line_token]
  stop_token = [word for word in clean_token if not word in eng_stop if word != '']
  stem_token = [str(p_stemmer.stem(word)) for word in stop_token]
  text_data[line] = stem_token

In [116]:
text_data[0:2]

[['love', 'girl', 'talk', 'xxx'],
 ['alway', 'end', 'come', 'back', 'song', 'br']]

In [117]:
words_list = [words for sublist in text_data for words in sublist]
vocab_total = set(word_list)
list(vocab_total)[1:7]

['officialpsi', 'straight', 'enjoy', 'turn', 'georg', 'gjvinpuemo']

In [0]:
# Convert each comment into a vector by replacing the words by their unique ID

text_ID = []

for line in range(len(text_data)):
  comment_vector = [ list(vocab_total).index(word)  for word in text_data[line]]
  text_ID.append(comment_vector)

In [119]:
text_data[1], text_ID[1]

(['alway', 'end', 'come', 'back', 'song', 'br'],
 [1355, 197, 640, 414, 3249, 1990])

In [0]:
# Initialize hyperparameters in LDA

# Dirichlet parameters
# Alpha is the parameter for the prior topic distribution within documents
alpha = 0.2

# Beta is the parameter for the prior topic distribution within documents
beta = 0.001

# Text corpus itterations
corpus_itter = 200

# Number of topics
K = 2

# Vocabulary size
V = len(vocab_total)

# Number of Documents
D = len(text_ID)


In [0]:
# For practical implementation, we will generate the following three count matrices:
# 1) Word-Topic count matrix, 2) Topic-Document assignment matrix, 3) Document-Topic count matrix


word_topic_count = np.zeros((K,V))
topic_doc_assign = [np.zeros(len(sublist)) for sublist in text_ID]
doc_topic_count = np.zeros((D, K))

In [0]:
# Generate word-topic count matrix with randomly assigned topics

for d in range(D):
  for w in range(len(text_ID[d])):
    topic_doc_assign[d][w] = np.random.choice(K, 1)
    # topic
    k = int(topic_doc_assign[d][w])
    w_id = text_ID[d][w]
    
    word_topic_count[k][w_id] += 1

In [123]:
word_topic_count

array([[5., 0., 1., ..., 0., 1., 0.],
       [2., 1., 1., ..., 1., 1., 1.]])

In [0]:
# Find out document-topic count matrix based on above topics

for d in range(D):
  k_vector = topic_doc_assign[d]
  for k in range(K):
    doc_topic_count[d][k] = sum(k_vector == k)
    

In [125]:
doc_topic_count[0:5]

array([[ 0.,  4.],
       [ 4.,  2.],
       [ 8., 14.],
       [ 0.,  1.],
       [ 1.,  1.]])

In [126]:
#np.array([doc_topic_count[d][k] for k in range(K)] ) + alpha

np.array([word_topic_count[k][197] + beta for k in range(K)])

array([4.001, 5.001])

In [0]:
for itr in range(corpus_itter):
  for d in range(D):
    for w in range(len(text_ID[d])):
      
      prev_topic_assign = int(topic_doc_assign[d][w])
      
      w_id = text_ID[d][w]
            
      doc_topic_count[d][prev_topic_assign] -= 1
      word_topic_count[prev_topic_assign][w_id] -= 1
      
      # Denominator in first term (Numb. of words in doc + numb. topics * alpha)
      denom1 = sum(doc_topic_count[d]) + K*alpha # doc_topic_count[0]->[ 3.,  1.] denom1=4.4
      
      # Denominator in second term (Numb. of words in topic + numb. words in vocab * beta)
      denom2 = np.sum(word_topic_count, axis = 1) + V*beta # output : array([9312.393, 9217.393])
      
      # Numerators, number of words assigned to a topic + prior dirichlet param
      numerator1 = np.array([doc_topic_count[d][k]+alpha for k in range(K)])
      
      numerator2 = np.array([word_topic_count[k][w_id] + beta for k in range(K)])
      
      prob_topics = (numerator1/denom1)*(numerator2/denom2)
      prob_topics = prob_topics/sum(prob_topics)
      
      update_topic_assign = np.random.choice(K,1,list(prob_topics))
      topic_doc_assign[d][w] = update_topic_assign
      
      # Add in current word back into count matrixes
      
      doc_topic_count[d][update_topic_assign[0]] += 1
      word_topic_count[update_topic_assign[0]][w_id] += 1

In [0]:
theta = (doc_topic_count + alpha)
theta_sum = np.sum(theta, axis=1) # row wise sum
theta = theta/theta_sum.reshape((D,1))

In [62]:
# Print document-topic mixture
print('Subset of document-topic mixture matrix: \n%s' % theta[3:6])

# Spam comment
print( 'Comment {0},\n topic distribution {1}'.format(" ".join(text_data[12]), theta[12]))

Subset of document-topic mixture matrix: 
[[0.14285714 0.85714286]
 [0.5        0.5       ]
 [0.2972973  0.7027027 ]]
Comment alright ladi like song check john rage smoke hot rapper come game better eminem lyric hotter hear song channel,
 topic distribution [0.5257732 0.4742268]


In [66]:
print( 'Comment {0},\n topic distribution {1}'.format(" ".join(text_data[15]), theta[15]))

Comment href http www facebook com group http www facebook com group,
 topic distribution [0.71929825 0.28070175]


In [0]:
# Compute posterior mean of word-topic distribution within documents
phi = (word_topic_count + beta)
phi_row_sum = np.sum(phi, axis = 1)
phi = phi/phi_row_sum.reshape((K,1))

In [0]:
# Explore the top words that make up each topic 

# Initialize list of dictionaries

list_dict_topics = []

# Loop over topics
for topic in range(K):
    
    # Initialize (vocab,prob) dictionary
    mydict = {}
    
    # Loop over vocabular
    for word in range(V):
        
        # Create dictionary {(vocab,prob)}
        mydict[list(vocab_total)[word]] = phi[topic][word]
        
    # Create list of dictionaries
    list_dict_topics.append(mydict)

In [69]:
sorted([(value,key) for (key,value) in list_dict_topics[0].items()])[::-1][10:30]

[(0.011168020937260702, 'pleas'),
 (0.010738485800588529, 'channel'),
 (0.0086981939013957, 'music'),
 (0.007302204707211133, 'make'),
 (0.0069800533547070024, 'view'),
 (0.006872669570538959, 'www'),
 (0.006765285786370914, 'new'),
 (0.006657902002202871, 'get'),
 (0.006657902002202871, 'amp'),
 (0.006443134433866783, 'guy'),
 (0.006013599297194609, 'thank'),
 (0.005369296592186348, 'watch'),
 (0.005047145239682217, 'kati'),
 (0.004939761455514173, 'comment'),
 (0.004939761455514173, 'best'),
 (0.004832377671346129, 'quot'),
 (0.004724993887178086, 'money'),
 (0.004617610103010043, 'peopl'),
 (0.004188074966337868, 'year'),
 (0.004188074966337868, 'know')]

In [70]:
sorted([(value,key) for (key,value) in list_dict_topics[1].items()])[::-1][10:30]

[(0.012368030743616986, 'love'),
 (0.010849163098503015, 'channel'),
 (0.008679352176911629, 'thank'),
 (0.00835388053867292, 'guy'),
 (0.00824538999259335, 'music'),
 (0.008136899446513782, 'get'),
 (0.008028408900434212, 'amp'),
 (0.007594446716115935, 'make'),
 (0.007485956170036365, 'view'),
 (0.007051993985718088, 'www'),
 (0.00661803180139981, 'comment'),
 (0.0064010507092406715, 'watch'),
 (0.0064010507092406715, 'new'),
 (0.005316145248444978, 'hey'),
 (0.005099164156285839, 'money'),
 (0.0048821830641267005, 'quot'),
 (0.0048821830641267005, 'peopl'),
 (0.004448220879808422, 'go'),
 (0.004231239787649284, 'facebook'),
 (0.004122749241569715, 'see')]