# Topic Modeling with LDA

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gensim
from gensim import corpora, models
import pandas as pd
import pyLDAvis.gensim
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords 
import string
from gensim.models import CoherenceModel
import math
from nltk.corpus import wordnet
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

Let's analyze a dataset of the most upvoted Kaggle datasets. Using each dataset's description, we will try to work out which topics every document consists of. Latent Dirichlet Allocation (LDA) will be used to discover the topics.

In [None]:
# Location of the "voted Kaggle dataset" CSV inside the Kaggle input directory.
DATASET_CSV = '/kaggle/input/voted-kaggle-dataset/voted-kaggle-dataset.csv'

df = pd.read_csv(DATASET_CSV)
# Preview the first rows as the cell output.
df.head()

Out of all the columns we will use only the last one: Description.

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) to spot missing values.
df.info()

Below we remove all empty values.

In [None]:
# Keep only the non-null entries of the Description column; the resulting
# Series retains the original row labels.
descriptions = df['Description'].dropna()
descriptions.head()

<h3>Data preparation</h3>
Now we want to clean the data. All punctuation is removed, as well as numbers. Words are lowercased and lemmatized, and stop words are dropped.

In [None]:
def get_wordnet_pos(treebank_tag):
    """Map a POS tag (as produced by ``nltk.pos_tag``) to a WordNet POS constant.

    Tags beginning with ``J``, ``V`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB`` and ``wordnet.ADV`` respectively; every other tag
    (including an empty one) falls back to ``wordnet.NOUN``.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag on the NOUN fallback.
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

In [None]:
# Clean every description into a list of lemmas (``documents``) and the same
# lemmas joined by spaces (``texts``).
documents = []
texts = []
lemmatizer = WordNetLemmatizer()
# English stop words plus a few words added by hand for this corpus.
stop_words = set(stopwords.words('english')).union({'data', 'dataset', 'model'})
# Build the punctuation-stripping table once instead of once per description.
strip_punctuation = str.maketrans('', '', string.punctuation)

# Iterate over the values directly: ``Series.iteritems()`` was removed in
# pandas 2.0, and the index labels were never used here.
for description in descriptions:
    document_str = str(description).lower().translate(strip_punctuation)
    pos_tagged = pos_tag(word_tokenize(document_str))
    # Lemmatize each token with the WordNet POS derived from its tag.
    lemma_tokens = [
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in pos_tagged
    ]
    # Filtering runs on the lemmas, so the stop-word check sees the
    # lemmatized form; purely numeric tokens are dropped as well.
    document = [
        w for w in lemma_tokens
        if w not in stop_words and not w.isdigit()
    ]
    documents.append(document)
    texts.append(' '.join(document))

# Show the first cleaned document as a sanity check.
print(documents[0])

We now have a list of token lists, one per document. We use it to build a dictionary, and after that a document-term matrix is built.

In [None]:
# Build the gensim dictionary from the tokenised documents.
dictionary = corpora.Dictionary(documents)

# Convert every document to its bag-of-words representation via the dictionary.
document_term = list(map(dictionary.doc2bow, documents))

<h3>Creating a model</h3>
After tuning the number of topics using the coherence score, I decided to use 20 topics for the LDA model. Given the number of topics, the document-term matrix and the dictionary, we create the LDA model using gensim.

In [None]:
# Train a 20-topic LDA model on the bag-of-words corpus.
# The seed is fixed so that topic numbering and top words are reproducible
# between runs; without it, any reference to a specific topic id may not match
# the output of a fresh run.
ldamodel = models.ldamodel.LdaModel(
    document_term,
    num_topics=20,
    id2word=dictionary,
    random_state=42,
)


Let's now have a brief look at the 5 most popular words for each topic.

In [None]:
# Collect the five most significant words of each topic; leaving the result as
# the final expression makes the notebook render it as the cell output.
top_words_per_topic = ldamodel.print_topics(num_words=5)
top_words_per_topic

<h3>Visualization</h3>
Using pyLDAvis we can explore the interactive plot below. By hovering or using the buttons we can see how words are distributed across all our topics.

In [None]:
# Make pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive topic visualisation from the model, corpus and
# dictionary; the trailing expression makes the notebook display it.
lda_visualization = pyLDAvis.gensim.prepare(ldamodel, document_term, dictionary)
lda_visualization

For example, for topic 2 the most used word is 'restaurant', and for topic 3 there are 'game', 'player', 'team' and so on.

<h3>Calculating metrics</h3>
After looking at the visualizations, we may want some numeric values that indicate the quality of our topic model, for example to compare it with other models. Here we calculate the perplexity and the coherence score.

In [None]:
# ``LdaModel.log_perplexity`` returns the per-word likelihood bound, not the
# natural log of the perplexity. gensim defines perplexity as 2 ** (-bound),
# so exponentiating the bound with ``math.exp`` gives a meaningless value.
log_perplexity = ldamodel.log_perplexity(document_term)
print('Perplexity:', 2 ** (-log_perplexity))

In [None]:
# Score the model with the 'c_v' coherence measure, computed from the
# tokenised documents and the dictionary.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence score:', coherence_lda)