<a href="https://colab.research.google.com/github/poffertje/TextMining/blob/master/code/topic_modelling/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Topic Modelling with LDA

## Mounting the Drive (Google Colab)

In [None]:
# from google.colab import drive

# drive.mount("/content/gdrive")

In [None]:
# %pip install pyLDAvis

## Importing The Packages

In [1]:
from pathlib import Path
from pprint import pprint

import warnings
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import tqdm
import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess


from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lmps\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
pd.options.display.max_rows = 15
np.set_printoptions(precision=4, suppress=True)
# Filter out the irrelevant warnings
warnings.filterwarnings("ignore")
# Plotting
%matplotlib inline
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
# sns.set(style='whitegrid', palette='muted', font_scale=1.2)
FIG_SIZE = (12, 9)

## Resolving Paths

### Google Colab

In [None]:
# CUR_DIR = (
#     Path().resolve()
# )  # this should provide you with the folder in which this notebook is placed
# # use this for colab
# PATH_TO_DATASETS = Path.joinpath(CUR_DIR, "gdrive/Shareddrives/Minecraft/Datasets")
# print(PATH_TO_DATASETS)
# print("Does path exist? ->", Path.exists(PATH_TO_DATASETS))

# # same for colab and local repository
# PATH_TO_YELP = Path.joinpath(PATH_TO_DATASETS, "sentiment_sample_50_50.csv")
# print(PATH_TO_YELP)
# print("Does path exist? ->", Path.exists(PATH_TO_YELP))

### Local Repository

In [3]:
# Working directory of the kernel; this should be the folder in which this
# notebook is placed.
CUR_DIR = Path().resolve()

# use this for local repository: "datasets" sits two levels above CUR_DIR
PATH_TO_DATASETS = CUR_DIR.parents[1] / "datasets"
print(PATH_TO_DATASETS)
print("Does path exist? ->", PATH_TO_DATASETS.exists())

# same for colab and local repository
PATH_TO_YELP = PATH_TO_DATASETS / "sentiment_sample_50_50.csv"
print(PATH_TO_YELP)
print("Does path exist? ->", PATH_TO_YELP.exists())

C:\Users\lmps\github\TextMining2\datasets
Does path exist? -> True
C:\Users\lmps\github\TextMining2\datasets\sentiment_sample_50_50.csv
Does path exist? -> True


## Data preprocessing

In [None]:
# Read the review CSV located at PATH_TO_YELP into a DataFrame; the
# preprocessing below reads its "review" column.
yelp_100k = pd.read_csv(PATH_TO_YELP)

### Stemming

In [None]:
# English Snowball stemmer instance.
# NOTE(review): the functions in this cell never call it -- lemmatize_stemming
# only lemmatizes -- confirm whether stemming was meant to be applied.
stemmer = nltk.stem.SnowballStemmer("english")


def lemmatize_stemming(text):
    """Return the WordNet lemma of a single token, treating it as a verb.

    Despite the name, no stemming is applied: the token is only passed
    through ``WordNetLemmatizer.lemmatize`` with the verb part-of-speech tag.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemma = lemmatizer.lemmatize(text, pos="v")
    return verb_lemma


def preprocess(text):
    """Turn one raw review into a list of cleaned, lemmatized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``; tokens that
    are gensim stop words or have three characters or fewer are dropped, and
    each surviving token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str or bytes
        Raw document. Any other value (e.g. the ``NaN``/``None`` that pandas
        uses for a missing cell) is treated as an empty document.

    Returns
    -------
    list of str
        Processed tokens in their original order; ``[]`` for missing input.
    """
    # A missing review would otherwise make the tokenizer raise and abort the
    # whole ``Series.map`` call; an empty document is harmless downstream.
    if not isinstance(text, (str, bytes)):
        return []

    # Look the stop-word set up once instead of once per token.
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) > 3
    ]

In [None]:
# Apply preprocess to every entry of the "review" column; the result holds
# one token list per review.
processed_docs = yelp_100k["review"].map(preprocess)

In [None]:
# Build the vocabulary (integer id <-> token) from the tokenized reviews.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Sanity check: show the first 11 (id, token) pairs only.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position == 10:
        break

Filter out tokens that appear in fewer than 15 documents or in more than 50% of all documents (`no_above` is a fraction of the total corpus size, not an absolute count). Of the tokens that remain, keep only the 100,000 most frequent.

In [None]:
# Prune rare and very common tokens from `dictionary` (thresholds described
# in the text above). The return value is not used; later cells keep working
# with the same `dictionary` object.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

For each document we create a bag-of-words representation: a list of `(token_id, count)` pairs recording which dictionary tokens occur in the document and how many times each one appears.

In [None]:
# Convert every tokenized review to its bag-of-words form using the
# (already filtered) dictionary.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# bow_corpus1 = [dictionary.doc2bow(doc) for doc in proc]

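Create a TF-IDF model from the bag-of-words corpus with `gensim.models.TfidfModel`, apply it to the corpus, and print the first re-weighted document as a check.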

In [None]:
# Fit the TF-IDF weighting on the bag-of-words corpus and apply it.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Peek at the first re-weighted document (nothing is printed for an empty corpus).
first_doc = next(iter(corpus_tfidf), None)
if first_doc is not None:
    pprint(first_doc)

## Hyperparameter tuning

In [None]:
# supporting function
def compute_coherence_values(
    corpus,
    dictionary,
    k,
    texts=None,
    passes=2,
    random_state=100,
):
    """Train an LDA model with ``k`` topics and return its ``c_v`` coherence.

    Parameters
    ----------
    corpus :
        Corpus passed to ``gensim.models.LdaMulticore`` for training.
    dictionary :
        Dictionary used both as ``id2word`` for the model and for the
        coherence computation.
    k : int
        Number of topics.
    texts : optional
        Tokenized documents handed to ``CoherenceModel``. When omitted, the
        notebook-level ``processed_docs`` is used, which is what this function
        always did before the parameter existed.
    passes : int, optional
        Number of training passes (default 2).
    random_state : int, optional
        Seed forwarded to the LDA model (default 100).

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.
    """
    if texts is None:
        # Backward-compatible fallback to the tokenized reviews built earlier
        # in the notebook.
        texts = processed_docs

    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=random_state,
        passes=passes,
    )

    coherence_model_lda = CoherenceModel(
        model=lda_model, texts=texts, dictionary=dictionary, coherence="c_v"
    )

    return coherence_model_lda.get_coherence()

Grid Search for best params

In [None]:
grid = {"Validation_Set": {}}

# Candidate topic counts: 10, 20, 30
min_topics = 10
max_topics = 31
step_size = 10
topics_range = range(min_topics, max_topics, step_size)

# Corpora to compare, paired with the label stored in the results table
corpus_sets = [corpus_tfidf, bow_corpus]
corpus_title = ["TF-IDF", "Bag of Words"]
model_results = {"Corpus_Type": [], "Topics": [], "Coherence": []}

# Can take a long time to run: one LDA model is trained per
# (corpus, number of topics) combination.
for corpus_name, corpus_set in zip(corpus_title, corpus_sets):
    for k in topics_range:
        # coherence score for this corpus / topic-count combination
        cv = compute_coherence_values(corpus=corpus_set, dictionary=dictionary, k=k)
        # record one result row
        model_results["Corpus_Type"].append(corpus_name)
        model_results["Topics"].append(k)
        model_results["Coherence"].append(cv)

coher = pd.DataFrame(model_results)

In [None]:
# coher.to_csv("lda_tuning_results.csv", index=False)

## Load best models

In [None]:
# LDA with bag of words
# random_state seeds the model so a re-run starts from the same state; 100 is
# the seed already used by compute_coherence_values during tuning.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=14,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=100,
)

# lda_model.save("lda_model")

In [None]:
# List every topic of the bag-of-words model together with its top words.
for topic_id, topic_words in lda_model.print_topics(-1):
    print(f"Topic: {topic_id} \nWords: {topic_words}")

In [None]:
# Running LDA using TF-IDF (best model)
# random_state seeds the model so a re-run starts from the same state; 100 is
# the seed already used by compute_coherence_values during tuning.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=12,
    id2word=dictionary,
    passes=2,
    workers=4,
    random_state=100,
)

# lda_model_tfidf.save("lda_model_tfidf")

In [None]:
# List every topic of the TF-IDF model together with its top words.
for topic_id, topic_words in lda_model_tfidf.print_topics(-1):
    print(f"Topic: {topic_id} Word: {topic_words}")

## **Evaluation**

In [None]:
# coherence plot: coherence score against number of topics, one line per
# corpus representation in the tuning results table `coher`.
fig, ax = plt.subplots(figsize=(10, 5), dpi=80)

coherence_bow = coher[coher["Corpus_Type"] != "TF-IDF"]
coherence_tfidf = coher[coher["Corpus_Type"] == "TF-IDF"]
topic_n = coherence_bow["Topics"]  # kept for any later cell that reads it

# Each series is drawn against its own topic counts, so the plot stays
# correct even if the two corpora were evaluated on different grids.
ax.plot(coherence_bow["Topics"], coherence_bow["Coherence"], label="Bag of Words")
ax.plot(coherence_tfidf["Topics"], coherence_tfidf["Coherence"], label="TF-IDF")

ax.set_title("LDA coherence by number of topics")
ax.set_xlabel("Num Topics")
ax.set_ylabel("Coherence score")
ax.legend()
plt.show()

In [None]:
# Switch pyLDAvis to notebook rendering, then build the interactive view of
# the bag-of-words LDA model and show it as the cell output.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
pyLDAvis.display(vis)