# Topic Modeling of Twitter Support Discussions with LDA

The dataset contains almost 3 million tweets (2,811,774).

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# Any results you write to the current directory are saved as output.

## Load and filter the dataset

In [None]:
# load dataset

# Malformed CSV rows are skipped instead of aborting the load.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which pandas
# deprecated in 1.3 and removed in 2.0.
documents = pd.read_csv('/kaggle/input/customer-support-on-twitter/twcs/twcs.csv', on_bad_lines='skip')
print(documents[:5])

In [None]:
# Keep only the tweet text. The explicit copy makes `documents` an independent
# frame, so adding a column below cannot trigger pandas' SettingWithCopyWarning.
documents = documents[['text']].copy()
# Store the row label as a regular column so rows can be selected by it later.
documents['index'] = documents.index
print(len(documents))
print(documents[:5])

## Preprocessing

For preprocessing I used:
* Gensim's simple_preprocess for word tokenization
* Gensim's English stopword list for stopword removal
* NLTK's WordNetLemmatizer

plus some dataset-specific processing, such as removing links and mentions.

In [None]:
# preprocessing imports

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import re
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(2018)

# preprocessing functions

def lemmatize(text):
    """Return the WordNet lemma of ``text``, looked up as a verb (``pos='v'``)."""
    verb_lemmatizer = WordNetLemmatizer()
    lemma = verb_lemmatizer.lemmatize(text, pos='v')
    return lemma

def preprocess(text):
    """Turn one raw tweet into a list of lemmatized tokens.

    URLs and @-mentions are removed first. The remaining text is tokenized
    with gensim's ``simple_preprocess``; stopwords and tokens of two
    characters or fewer are dropped, and every surviving token is lemmatized.

    Args:
        text: The raw tweet text. Anything that is not a string (for example
            a missing value read from the CSV) is treated as an empty tweet.

    Returns:
        A list of lemmatized tokens. The list is empty for non-string input,
        so the result can always be iterated by downstream steps.
    """
    if not isinstance(text, str):
        # Return an empty document rather than None so that building the
        # dictionary and the bag-of-words corpus does not fail on this row.
        return []
    text = re.sub(r'http://\S+|https://\S+', '', text)  # remove urls
    text = re.sub(r'@\S+', '', text)  # remove mentions
    return [
        lemmatize(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2
    ]

# Show one tweet (the row whose 'index' column equals 99) before and after preprocessing.
sample_rows = documents[documents['index'] == 99]
doc_sample = sample_rows.values[0][0]  # first column of the first matching row
print('original document: ', doc_sample)
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
# preprocess text

# Apply the preprocessing function to every tweet and preview the first ten results.
tweet_texts = documents['text']
processed_docs = tweet_texts.map(preprocess)
processed_docs.head(10)

## Model Preparation
Create a dictionary of all the docs for word-id mapping, then filter out words that appear in fewer than 5 documents or in more than 70% of them, keeping at most the 100,000 most frequent of the remaining words.

In [None]:
# create dictionary

# Build the id <-> token mapping from the preprocessed tweets.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Preview the first 11 (id, token) entries before filtering.
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

print('\n\n')

# Prune the vocabulary, then preview the first 11 entries again.
dictionary.filter_extremes(no_below=5, no_above=0.7, keep_n=100000)
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

Create a bag-of-words representation that maps each token to its frequency within every document. We will use it to build the TF*IDF weighting that feeds the LDA model.

In [None]:
# create bow

# Convert every preprocessed tweet into its bag-of-words form and preview the first ten.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
bow_corpus[:10]

In [None]:
# describe bow

# Print each (token id, count) entry of the first document next to the word it stands for.
bow_doc_first = bow_corpus[0]
for token_id, occurrences in bow_doc_first:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

Create the TF*IDF model over the BOW.

In [None]:
# create tf*idf model

from gensim import corpora, models
from pprint import pprint

# Fit the TF*IDF model on the bag-of-words corpus and apply it to that same corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first weighted document as a sanity check.
doc = next(iter(corpus_tfidf), None)
if doc is not None:
    pprint(doc)

## Run and test the model
Finally, we train our LDA model on the TF*IDF-weighted corpus.

In [None]:
# run lda with tf*idf

from gensim.models import CoherenceModel

# Train the LDA model on the TF*IDF-weighted corpus.
lda_settings = dict(num_topics=8, id2word=dictionary, passes=2, workers=4)
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, **lda_settings)

# Report the value returned by log_perplexity on the training corpus.
perplexity_value = lda_model_tfidf.log_perplexity(corpus_tfidf)
print('\nPerplexity: ', perplexity_value)

# Report the c_v coherence of the learned topics against the preprocessed texts.
coherence_model_lda = CoherenceModel(
    model=lda_model_tfidf,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_value = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_value)

# List every topic together with its word description.
for topic_id, topic_terms in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_terms))

In [None]:
# check lda with tf*idf accuracy
print(processed_docs[0])

print('\n\nResults of lda with tf*idf')
# The model was trained on TF*IDF-weighted vectors, so the query document is
# weighted the same way before inference; raw counts would not match the
# representation the model was fitted on.
first_doc_tfidf = tfidf[bow_corpus[0]]
# Rank the inferred topics from the highest score to the lowest.
ranked_topics = sorted(lda_model_tfidf[first_doc_tfidf], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

In [None]:
# test with unseen document

unseen_document = 'I need help with my account'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
# Apply the same TF*IDF weighting that the training corpus received, so the
# query vector matches the representation the model was fitted on.
tfidf_vector = tfidf[bow_vector]
# Rank the inferred topics from the highest score to the lowest.
ranked_topics = sorted(lda_model_tfidf[tfidf_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

## Model visualization

In [None]:
# wordclouds

# Fetch the topics in unformatted form; below, topics[i][1] is turned into a
# dict and used as the word -> frequency input of the word cloud.
topics = lda_model_tfidf.show_topics(formatted=False)

from matplotlib import pyplot as plt
from wordcloud import WordCloud
import matplotlib.colors as mcolors

# Colour values from matplotlib's TABLEAU_COLORS table; one is picked per topic via cols[i].
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# A single WordCloud object is reused for every topic.
# NOTE: `color_func` looks up the global `i` when it is called, not when the
# lambda is created, so it depends on the plotting loop below having bound `i`
# to the index of the topic currently being drawn.
cloud = WordCloud(stopwords=STOPWORDS,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# 2 x 4 grid of axes: one per topic index 0-7.
fig, axes = plt.subplots(2, 4, figsize=(80,80), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # The plt.gca() calls below are meant to draw on `ax`.
    # NOTE(review): this presumably relies on add_subplot making `ax` the current axes; confirm.
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=100))
    plt.gca().axis('off')


# Final layout adjustments before showing the figure.
plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
# visualization of the model

%matplotlib inline
import pyLDAvis
import pyLDAvis.gensim
# Switch pyLDAvis to notebook output, then build and show the visualization of the model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
pyLDAvis.display(vis)