# Topic Modelling

In this notebook, I will try to classify their tweets under different topics to analyse the different headings the politcians talk about

In [21]:
import pandas as pd
import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

import gensim
from gensim import corpora
import pickle

import spacy
from spacy.lang.en import English

import random


### Read the clean data

In [12]:
trump = pd.read_csv("data/DonaldTrumpClean.csv")
obama = pd.read_csv("data/BarackObamaClean.csv")
clinton = pd.read_csv("data/HillaryClintonClean.csv")

### Start Modelling

In [24]:
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')        
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [25]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

### Get stopwords

In [31]:
stop_words = set(stopwords.words('english'))
stop_words.update(['@realdonaldtrump', '@realdonaldtrump.', '.@realdonaldtrump', 'trump', 'donald', 'Donald trump'])

### Prepare text for LDA

In [32]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

### Append all text in 1 list

In [33]:
trumpTweetList = list(trump.text)
text_data = []
for line in trumpTweetList:
    tokens = prepare_text_for_lda(line)    
    text_data.append(tokens)

### Create dictionary. Store corpus using pickle

In [34]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

### Generate LDA model

In [39]:
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [38]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.038*"great" + 0.014*"america" + 0.010*"country" + 0.008*"world" + 0.008*"thank"')
(1, '0.008*"people" + 0.007*"global" + 0.007*"warming" + 0.005*"change" + 0.005*"entrepreneur"')
(2, '0.035*"president" + 0.016*"would" + 0.013*"obama" + 0.013*"think" + 0.009*"please"')


### Create a topic interpretation map using pyLDAvis

In [41]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

<div class="alert alert-success">
**Insights** <br/>
As evident from the term frequency in topic 3, the topic Trump talks about most, is `President Obama`. <br/>
The second most frequented topic (Topic 1) is about `Making America Great`, and `Global warming`. <br/>
The third most frequented topic (Topic 2) is about `Entrepreneuship`, `success`, and being `positive`. <br/>
</div>

Reference: https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21