Importing the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.graph_objs as go
import plotly as py
import calendar
import re
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
import string
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import preprocess_string
from gensim.models.coherencemodel import CoherenceModel
from collections import OrderedDict
import pyLDAvis.gensim

Importing Data

In [None]:
import pandas as pd
# Load the tweet dataset from the CSV file; the file is decoded as latin1.
tweets_df = pd.read_csv(
    "../input/elon-musks-tweets/data_elonmusk.csv",
    encoding="latin1",
)

Overview of Dataset

In [None]:
# Preview the first rows of the freshly loaded frame.
tweets_df.head()

We only need the "Tweet" column for our topic modelling analysis. However, I will keep the "Time" column as well, so that tweet counts can be shown by month.

In [None]:
# Discard the columns that are not used in the rest of the notebook.
tweets_df = tweets_df.drop(columns=['row ID', 'Retweet from', 'User'])

In [None]:
# Check the frame after dropping the unused columns.
tweets_df.head()

Converting "Time" column to datetime column

In [None]:
# Parse the "Time" column into pandas datetimes (no explicit format is given here).
tweets_df['Time']=pd.to_datetime(tweets_df['Time'])

In [None]:
# Parse "Time" with an explicit format.
# The previous format string used %y (two-digit year) and %s (not a day-of-month
# directive); the intended directives for "YYYY-MM-DD HH:MM:SS" are %Y and %d.
tweets_df['Time']=pd.to_datetime(tweets_df['Time'],format='%Y-%m-%d %H:%M:%S')

Showing date column as Year-Month combination

In [None]:
# Reduce each timestamp to a monthly period (year-month), discarding day and time.
tweets_df['Time']=pd.to_datetime(tweets_df['Time']).dt.to_period('M')

In the next few lines, I convert the date-time into a year and a month name, in case we need them for visualization purposes.

In [None]:
# Convert the monthly periods to their string form (e.g. "YYYY-MM") so they can be split below.
# A Series is assigned directly; wrapping it in a one-column DataFrame was unnecessary.
tweets_df['Time']=tweets_df['Time'].astype(str)

In [None]:
# The month number is the second "-"-separated field of the "Time" string.
tweets_df['Month'] = tweets_df['Time'].str.split('-').str[1].astype(int)

In [None]:
# The year is the first "-"-separated field of the "Time" string (kept as text).
tweets_df['Year'] = tweets_df['Time'].str.split('-').str[0]

In [None]:
# Replace each month number with its name by indexing calendar.month_name.
tweets_df['Month'] = tweets_df['Month'].apply(
    lambda month_number: calendar.month_name[month_number]
)

In [None]:
# Build the combined label by concatenating year and month name with no separator.
tweets_df['Year_month'] = tweets_df['Year'].astype(str).str.cat(tweets_df['Month'].astype(str))

In [None]:
# The intermediate date columns are no longer needed once "Year_month" exists.
tweets_df = tweets_df.drop(columns=['Month', 'Year', 'Time'])

In [None]:
# Check the frame after the date handling: "Year_month" replaces "Time".
tweets_df.head()

Let's start by cleaning our "Tweet" column.
We will try to remove "@" user handles, emoticons, RT markers and hyperlinks.

In [None]:
# Regex patterns for the pieces stripped from each tweet.
# Raw strings are used so that "\w" and "\." reach the regex engine untouched
# (a plain '...' literal containing "\w" is an invalid string escape).
HANDLE = r'@\w+'                  # "@" followed by word characters (a user handle)
LINK = r'https?://t\.co/\w+'      # t.co short links, both http and https



In [None]:
def basic_clean(text):
    """Return `text` with every HANDLE match and then every LINK match removed.

    Each match is replaced by the empty string; surrounding whitespace is
    left as it is.
    """
    for pattern in (HANDLE, LINK):
        text = re.sub(pattern, "", text)
    return text

In [None]:
# Apply basic_clean to every tweet and store the result in a new column.
tweets_df['clean_tweet'] = tweets_df['Tweet'].apply(basic_clean)

In [None]:
# Compare "Tweet" with the new "clean_tweet" column.
tweets_df.head()

Splitting the "clean_tweet" column into tokens and applying basic text preprocessing, e.g. stopword removal, lemmatization and spelling correction

In [None]:
# English stopword list from NLTK, used below to filter tokens.
stops=stopwords.words('english')

In [None]:
# Lowercase each tweet, split it on whitespace and drop the stopwords.
# The text is lowercased BEFORE the membership test: comparing the raw token
# let capitalised stopwords such as "The" or "I" slip through.
tweets_df['clean_tweet']=tweets_df['clean_tweet'].apply(
    lambda x:" ".join(token for token in x.lower().split() if token not in stops)
)

In [None]:
# Lemmatize every whitespace-separated token with TextBlob's Word and re-join with single spaces.
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].apply(
    lambda tweet: " ".join([Word(token).lemmatize() for token in tweet.split()])
)

In [None]:
# Extra tokens to strip from the tweets; combined with punctuation below.
retweet=['RT','rt','http']

In [None]:
# Tokens to remove: every single punctuation character plus the retweet markers.
# list(string.punctuation) yields one entry per character; the previous
# [string.punctuation] was a one-element list holding the whole punctuation
# string, so no punctuation token could ever match it.
punc=list(string.punctuation)+retweet

In [None]:
# Keep only the tokens that are not listed in `punc`, re-joined with single spaces.
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].apply(
    lambda tweet: " ".join(filter(lambda token: token not in punc, tweet.split()))
)

Let's check our tweet column after basic cleaning

In [None]:
# Inspect "clean_tweet" after stopword removal, lemmatization and token filtering.
tweets_df.head()

Let's implement the LDA model from Gensim

In [None]:
# Run gensim's preprocess_string on every cleaned tweet; the result is one token list per tweet.
tweets = [preprocess_string(document) for document in tweets_df['clean_tweet']]

In [None]:
# Show only the first few token lists; echoing the whole corpus floods the cell output.
tweets[:5]

In [None]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenised tweets.
dictionary=corpora.Dictionary(tweets)

In [None]:
# Bag-of-words representation: one doc2bow result per tokenised tweet.
corpus = [dictionary.doc2bow(tokens) for tokens in tweets]

In [None]:
# Fit the LDA model used in the rest of the notebook.
NUM_TOPICS=5
# random_state pins the seed so that "Restart & Run All" reproduces the same
# topics; without it every run yields a different model.
lda=LdaModel(corpus,num_topics=NUM_TOPICS,id2word=dictionary,passes=15,random_state=42)

In [None]:
# Show the topics of the fitted model, six words per topic.
lda.print_topics(num_words=6)

In order to decide on the right number of topics, we need a way to assess how well the model's topics were chosen. Gensim provides a CoherenceModel class that we can use for this.

In [None]:
def calculate_coherence_score(tweets,dictionary,lda):
    """Return the 'c_v' coherence of `lda`.

    The score is computed by gensim's CoherenceModel from the tokenised
    texts `tweets` and the token mapping `dictionary`.
    """
    return CoherenceModel(
        model=lda,
        texts=tweets,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()

In [None]:
def get_coherence_values(start,stop,random_state=42):
    """Yield the c_v coherence of an LDA model for each topic count in range(start, stop).

    A fresh LdaModel is trained per topic count on the notebook-level `corpus`
    and `dictionary`, and scored against the notebook-level `tweets`.

    Parameters
    ----------
    start, stop : int
        Bounds of the topic counts to try; `stop` is exclusive.
    random_state : int, optional
        Seed passed to every LdaModel so the coherence curve is reproducible
        between runs.

    Yields
    ------
    The coherence value returned by calculate_coherence_score, one per topic count.
    """
    for num_topics in range(start,stop):
        print(f'\nCalculating coherence for {num_topics} topics')
        # Only 2 passes here (the main model uses 15), as written in the original sweep.
        lda=LdaModel(corpus,num_topics=num_topics,id2word=dictionary,passes=2,random_state=random_state)
        coherence=calculate_coherence_score(tweets,dictionary,lda)
        yield coherence

In [None]:
# Range of topic counts to evaluate: min_topics up to, but not including, max_topics.
min_topics,max_topics=10,30
# The generator is consumed into a list, so every model is trained and scored here.
coherence_score=list(get_coherence_values(min_topics,max_topics))

Plotting the coherence score against the number of topics to identify the best number of topics

In [None]:
# The x-axis is derived from the same bounds used for the sweep, so the plot
# cannot drift out of sync with coherence_score if min_topics/max_topics change.
x=list(range(min_topics,max_topics))
plt.plot(x,coherence_score)
plt.xlabel('Number of topics')
plt.ylabel('Coherence score (c_v)')
plt.title('Coherence score by number of topics');

The highest coherence is reached with 27 topics.

In [None]:
# For every topic of `lda`, collect its 27 highest-weighted (word, weight) pairs.
# NOTE(review): 27 is passed as the number of words per topic, and `lda` is the
# NUM_TOPICS-topic model fitted earlier, not a model refitted with 27 topics — confirm this is intended.
data = {
    topic_id: OrderedDict(lda.show_topic(topic_id, topn=27))
    for topic_id in range(NUM_TOPICS)
}

In [None]:
# Topic/word weight table: missing words get weight 0, then transpose so each row is a topic.
data = pd.DataFrame(data).fillna(0).T

In [None]:
# A bare expression uses the notebook's rich table display; print() falls back to plain text.
data

In [None]:
# Turn on inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the interactive visualisation for the fitted model; mds='tsne' selects
# t-SNE for the topic layout.
panel = pyLDAvis.gensim.prepare(lda,corpus,dictionary, mds='tsne')
# Last expression of the cell, so the panel is displayed.
panel