In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the Required Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import nltk
import gensim
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from gensim import corpora
from nltk.tokenize import TweetTokenizer
from wordcloud import WordCloud
from string import punctuation
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.colors as mcolors
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Load the Pfizer vaccine tweet dataset from the Kaggle input directory.
tweets = pd.read_csv('/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv')

# Shared text-cleaning resources: a tweet-aware tokenizer, a WordNet
# lemmatizer, and the NLTK English stop words extended with URL debris.
tweet_tok = TweetTokenizer()
lemm = WordNetLemmatizer()
stop_nltk = stopwords.words("english")
stop_updated = [*stop_nltk, "https", 't.co', '...']

def clean_text(text):
    """Normalise one tweet into a space-separated string of lemmatised tokens.

    The tweet is lower-cased and tokenised with the module-level ``tweet_tok``.
    For every token, a leading ``https://t.co/...`` link, ``@mention`` or
    ``#hashtag`` match is stripped and all digit runs are removed. A token is
    kept only if it is longer than two characters and not in ``stop_updated``;
    kept tokens are lemmatised with the module-level ``lemm``.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    # Missing cells arrive as float NaN / None; .lower() would raise on them.
    if not isinstance(text, str):
        return ''

    cleaned = []
    for tok in tweet_tok.tokenize(text.lower()):
        tok = re.sub(r'^https://t.co/[\w]+', '', tok)
        tok = re.sub(r'^@[\w]+', '', tok)
        tok = re.sub(r'^#[\w]+', '', tok)
        tok = re.sub(r'[0-9]+', '', tok)
        # The length filter also drops single-character punctuation tokens and
        # the empty strings left behind by the substitutions above, so no
        # separate punctuation lookup is needed.
        if len(tok) > 2 and tok not in stop_updated:
            cleaned.append(lemm.lemmatize(tok))
    return ' '.join(cleaned)

# Store the cleaned version of every tweet next to the raw text.
tweets['Clean_Tweets'] = tweets['text'].map(clean_text)

## Topic Modeling

In [None]:
# Split every cleaned tweet back into its token list (one list per document).
clean_txt = list(map(str.split, tweets['Clean_Tweets']))

# Term dictionary of the corpus: maps each distinct token to an integer id.
dictionary = corpora.Dictionary(clean_txt)

# Document-term matrix: the bag-of-words representation of every document.
doc_term_matrix = list(map(dictionary.doc2bow, clean_txt))

# Train the LDA model on the document-term matrix; the fixed random_state
# keeps the learned topics reproducible between runs.
ldamodel = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=3,
    passes=50,
    random_state=42,
)
ldamodel.print_topics(num_words=20)

In [None]:
def get_lda_topics(model, num_topics, topn=10):
    """Tabulate the highest-weighted words of the first ``num_topics`` topics.

    Parameters
    ----------
    model
        Object exposing ``show_topic(topic_id, topn=...)`` that returns a
        sequence of ``(word, weight)`` pairs.
    num_topics : int
        Number of topics to include; topic ids ``0 .. num_topics - 1`` are read.
    topn : int, optional
        Number of words to list per topic (default 10).

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled ``'Topic # 01'``, ``'Topic # 02'``, ...
        (1-based), each holding that topic's words in the order returned by
        ``show_topic``.
    """
    word_dict = {}
    for topic_idx in range(num_topics):
        top_words = model.show_topic(topic_idx, topn=topn)
        # Keep only the word of every (word, weight) pair.
        word_dict['Topic # {:02d}'.format(topic_idx + 1)] = [word for word, _weight in top_words]
    return pd.DataFrame(word_dict)

# Top 10 words of each of the 3 topics (one column per topic, not per document)
get_lda_topics(ldamodel, 3)

## Visualizing Topic Clusters

In [None]:
# Render pyLDAvis output inline, then build the interactive topic view
# from the trained model, its corpus and its vocabulary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    topic_model=ldamodel,
    corpus=doc_term_matrix,
    dictionary=ldamodel.id2word,
)
vis

## Calculating Perplexity & Coherence

In [None]:
# Compute Perplexity
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself; the perplexity estimate is 2 ** (-bound), so report both values.
per_word_bound = ldamodel.log_perplexity(doc_term_matrix)
print('Per-word likelihood bound: ', per_word_bound)
print('Perplexity: ', np.exp2(-per_word_bound))

# Compute Coherence (c_v measure, evaluated on the tokenised documents)
coherence_model_lda = CoherenceModel(model=ldamodel, texts=clean_txt, dictionary=ldamodel.id2word, coherence='c_v')
# Silence NumPy "invalid value" floating-point warnings while scoring.
with np.errstate(invalid='ignore'):
    lda_score = coherence_model_lda.get_coherence()
print('Coherence Score: ', lda_score)

## Dominant Topic Analysis

In [None]:
def format_topics_sentences(ldamodel=None, corpus=doc_term_matrix, texts=clean_txt):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    ldamodel
        Trained topic model. It must support ``ldamodel[corpus]``, expose a
        ``per_word_topics`` attribute and a ``show_topic(topic_id)`` method
        returning ``(word, weight)`` pairs.
    corpus
        Bag-of-words corpus to score. The default is the module-level
        ``doc_term_matrix`` as it existed when this function was defined.
    texts
        One entry per document, appended as the last column. The default is
        the module-level ``clean_txt`` as it existed at definition time.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4
        decimals) and ``Topic_Keywords``, followed by one unnamed column
        holding ``texts``. Rows are in corpus order.

    Raises
    ------
    TypeError
        If ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained ldamodel, got None")

    keywords_by_topic = {}  # topic id -> "word, word, ..." (computed once per topic)
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the model yields a tuple whose first
        # element is the document-topic list; otherwise it yields that list.
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if not row:
            # Keep a placeholder so rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # Dominant topic = highest contribution (first one wins on ties).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once instead of appending row by row
    # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Dominant topic, its contribution and its keywords for every document.
df_topic_sents_keywords = format_topics_sentences(ldamodel, doc_term_matrix, clean_txt)

# Turn the row index into a document-number column and give every column
# a readable name.
dominant_topic_columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = dominant_topic_columns
df_dominant_topic.head(10)

In [None]:
# Number of documents whose dominant topic is each topic id.
# Naming the column explicitly keeps this working on pandas >= 2.0, where
# value_counts() labels its result 'count' instead of the source column name.
a = df_dominant_topic['Dominant_Topic'].value_counts().to_frame(name='Dominant_Topic')

# seaborn >= 0.12 only accepts x and y as keyword arguments.
sns.barplot(x=a.index, y=a['Dominant_Topic'], palette='inferno')
plt.xlabel("Dominant Topic")
plt.ylabel("Number of Tweets")
plt.title("Dominant Topic across Dataset")
plt.show()

## Frequency Distribution of Word Counts in Documents

In [None]:
# Length of every document's 'Text' entry (its token list, i.e. its word count).
doc_lens = [len(d) for d in df_dominant_topic.Text]

# Plot
fig, ax = plt.subplots(figsize=(14, 7))
ax.hist(doc_lens, bins=50, color='navy')

# Summary statistics, positioned in axes-fraction coordinates so they stay
# inside the plot whatever the data limits are (data coordinates such as
# x=750 fall far outside an x-range of 0-50).
stats_lines = [
    "Mean   : " + str(round(np.mean(doc_lens))),
    "Median : " + str(round(np.median(doc_lens))),
    "Stdev   : " + str(round(np.std(doc_lens))),
    "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))),
    "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))),
]
for line_no, stats_line in enumerate(stats_lines):
    ax.text(0.80, 0.90 - 0.05 * line_no, stats_line, transform=ax.transAxes)

ax.set(xlim=(0, 50), ylabel='Number of Documents', xlabel='Document Word Count')
ax.tick_params(size=16)
ax.set_xticks(np.linspace(0, 50, 6))  # ticks at 0, 10, ..., 50
ax.set_title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.show()

## Top words in each Topic Cluster

In [None]:
# One flat colour per topic, taken from matplotlib's Tableau palette.
cols = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

# (topic_id, [(word, weight), ...]) for every topic returned by the model.
topics = ldamodel.show_topics(formatted=False)

# One panel per topic instead of a hard-coded count.
fig, axes = plt.subplots(1, len(topics), figsize=(15, 15), sharex=True, sharey=True)

for i, ax in enumerate(np.atleast_1d(axes).flatten()):
    topic_id, word_weights = topics[i]

    # The colour is bound as a default argument so the cloud does not depend
    # on the value of the loop variable at render time. An explicit
    # color_func takes precedence over any colormap, so none is passed.
    cloud = WordCloud(
        background_color='white',
        width=2500,
        height=1800,
        max_words=20,
        color_func=lambda *args, _topic_color=cols[i % len(cols)], **kwargs: _topic_color,
        prefer_horizontal=1.0,
    )
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)

    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))
    ax.axis('off')

fig.subplots_adjust(wspace=0, hspace=0)
fig.tight_layout()
plt.show()

## Topic-wise Sentiment Analysis


In [None]:
# Sentiment analysis with VADER, used to draw further inference per topic.
analyser = SentimentIntensityAnalyzer()


def get_vader_sentiment(sent):
    """Return the 'compound' entry of the analyser's polarity scores for ``sent``."""
    scores = analyser.polarity_scores(sent)
    return scores['compound']

# NOTE(review): scores are computed on the cleaned text, from which stop words
# and punctuation have already been removed; scoring tweets['text'] instead
# may give different results - confirm which input is intended.
tweets['sent_vader'] = tweets['Clean_Tweets'].apply(get_vader_sentiment)

# Label every tweet from its compound score in one vectorised pass:
# >= 0.05 is positive, <= -0.05 is negative, everything else is neutral.
sent_pred = np.select(
    [tweets['sent_vader'] >= 0.05, tweets['sent_vader'] <= -0.05],
    ['positive', 'negative'],
    default='neutral',
).tolist()

tweets['sent_pred'] = sent_pred

# Put the sentiment columns next to the dominant-topic table; both frames are
# joined on their row index.
merge1 = pd.concat([df_dominant_topic, tweets[['sent_pred', 'sent_vader']]], axis=1)
m1 = pd.DataFrame(merge1.groupby(['Dominant_Topic', 'sent_pred'])['Text'].count())

# Tweet counts per (dominant topic, sentiment label).
cross_tab = pd.crosstab(index=merge1['Dominant_Topic'], columns=merge1['sent_pred'])

# Keep the Axes returned by pandas instead of re-fetching it with plt.subplot(111).
ax = cross_tab.plot(kind='bar', figsize=(10, 7), colormap="viridis")
ax.tick_params(axis='x', rotation=0)
ax.set_title("Sentiments in Dominant Topics")
ax.legend(loc="upper center", bbox_to_anchor=(0.5, 1.00), ncol=4, fancybox=True, shadow=True)
plt.show()

## Conclusion

* Analysing the Topic Cluster, 
  - Cluster-1 has a positive outlook with terms such as 'grateful', 'good', 'thanks' & 'received' 
  - Cluster-2 has a concerned outlook with terms such as 'emergency', 'injection', 'sore' & 'health'
  - Cluster-3 has a negative outlook with terms such as 'ban', 'red', 'death', 'mutation', 'protect' & 'please'
  
* Ranked by number of Tweets, the clusters are Cluster-1, Cluster-2 & Cluster-3 (descending order)

* The percentage of positive-sentiment Tweets is highest in Cluster-1, which supports the positive outlook of that topic

* In Cluster-2, neutral-sentiment Tweets are the most common

* Cluster-3 has the highest count of negative-sentiment Tweets of the three topic clusters, with 207 Tweets