In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('vader_lexicon')
from nltk.corpus import sentiwordnet as swn
%matplotlib inline
import seaborn as sns
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Load the Pfizer vaccine tweets; the literal string 'NULL' is read as a missing value.
tweets_csv_path = '/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv'
data = pd.read_csv(tweets_csv_path, na_values='NULL')
# Show the last five rows as a quick sanity check of the load.
data.tail(5)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# The `text` value of the last row, shown in full as an example tweet.
data['text'].iloc[-1]

In [None]:
# Per-column dtype and non-null count summary of the frame.
data.info()

In [None]:
# Report how many tweet ids are duplicated and how many distinct users,
# user descriptions and tweet texts the data set contains.
uniqueness_report = [
    ('No Of Duplicated Id: {}', data['id'].duplicated().sum()),
    ('No Of unique Users: {}', len(data['user_name'].unique())),
    ('No Of Unique Descriptions: {}', len(data['user_description'].unique())),
    ('No Of Unique Tweets: {}', len(data['text'].unique())),
]
for template, value in uniqueness_report:
    print(template.format(str(value)))


      
      

The output above shows that there is a unique Id for each tweet. However, there are users with overlapping user_name values, or users who have more than one tweet in the dataset. We will sample the duplicated users to gain insight into this.

We will work with multiple tweets from the same user(s) under the assumption(s) that:
1. Their view(s) about the vaccine do not change with time.
2. For corporations with multiple tweets, we assume their positions in the tweets remain consistent and do not vary with individual interests at different times.

### FEATURE EXPLORATION AND INSIGHTS INTO THE DATA

In [None]:
# Range of account-creation dates (`user_created`) across all tweet authors.
user_created_index = pd.DatetimeIndex(data.user_created)
earliest_account = min(user_created_index)
latest_account = max(user_created_index)
print('Earliest User Acct Date: {}'.format(str(earliest_account)))
print('-' * 25)
print('Latest User Acct Date: {}'.format(str(latest_account)))

In [None]:
# Range of tweet timestamps (`date`) covered by the data set.
tweet_date_index = pd.DatetimeIndex(data.date)
earliest_tweet = min(tweet_date_index)
latest_tweet = max(tweet_date_index)
print('Earliest Tweet Date: {}'.format(str(earliest_tweet)))
print('-' * 25)
print('Latest Tweet Date: {}'.format(str(latest_tweet)))

In [None]:
# Heatmap of the pairwise correlations between the numerical variables.
print('Fig 1.0')
# Restrict to numeric/boolean columns explicitly: recent pandas versions raise
# when DataFrame.corr() meets text columns instead of silently dropping them.
numeric_data = data.select_dtypes(include=['number', 'bool'])
corrmat = numeric_data.corr()
plt.figure(figsize=(10,10))
plt.title('Heatmap Showing Correlations Of Features')
# Trailing ';' suppresses the Axes repr in the cell output.
sns.heatmap(corrmat,square=True,annot=True);

The heatmap in Fig 1.0 shows a strong relationship between retweets and favourites (which is expected): tweets that are retweeted often also tend to be favourited often.

#### ANALYZING HASHTAGS

In [None]:
# Collect every hashtag from the non-null `hashtags` entries, lower-cased.
# NOTE: the result is stored in `hash` because later cells read that name;
# be aware that it shadows the built-in hash() for the rest of the notebook.
hashtag_entries=data.hashtags[data.hashtags.notnull()].values
hash=[]
for entry in hashtag_entries:
    # One entry can hold several hashtags; the regex pulls out each word
    # (optionally followed by one of '$', '@' or '+').
    for tag in re.findall(r'\w+[\$@+]?',entry):
        hash.append(tag.lower())
# This counts hashtags (not tweets), so label it accordingly.
print('Number Of Hashtags: {}'.format(str(len(hash))))
hash[-10:]

In [None]:
# Wrap the hashtag list in an nltk.Text so NLTK's text-exploration helpers can be used on it.
# Note: this rebinds `hash` from a plain list to an nltk.Text object.
hash=nltk.Text(hash)
# Report hashtags that frequently occur next to each other (collocations).
hash.collocations()

So we have a list of all the hashtags in the tweets, in lowercase. Let's take a look at it next.

In [None]:
# Number of distinct hashtags (`hash` holds hashtags, not tweets, so label it accordingly).
print('Number of Unique Hashtags: {}'.format(str(len(set(hash)))))
print('-'*25)
print('Fig 1.1')
# Cumulative frequency plot of the 20 most common hashtags.
freq=nltk.FreqDist(hash)
freq.plot(20,cumulative=True)

Fig 1.1 is a cumulative frequency plot of the most common hashtags. The PfizerBiontech hashtag accounts for the majority of the hashtags used. The first fifteen most common hashtags are concerned with covid19 and its vaccine. We will use some regular expressions to clean and merge overlapping hashtags.

### SENTIMENT ANALYSIS / OPINION MINING

The absence of label(s) for the dataset makes it practically impossible to use supervised learning algorithms for studying sentiments and classifying tweets. We will use unsupervised learning methods to analyse our tweets for opinions and patterns. One of the ways to implement sentiment analysis using unsupervised learning is the use of a LEXICON.

#### LEXICON BASED MODELS

A lexicon is a well-curated dictionary of words. In the case of opinion mining it is a dictionary that contains words associated with positive and negative sentiments, Part-of-Speech tags (POS tags), polarity scores, and subjectivity and objectivity scores. There are numerous lexicons available, but for this analysis we will be using just three of them; the idea here is to compare and contrast the results of each lexicon. They are, namely:
1. VADER Sentiment Analysis (best suited for tweets).
2. AFINN Lexicon
3. SentiWordNet Lexicon (from WordNet in the NLTK library).

In [None]:
# Cleaning up the tweets: split each tweet on whitespace and lower-case every token.
tweets_sents = data.text.values
tweets_words = [
    [token.lower() for token in re.split(r'\s+', tweet)]
    for tweet in tweets_sents
]
# The next cell reads the tokenised tweets through the name `b`, so keep that alias.
b = tweets_words
c = []
tweets_words[:3]

In [None]:
# Clean the lower-cased, tokenised tweets produced by the previous cell.
# Steps, per tweet:
#   1. drop links and tokens aimed at someone/something (starting with '@'),
#   2. drop English stopwords,
#   3. keep only alphabetic words and hashtags,
#   4. lemmatize what is left.
# Lemmatization runs last on purpose: it then sees punctuation-free words, and
# stopword matching is not defeated by lemmas such as 'was' -> 'wa'.
# (Previously the lemmatized tokens were assigned to a misspelled name and
# silently discarded, so downstream percentages may shift slightly on re-run.)
# NOTE: this cell rebinds `tweets_words`; re-run the previous cell first if you
# need to execute it again.
link_or_mention = re.compile(r'^(https?|@)')   # also catches plain http:// links
word_or_hashtag = re.compile(r'#?[a-zA-Z]+')
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)                   # set membership instead of list scans
lemmatizer = nltk.WordNetLemmatizer()           # build once, reuse for every word

cleaned_words = []
for tokens in tweets_words:
    kept = [
        token for token in tokens
        if not link_or_mention.search(token) and token not in stopword_set
    ]
    # Re-join and re-extract so punctuation, digits and emoji are stripped
    # while an optional leading '#' is preserved.
    words = word_or_hashtag.findall(' '.join(kept))
    cleaned_words.append([lemmatizer.lemmatize(word) for word in words])

# `tweets_words` holds the token lists, `tweets_sents` the same tweets as strings.
tweets_words = cleaned_words
tweets_sents = [' '.join(words) for words in tweets_words]
tweets_sents[:5]

#### VADER LEXICON SENTIMENT ANALYSIS

In [None]:
# Score every cleaned tweet with VADER.
# The analyzer is created once and reused: constructing it inside the loop
# rebuilt the same object for every single tweet without changing any score.
analyzer=SentimentIntensityAnalyzer()
vadler_scores=[analyzer.polarity_scores(sentence) for sentence in tweets_sents]
vadler_scores[:5]

In [None]:
# One row per tweet with the VADER scores, a sentiment label and the tweet tokens.
vadler_frame = pd.DataFrame(vadler_scores)
# Threshold at zero: negative compound -> 'Negative', zero -> 'Neutral', positive -> 'Positive'.
sentiments = [
    'Negative' if compound < 0 else ('Neutral' if compound == 0 else 'Positive')
    for compound in vadler_frame['compound']
]
vadler_frame['Sentiments'] = sentiments
vadler_frame['Tweets'] = tweets_words
vadler_frame.head(10)

In [None]:
# Function to plot Pie Charts of Sentiments
def piechart(lexicon,data,label):
    """Draw a pie chart of sentiment counts for one lexicon.

    Parameters
    ----------
    lexicon : str
        Lexicon name; used as the prefix of the chart title.
    data : sequence of numbers
        One value per wedge (e.g. the counts of each sentiment).
    label : sequence of str
        Wedge labels, in the same order as `data`.
    """
    plt.figure(figsize=(10,10))
    plt.title(lexicon+' Sentiments Pie Chart',size=15)
    # Size `explode` from the data so the chart also works when a sentiment
    # class is absent (a fixed 3-element list made plt.pie raise in that case).
    plt.pie(x=data,labels=label,explode=[0.05]*len(data),autopct='%.2f')
    plt.legend(loc='upper right')
    plt.show()


In [None]:
print('Fig 1.2')
# value_counts() orders classes by frequency, not in a fixed order, so take the
# labels from the same Series as the counts; a hard-coded label list can attach
# the wrong name to a wedge.
vadler_counts=vadler_frame.Sentiments.value_counts()
piechart('Vadler',vadler_counts.values,vadler_counts.index.tolist())

Fig 1.2 above is a representation of the sentiments from our VADER lexicon, expressed in percentages. The threshold for sentiments was chosen to be 0: any tweet with a compound score greater than 0 is assumed to be positive, a compound score less than zero is assumed to indicate negative polarity, and a compound score of exactly zero is taken as an indication of neutrality in the tweet. From Fig 1.2 we can see that positive sentiments and neutral sentiments are closely tied at 41%. Our VADER lexicon shows that there are fewer negative sentiments towards the #PfizerBiontech vaccine than positive sentiments. Tweets with negative sentiments make up about 16.08% of the whole.

In [None]:
# Peek at the tweets VADER labelled negative, most negative ('neg' score) first.
is_negative = vadler_frame['Sentiments'] == 'Negative'
vadler_negative_tweets = vadler_frame[is_negative].sort_values(by='neg', ascending=False)
vadler_negative_tweets.head(10)

In [None]:
# Analyze the tweets with negative VADER sentiment to see what people are talking about.
# Flatten the per-tweet token lists into one stream of words.
b = vadler_negative_tweets.Tweets.values
negative_words = [word for tweet_tokens in b for word in tweet_tokens]
# Wrap in an nltk.Text (used by the findall/collocation cells below) and
# plot the cumulative frequency of the 20 most common words.
neg_tweets = nltk.Text(negative_words)
freq = nltk.FreqDist(neg_tweets)
print('Fig 1.3')
freq.plot(20, cumulative=True)

Fig 1.3 is a cumulative frequency distribution of the top 20 words found in the negative vaccine tweets. The most common words are associated with Covid-19 and the PfizerBiontech vaccine. There are certain words of interest here, such as
Khamenei, Ban, Iran, Allergic and Emergency. We will look at these words.

In [None]:
# Display the most recently built FreqDist (the Fig 1.3 distribution when cells run in order).
freq

In [None]:
# Negative-tweet contexts of the word 'khamenei' (optionally '#'-prefixed).
# In nltk.Text.findall each <...> stands for one token, so this shows the
# two tokens before and the two tokens after every occurrence.
neg_tweets.findall(r'<.*><.*><#?khamenei><.*><.*>')

In [None]:
# Negative-tweet contexts of the word 'ban': two tokens before and two tokens after each occurrence.
neg_tweets.findall(r'<.*><.*><ban><.*><.*>')

In [None]:
# Negative-tweet contexts of the word 'emergency': two tokens before, then one
# token after plus an optional second one (the trailing '?' makes it optional).
neg_tweets.findall(r'<.*><.*><emergency><.*><.*>?')

In [None]:
# Negative-tweet contexts of the word 'allergic': two tokens before and one token after each occurrence.
neg_tweets.findall(r'<.*><.*><allergic><.*>')


In [None]:
# Word pairs that frequently occur together (collocations) in the negative-tweet word stream.
neg_tweets.collocations()

#### SUMMARY OF VADER LEXICON ANALYSIS

The VADER lexicon fails significantly at distinguishing positive tweets from neutral tweets. Our major takeaways from this lexicon are:
1. A compound score greater than 0 may be an indication of a positive sentiment towards the #PfizerBiontech vaccine.
2. 17% of our tweets had negative sentiments towards the vaccine. The plot in Fig 1.3 shows that the most frequently occurring words in these negative tweets were mostly directly related to the vaccine and covid-19 in general.
3. A major negative sentiment about the PfizerBiontech vaccine, according to the VADER lexicon, concerns the 'allergic' reaction to the vaccine and its adverse effects. This is the major concern of the people who tweeted negatively about the vaccine. This can be seen from the output of the snippet 'Tweets with Words Allergic'.
4. Emergency use or approval of the vaccine was also a major concern of the people who tweeted. 'Emergency' as a word is associated with negative sentiment in the VADER lexicon and has a negative polarity score. But a look at the output of the code snippet above, 'Tweets with words Emergency', shows a sense of elation, excitement and optimism about the approval for emergency use of the vaccine. 'Emergency use' was also a collocation, i.e. a frequently used term in the tweets.
5. Words such as Ban, Iran and Khamenei were among the most used words in tweets with negative sentiments. Analysis of tweets with words such as 'ban' showed phrases such as 'Khamenei ban', 'Ban on Import' and 'Iran Ban'.

### SENTIWORDNET LEXICON

In [None]:
# Part-of-speech tag every cleaned tweet; each tweet becomes a list of (word, tag) pairs.
tagged_tweets = [nltk.tag.pos_tag(words) for words in tweets_words]
tagged_tweets[:3]

In [None]:
# Implementing the SentiWordNet algorithm.
# Prefixes of the POS tags produced by nltk.tag.pos_tag, paired with the
# WordNet POS code passed to swn.senti_synsets.
TAG_PREFIX_TO_WN_POS=(('NN','n'),('JJ','a'),('V','v'),('R','r'))

def _first_senti_synset(word,tag):
    """Return the first SentiWordNet synset of `word` for the POS implied by `tag`.

    Returns None when the tag is not a noun/adjective/verb/adverb tag or when
    SentiWordNet has no entry for the word under that part of speech.
    """
    for prefix,wn_pos in TAG_PREFIX_TO_WN_POS:
        if tag.startswith(prefix):
            synsets=list(swn.senti_synsets(word,wn_pos))   # looked up once per word
            return synsets[0] if synsets else None
    return None

senti_rows=[];Total_Count=0
for tags in tagged_tweets:
    pos_score=neg_score=obj_score=Count=0
    for word,tag in tags:
        # Resolve the synset afresh for every word. Previously `ss_set` kept
        # its value from the last matched word, so unmatched words were scored
        # with a stale synset (or raised NameError if the very first word had
        # no match).
        ss_set=_first_senti_synset(word,tag)
        if ss_set is None:
            continue
        pos_score+=ss_set.pos_score()
        neg_score+=ss_set.neg_score()
        obj_score+=ss_set.obj_score()
        Count+=1;Total_Count+=1
    senti_rows.append({'Pos_Score':pos_score,'Neg_Score':neg_score,'Obj_Score':obj_score,'Net_Score':pos_score-neg_score,'Count':Count})
senti_frame=pd.DataFrame(senti_rows)
# Assign a sentiment to each tweet using 0 as the threshold on the net score.
senti_frame['Sentiments']=[
    'Negative' if net<0 else ('Neutral' if net==0 else 'Positive')
    for net in senti_frame['Net_Score']
]
senti_frame['Tweets']=tweets_words
senti_frame.head(5)

In [None]:
# Pie chart of the SentiWordNet sentiment distribution.
print('Fig 1.4')
# value_counts() orders classes by frequency, so take the labels from the same
# Series as the counts; a hard-coded label list can attach the wrong name to a wedge.
senti_counts=senti_frame.Sentiments.value_counts()
piechart('SentiWordNet',senti_counts.values,senti_counts.index.tolist())

Fig 1.4 above shows the distribution of sentiments for SentiWordNet. This composition depends on the threshold used; in this case the threshold is chosen to be zero. Positive sentiments account for more than 50% of the tweets, while negative sentiments account for the smallest share of all sentiments, at 22%.

In [None]:
# Positive tweets analysis: cumulative frequency of the 20 most common words
# in tweets SentiWordNet labelled positive.
print('Fig 1.5')
senti_positive_tweets = senti_frame.loc[senti_frame.Sentiments == 'Positive']
b = []
for tweet_tokens in senti_positive_tweets['Tweets']:
    b.extend(tweet_tokens)          # flatten the per-tweet token lists
pos_tweets = nltk.Text(b)
freq = nltk.FreqDist(pos_tweets)
freq.plot(20, cumulative=True)

In [None]:
# Negative tweets analysis: cumulative frequency of the 20 most common words
# in tweets SentiWordNet labelled negative.
# Note: this rebinds `neg_tweets`, which earlier held the VADER negative words.
print('Fig 1.6')
senti_negative_tweets = senti_frame.loc[senti_frame.Sentiments == 'Negative']
b = []
for tweet_tokens in senti_negative_tweets['Tweets']:
    b.extend(tweet_tokens)          # flatten the per-tweet token lists
neg_tweets = nltk.Text(b)
freq = nltk.FreqDist(neg_tweets)
freq.plot(20, cumulative=True)

Fig 1.6 is a cumulative plot of words associated with negative sentiments. Other than words such as Allergic and Emergency, which were found to occur commonly among the negative tweets in our VADER lexicon analysis, 'Side' is another word that occurs commonly among our tweets, and the code snippet below shows that it is associated with the collocation 'Side Effect'.

In [None]:
# Contexts of the word 'side' in the negative-tweet word stream: two tokens
# before, then one token after plus an optional second one.
neg_tweets.findall(r'<.*><.*><side><.*><.*>?')

#### Summary of Sentiwordnet Lexicon Analysis

Our major take from Sentiwordnet Lexicon are as follows:
1. Majority of the tweets (Over 50%) are associated with positive sentiments about the vaccine.
2. Tweets with Negative Sentiments account for 22% of our total tweets.
3. Side Effect of the Vaccine was a major concern of the Tweets with negative sentiments.

### CONCLUSION:


In conclusion, we would say that:
1. There are generally good and positive sentiments and opinions, but not much excitement, towards the #PfizerBiontech vaccine.
2. Emergency use of the vaccine was also a major topic. There is a sense of excitement and elation towards the emergency approval of its use.
3. Allergic reaction(s) and side effect(s) were among the major concerns in tweets with negative sentiments. This accounts for about 15% of the tweets. It was one of the major issues associated with negative sentiments according to our analysis.
4. Words such as 'Ban', 'Khamenei' and 'Iran' were associated with negative sentiments and were major issues of discussion in the tweets.