# VADER Sentiment Analysis on Pfizer COVID-19 Vaccine Tweets

## Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

%matplotlib inline

import re
from nltk import word_tokenize, corpus
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS

### Importing the dataset

In [None]:
# Load the tweets CSV into the `vaccine` DataFrame that every later cell reads.
vaccine = pd.read_csv('../input/pfizer-vaccine-tweets/vaccination_tweets.csv')

### About the Dataset

In [None]:
# Preview the first rows of the dataset.
vaccine.head()

In [None]:
# Print the column names, dtypes and non-null counts.
vaccine.info()

In [None]:
# Show summary statistics for the dataset's columns.
vaccine.describe()

### Imputing missing values

In [None]:
# Count the missing values in each column.
vaccine.isna().sum()

In [None]:
# Fill the missing text fields with explicit placeholder values.
# NOTE(review): "Twitter Web App" is presumably the most frequent source --
# confirm with vaccine['source'].value_counts().
#
# fillna is called on the DataFrame itself with a per-column mapping instead of
# `vaccine[col].fillna(..., inplace=True)`: the latter is chained assignment,
# which emits a FutureWarning on pandas 2.x and leaves `vaccine` unchanged
# under Copy-on-Write.
vaccine.fillna(
    {
        'user_location': "Unknown",
        'user_description': "No Description",
        'hashtags': "No Hashtags",
        'source': "Twitter Web App",
    },
    inplace=True,
)

## EDA

### I) Tweets from Top 20 Users with most followers 

In [None]:
plt.figure(figsize=(16,10))

# Rank the accounts once: sort by follower count, keep the first (largest) row
# per user name, then take the top 20.
top_users = (
    vaccine.sort_values('user_followers', ascending=False)
           .drop_duplicates(subset=['user_name'])
           .head(20)
)
followers_count = list(top_users.user_followers)
user_name = list(top_users.user_name)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=followers_count, y=user_name)
plt.title("Top 20 Users", fontsize=15)
plt.xlabel("Followers", fontsize=15)
plt.ylabel("User Name", fontsize=15)
plt.show()

### II) Word Clouds

In [None]:
# Reference vocabulary built from NLTK's `words` corpus. The word-cloud cells
# below use it to drop misspelled and non-English tokens. Tokens are lowercased
# before the membership test, so corpus entries containing capital letters
# never match.
english_words = set(corpus.words.words())

#### 1) User Descriptions

In [None]:
# Tokenize every user description.
word_tokens = [word_tokenize(text) for text in vaccine.user_description]

# Keep only the tokens whose lowercase form is in the English vocabulary.
# str.join builds the text in one pass instead of repeated `+=` concatenation.
user_description_string = " ".join(
    word
    for word_list in word_tokens
    for word in word_list
    if word.lower() in english_words
)

# Replace every character that is not a letter, digit, newline or '.' with a
# space. A raw string avoids the invalid "\." escape sequence that newer Python
# versions warn about; the pattern matches exactly the same characters.
user_description_string = re.sub(r'[^a-zA-Z0-9\n.]', ' ', user_description_string)

# Extra stopwords: URL fragments, Twitter boilerplate and the
# "No Description" placeholder written by the imputation step.
description_stopwords = set(STOPWORDS)
description_stopwords.update(["t", "co", "https", "description","descript", "view", "new", 
                              "twitter", "feed", "tweet", "official"])

my_word_cloud = WordCloud(background_color='white',stopwords=description_stopwords).generate(user_description_string)
plt.figure(figsize=(10,20))
plt.imshow(my_word_cloud, interpolation='bilinear')
plt.title("User Description Word Cloud", fontsize=30)
plt.axis('off')
plt.show()

#### 2) Hashtags

In [None]:
# Skip rows holding the "No Hashtags" placeholder written by the imputation
# step; otherwise the placeholder itself shows up as a word in the cloud.
tagged_rows = vaccine.hashtags[vaccine.hashtags != "No Hashtags"]
word_tokens = [word_tokenize(text) for text in tagged_rows]

# str.join builds the text in one pass instead of repeated `+=` concatenation.
hashTagString = " ".join(word for word_list in word_tokens for word in word_list)

# Replace every character that is not a letter, digit, newline or '.' with a
# space. A raw string avoids the invalid "\." escape sequence; the pattern
# matches exactly the same characters.
hashTagString = re.sub(r'[^a-zA-Z0-9\n.]', ' ', hashTagString)

my_word_cloud = WordCloud(background_color='white',stopwords=None).generate(hashTagString)
plt.figure(figsize=(10,20))
plt.imshow(my_word_cloud, interpolation='bilinear')
plt.title("Hashtags Word Cloud", fontsize=30)
plt.axis('off')
plt.show()

#### 3) Tweet Text

In [None]:
# Tokenize every tweet.
word_tokens = [word_tokenize(text) for text in vaccine.text]

# Lowercase each token once, keep those found in the English vocabulary, and
# build the text with a single join instead of repeated `+=` concatenation.
lowered_tokens = (word.lower() for word_list in word_tokens for word in word_list)
textString = " ".join(word for word in lowered_tokens if word in english_words)

# Replace every character that is not a letter, digit, newline or '.' with a
# space. A raw string avoids the invalid "\." escape sequence; the pattern
# matches exactly the same characters.
textString = re.sub(r'[^a-zA-Z0-9\n.]', ' ', textString)

# Extra stopwords: URL fragments plus the topic words (vaccine, covid, pfizer,
# ...) that would otherwise dominate the cloud.
text_stopwords = set(STOPWORDS)
text_stopwords.update(["t", "co", "https", "first", "pfizer", "covid", "covid19", "pfizerbiontech", "vaccine",
                      "covidvaccin", "s", "u", "vaccination"])

my_word_cloud = WordCloud(background_color='white',stopwords=text_stopwords).generate(textString)
plt.figure(figsize=(10,20))
plt.imshow(my_word_cloud, interpolation='bilinear')
plt.title("Tweets Word Cloud", fontsize=30)
plt.axis('off')
plt.show()

### III) Pie Charts

#### 1) Verified vs Non-Verified Twitter Accounts

In [None]:
# Pie chart: share of tweets posted by verified vs. non-verified accounts.
is_verified = vaccine['user_verified']
verified_labels = ['Yes', 'No']
verified_count = [len(vaccine[is_verified]), len(vaccine[~is_verified])]

plt.figure(figsize=(6,6))
plt.title("User Account is Verified")
plt.pie(
    verified_count,
    labels=verified_labels,
    autopct='%1.2f%%',
)
plt.show()

#### 2) Is a Retweet

In [None]:
# Pie chart: share of rows flagged as retweets vs. original tweets.
retweet_mask = vaccine['is_retweet']
isRetweet_labels = ['Yes', 'No']
isRetweet_count = [len(vaccine[retweet_mask]), len(vaccine[~retweet_mask])]

plt.figure(figsize=(6,6))
plt.title("Is retweet or not")
plt.pie(
    isRetweet_count,
    labels=isRetweet_labels,
    autopct='%1.2f%%',
)
plt.show()

#### 3) Sources of Tweets

In [None]:
# Sources that get their own slice; every other source is grouped as "Others".
most_commonly_used_source = ['Twitter Web App', 'Twitter for iPhone', 'Twitter for Android', 
                             'TweetDeck', 'Twitter for iPad']

# Vectorized replacement: keep values that are in the shortlist and overwrite
# the rest with "Others". Unlike a range(len(...)) loop over `.at[i]`, this does
# not assume the frame still has its default 0..n-1 index. `where` returns a new
# Series, so the original `source` column is left untouched.
source_refined_dataframe = vaccine['source'].where(
    vaccine['source'].isin(most_commonly_used_source), "Others"
)

# Count once and reuse the result for both labels and slice sizes.
source_counts = source_refined_dataframe.value_counts()
source_labels = source_counts.index
source_count = source_counts.values

plt.figure(figsize=(6,6))
plt.title("Most Popular Sources For Tweeting")
plt.pie(source_count,labels = source_labels,autopct='%1.2f%%')
plt.show()

## Sentiment Analysis

### 1) Distribution of Text Length Using Histogram

In [None]:
# Number of tokens in each tweet, as produced by word_tokenize.
text_length = list(map(len, map(word_tokenize, vaccine.text)))

# Central tendencies, drawn as dashed reference lines on the histogram.
mean_length = np.mean(text_length)
median_length = np.median(text_length)

plt.figure(figsize=(8,8))
sns.distplot(text_length, bins=20)
plt.title("Distribution of Text Length")
plt.xlabel("No. of Word in Text")
plt.axvline(
    mean_length,
    color='red',
    ls="--",
    label="Mean:{}".format(round(mean_length,2)),
)
plt.axvline(
    median_length,
    color='blue',
    ls='--',
    label="Median:{}".format(round(median_length)),
)
plt.legend(loc=0)
plt.show()

### 2) Applying VADER Sentiment Analysis

In [None]:
!pip install vaderSentiment

In [None]:
#Preprocessing Function
def preprocessTweets(tweet):
    """Return *tweet* with hyperlinks deleted and '#' characters stripped.

    Every run of non-whitespace characters that starts with "http" is removed.
    Each '#' is then dropped, so the hashtag word itself stays in the text.
    """
    without_links = re.sub(r"http\S+", "", tweet)
    return without_links.replace("#", "")

#Cleaned copy of every tweet, produced by preprocessTweets, kept in 'cleaned_text'
vaccine['cleaned_text'] = vaccine['text'].apply(preprocessTweets)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

#VADER compound score of each cleaned tweet, stored in the 'score' column
vaccine['score'] = vaccine['cleaned_text'].apply(
    lambda cleaned: sia.polarity_scores(cleaned)['compound']
)

def getSentimentType(score):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral'.

    Scores of 0.05 and above are 'Positive', scores of -0.05 and below are
    'Negative', and anything else is 'Neutral'.
    """
    if score <= -0.05:
        return 'Negative'
    if score >= 0.05:
        return 'Positive'
    return 'Neutral'

#Label every tweet Positive, Negative or Neutral from its compound score
vaccine['sentiment'] = vaccine['score'].apply(getSentimentType)

### 3) Value Counts for Sentiment

In [None]:
plt.figure(figsize=(6,6))

# Left-to-right order of the bars.
order=["Positive", "Neutral", "Negative"]

# Count once and reuse the result for both the labels and the bar heights.
sentiment_counts = vaccine['sentiment'].value_counts(sort=False)
sentiment_count = list(sentiment_counts.values)
sentiment_values = list(sentiment_counts.index)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=sentiment_values, y=sentiment_count, order=order)

plt.title("Sentiment Value Counts", fontsize=15)

plt.xlabel("Sentiment", fontsize=15)

plt.ylabel("Value Count", fontsize=15)

plt.show()

### 4) Most Followed Users with Different Type of Sentiment

A. Top 10 Users with Positive Sentiment

In [None]:
plt.figure(figsize=(16,6))

# Rank once: restrict to Positive tweets, sort by follower count, keep the
# first (largest) row per user name, then take the top 10.
top_positive = (
    vaccine[vaccine['sentiment']=='Positive']
    .sort_values('user_followers', ascending=False)
    .drop_duplicates(subset=['user_name'])
    .head(10)
)
followers_count = list(top_positive.user_followers)
user_name = list(top_positive.user_name)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=followers_count, y=user_name, color='g')

plt.title("Top 10 Users with Positive Sentiment", fontsize=15)

plt.xlabel("Followers", fontsize=15)

plt.ylabel("User Name", fontsize=15)

plt.show()

B. Top 10 Users with Neutral Sentiment

In [None]:
plt.figure(figsize=(16,6))

# Rank once: restrict to Neutral tweets, sort by follower count, keep the
# first (largest) row per user name, then take the top 10.
top_neutral = (
    vaccine[vaccine['sentiment']=='Neutral']
    .sort_values('user_followers', ascending=False)
    .drop_duplicates(subset=['user_name'])
    .head(10)
)
followers_count = list(top_neutral.user_followers)
user_name = list(top_neutral.user_name)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=followers_count, y=user_name, color='y')

plt.title("Top 10 Users with Neutral Sentiment", fontsize=15)

plt.xlabel("Followers", fontsize=15)

plt.ylabel("User Name", fontsize=15)

plt.show()

C. Top 10 Users with Negative Sentiment

In [None]:
plt.figure(figsize=(16,6))

# Rank once: restrict to Negative tweets, sort by follower count, keep the
# first (largest) row per user name, then take the top 10.
top_negative = (
    vaccine[vaccine['sentiment']=='Negative']
    .sort_values('user_followers', ascending=False)
    .drop_duplicates(subset=['user_name'])
    .head(10)
)
followers_count = list(top_negative.user_followers)
user_name = list(top_negative.user_name)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=followers_count, y=user_name, color='r')

plt.title("Top 10 Users with Negative Sentiment", fontsize=15)

plt.xlabel("Followers", fontsize=15)

plt.ylabel("User Name", fontsize=15)

plt.show()

### Overall Followers Distribution by Sentiment

In [None]:
# Slice labels, in the order the pie is drawn.
verified_labels = ['Positive', 'Neutral', 'Negative']

# For each sentiment: keep tweets from verified accounts, keep one row per user
# name (the one with the most followers) and sum the follower counts.
# NOTE(review): only verified accounts contribute here although the section
# heading says "Overall" -- confirm that the filter is intended.
followers_by_sentiment = {}
for sentiment_label in verified_labels:
    verified_rows = vaccine[(vaccine['sentiment']==sentiment_label) & vaccine['user_verified']]
    one_row_per_user = verified_rows.sort_values('user_followers', ascending=False)\
                                    .drop_duplicates(subset=['user_name'])
    followers_by_sentiment[sentiment_label] = one_row_per_user.user_followers.sum()

total_positive_followers = followers_by_sentiment['Positive']
total_neutral_followers = followers_by_sentiment['Neutral']
total_negative_followers = followers_by_sentiment['Negative']

total_followers = total_positive_followers + total_neutral_followers + total_negative_followers

# Each sentiment's share of the summed followers, as a percentage.
verified_count = [
    followers_by_sentiment[sentiment_label] / total_followers * 100
    for sentiment_label in verified_labels
]

plt.figure(figsize=(6,6))
plt.title("Proportion of Users Following")
plt.pie(verified_count,labels = verified_labels,autopct='%1.2f%%')
plt.show()