# Pfizer Vaccine Tweets Analysis

In [None]:
# Silence every Python warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation warnings raised by the plotting/data
# libraries used below — consider narrowing the filter while developing.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

%matplotlib inline

import re
from nltk import word_tokenize, corpus
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS

---
**1. Importing the Data as a Pandas Dataframe**

In [None]:
# Load the tweets CSV into the `vaccine` DataFrame that every later cell works on.
# NOTE(review): the relative path looks like a Kaggle input directory — adjust when running elsewhere.
vaccine = pd.read_csv('../input/pfizer-vaccine-tweets/vaccination_tweets.csv')

---
**2. Calling head(), info() and describe() to get a basic idea of what we are dealing with, and displaying the count of missing values in each column.**

In [None]:
# Preview the first rows of the dataset
vaccine.head()

In [None]:
# Column names, non-null counts and dtypes
vaccine.info()

In [None]:
# Summary statistics of the dataset's columns
vaccine.describe()

In [None]:
# Number of missing values in each column
vaccine.isna().sum()

In [None]:
# Filling in the missing values.
# Each filled column is assigned back to the frame. Calling
# `vaccine[col].fillna(..., inplace=True)` mutates an intermediate object and is
# not guaranteed to update `vaccine` (it does not under pandas Copy-on-Write).
missing_value_fillers = {
    'user_location': "Unknown",
    'user_description': "No Description",
    'hashtags': "No Hashtags",
    'source': "Twitter Web App",  # chosen by the author as the most used source
}

for column, filler in missing_value_fillers.items():
    vaccine[column] = vaccine[column].fillna(filler)

---
**3. Bar Plot time**

**I. Bar plot look at the top most locations from where the tweets were tweeted**

In [None]:
TOP_N_LOCATIONS = 20

# Count tweets per location. The "Unknown" placeholder is dropped by name instead
# of assuming it sits at position 0, and head() returns exactly TOP_N_LOCATIONS
# rows (the previous [1:20] slice only produced 19 bars for a "Top 20" plot).
location_counts = vaccine["user_location"].value_counts()
top_locations = location_counts.drop("Unknown", errors="ignore").head(TOP_N_LOCATIONS)

plt.figure(figsize=(16,6))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=top_locations.index.tolist(), y=top_locations.tolist());

plt.title("Top {} Locations".format(TOP_N_LOCATIONS), fontsize=15)

plt.xlabel("Locations", fontsize=15)
plt.xticks(rotation=60)

plt.ylabel("Total Number of Tweets", fontsize=15)

plt.show()

**As we can see, this wasn't very helpful, since 'user_location' doesn't contain only country names; some entries are state or city names, which are counted as completely different locations. And some of them are not even locations, like "Email:talksavailable_at_gmail.com".**

**But we have a rough idea of which cities/states/countries are more involved in learning about and sharing their opinions on the vaccine.**

---

**II. Top 20 Most Popular Users**

In [None]:
# Rank rows by follower count and keep one row per user (the first one after
# sorting, i.e. the one with the highest follower count), then take the top 20.
# The pipeline is evaluated once instead of once per plotted column.
top_users = (
    vaccine.sort_values('user_followers', ascending=False)
           .drop_duplicates(subset=['user_name'])
           .head(20)
)

followers_count = top_users['user_followers'].tolist()
user_name = top_users['user_name'].tolist()

plt.figure(figsize=(16,10))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=followers_count, y=user_name)

plt.title("Top 20 Users", fontsize=15)

plt.xlabel("Followers", fontsize=15)

plt.ylabel("User Name", fontsize=15)

plt.show()

**4. Word Cloud Time**

In [None]:
# Lookup set built from NLTK's `words` corpus. The word-cloud cells keep only
# tokens whose lower-cased form is in this set, which filters out non-English
# and misspelled tokens.
# NOTE(review): presumably needs the NLTK "words" corpus to be available locally — confirm.
english_words = set(corpus.words.words())

**I. For User Descriptions**

In [None]:
# Tokenize every user description and keep only tokens whose lower-cased form is
# in the English word list. The words are joined once at the end instead of
# growing a string with `+=` inside nested loops.
word_tokens = [word_tokenize(text) for text in vaccine.user_description]

user_description_string = " ".join(
    word
    for word_list in word_tokens
    for word in word_list
    if word.lower() in english_words
)

# Replace every character other than letters, digits, newline and '.' with a space.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
user_description_string = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', user_description_string)

# Extra stopwords on top of the WordCloud defaults
description_stopwords = set(STOPWORDS)
description_stopwords.update(["t", "co", "https", "description","descript", "view", "new", 
                              "twitter", "feed", "tweet", "official"])

my_word_cloud = WordCloud(background_color='white',stopwords=description_stopwords).generate(user_description_string)
plt.figure(figsize=(10,20))
plt.imshow(my_word_cloud, interpolation='bilinear')
plt.title("Word Cloud for User Description", fontsize=30)
plt.axis('off')
plt.show()

**II. For Hashtags**

In [None]:
# Tweets without hashtags carry the "No Hashtags" placeholder (or are missing).
# They are skipped so the filler text is not counted as if it were a hashtag.
HASHTAG_PLACEHOLDER = "No Hashtags"
real_hashtags = vaccine.hashtags.dropna()
real_hashtags = real_hashtags[real_hashtags != HASHTAG_PLACEHOLDER]

word_tokens = [word_tokenize(text) for text in real_hashtags]

# Join once at the end instead of growing a string with `+=` inside nested loops
hashTagString = " ".join(word for word_list in word_tokens for word in word_list)

# Replace every character other than letters, digits, newline and '.' with a space.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
hashTagString = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', hashTagString)

my_word_cloud = WordCloud(background_color='white',stopwords=None).generate(hashTagString)
plt.figure(figsize=(10,20))
plt.imshow(my_word_cloud, interpolation='bilinear')
plt.title("Word Cloud for Hashtags", fontsize=30)
plt.axis('off')
plt.show()

**III. For User Text**

In [None]:
# Tokenize every tweet and keep, lower-cased, only the tokens found in the
# English word list. The words are joined once at the end instead of growing a
# string with `+=` inside nested loops.
word_tokens = [word_tokenize(text) for text in vaccine.text]

textString = " ".join(
    word.lower()
    for word_list in word_tokens
    for word in word_list
    if word.lower() in english_words
)

# Replace every character other than letters, digits, newline and '.' with a space.
# Raw string: '\.' in a normal string literal is an invalid escape sequence.
textString = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', textString)

# Adding the most obvious words to the stopwords (vaccine, covid, pfizer, ...) because almost
# all tweets contain them: this collection of tweets is focused on that particular vaccine
text_stopwords = set(STOPWORDS)
text_stopwords.update(["t", "co", "https", "first", "pfizer", "covid", "covid19", "pfizerbiontech", "vaccine",
                      "covidvaccin", "s", "u", "vaccination"])

my_word_cloud = WordCloud(background_color='white',stopwords=text_stopwords).generate(textString)
plt.figure(figsize=(10,20))
plt.imshow(my_word_cloud, interpolation='bilinear')
plt.title("Word Cloud for Tweets", fontsize=30)
plt.axis('off')
plt.show()

---
**5. Pie Chart Time**

**I. Proportion of verified and non-verified users**

In [None]:
# Count tweets from verified vs. non-verified accounts using the boolean flag column
is_verified = vaccine['user_verified']
verified_labels = ['Yes', 'No']
verified_count = [is_verified.sum(), (~is_verified).sum()]

plt.figure(figsize=(6,6))
plt.title("User Account is Verified")
plt.pie(verified_count, labels=verified_labels, autopct='%1.2f%%')
plt.show()

**We can see that most of the users are not Verified.**

---
**II. Is a Re-tweet**

In [None]:
# Count retweets vs. original tweets using the boolean flag column
retweet_flag = vaccine['is_retweet']
isRetweet_labels = ['Yes', 'No']
isRetweet_count = [retweet_flag.sum(), (~retweet_flag).sum()]

plt.figure(figsize=(6,6))
plt.title("Is retweet or not")
plt.pie(isRetweet_count, labels=isRetweet_labels, autopct='%1.2f%%')
plt.show()

**So, there are no Retweets**

---
**III. Source used to tweet**

In [None]:
most_commonly_used_source = ['Twitter Web App', 'Twitter for iPhone', 'Twitter for Android', 
                             'TweetDeck', 'Twitter for iPad']

# Keep the listed sources and collapse every other value into "Others".
# Vectorised with isin/where: the previous row loop used label-based `.at[i]`
# with positional `range(len(...))`, which only works on a default 0..n-1 index.
source_refined_dataframe = vaccine['source'].where(
    vaccine['source'].isin(most_commonly_used_source), "Others"
)

# Count once and reuse for both the labels and the wedge sizes
source_value_counts = source_refined_dataframe.value_counts()
source_labels = source_value_counts.index
source_count = source_value_counts.values

plt.figure(figsize=(6,6))
plt.title("Most Popular Source For Tweeting")
plt.pie(source_count,labels = source_labels,autopct='%1.2f%%')
plt.show()

**6. Now let's get into the Sentiment Analysis part**

**I. Text Length Distribution Using Histogram**

In [None]:
# Number of tokens in each tweet
text_length = [len(word_tokenize(tweet)) for tweet in vaccine.text]

# Summary statistics drawn as vertical reference lines on the histogram
mean_length = np.mean(text_length)
median_length = np.median(text_length)

plt.figure(figsize=(8,8))
sns.distplot(text_length, bins=20)
plt.title("Distribution of Text Length")
plt.xlabel("No. of Word in Text")
plt.axvline(mean_length, color='red', ls="--",
            label="Mean:{}".format(round(mean_length,2)))
plt.axvline(median_length, color='blue', ls='--',
            label="Median:{}".format(round(median_length)))
plt.legend(loc=0)
plt.show()

**From this we can see that most texts contain somewhere around 20-28 words.**

---
**II. Using VADER Sentiment Analysis, let's see if the texts are Positive, Negative or Neutral**

In [None]:
!pip install vaderSentiment

In [None]:
def preprocessTweets(tweet):
    """Return the tweet text with hyperlinks and '#' symbols stripped.

    Every run of non-whitespace characters starting with "http" is removed,
    then each '#' character is deleted (the hashtag word itself is kept).
    """
    without_links = re.sub(r"http\S+", "", tweet)
    return without_links.replace("#", "")

# New column holding the pre-processed version of every tweet
vaccine['cleaned_text'] = vaccine['text'].apply(preprocessTweets)

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# New column 'score' holding the 'compound' value of the analyzer's polarity scores
vaccine['score'] = vaccine['cleaned_text'].apply(
    lambda cleaned: sia.polarity_scores(cleaned)['compound']
)

def getSentimentType(score):
    """Map a sentiment score to a label.

    Scores of 0.05 and above are 'Positive', scores of -0.05 and below are
    'Negative', and everything else is 'Neutral'.
    """
    if score <= -0.05:
        return 'Negative'
    if score >= 0.05:
        return 'Positive'
    return 'Neutral'

# Label every tweet as Positive, Negative or Neutral based on its score
vaccine['sentiment'] = vaccine['score'].apply(getSentimentType)

**III. Value Counts for Sentiment**

In [None]:
order=["Positive", "Neutral", "Negative"]

# Count tweets per sentiment once, arranged in display order; a sentiment class
# with no tweets gets a zero-height bar instead of being absent.
sentiment_counts = vaccine['sentiment'].value_counts().reindex(order, fill_value=0)

sentiment_values = sentiment_counts.index.tolist()
sentiment_count = sentiment_counts.tolist()

plt.figure(figsize=(6,6))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=sentiment_values, y=sentiment_count, order=order)

plt.title("Sentiment Value Counts", fontsize=15)

plt.xlabel("Sentiment", fontsize=15)

plt.ylabel("Value Count", fontsize=15)

plt.show()

**IV. Most Followed Users with Different Type of Sentiment**

A. Top 10 Users with Positive Sentiment

In [None]:
# Among tweets labelled Positive: sort by follower count, keep one row per user
# (the first after sorting, i.e. the highest follower count) and take the top 10.
# The pipeline is evaluated once instead of once per plotted column.
top_positive_users = (
    vaccine[vaccine['sentiment']=='Positive']
    .sort_values('user_followers', ascending=False)
    .drop_duplicates(subset=['user_name'])
    .head(10)
)

followers_count = top_positive_users['user_followers'].tolist()
user_name = top_positive_users['user_name'].tolist()

plt.figure(figsize=(16,6))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=followers_count, y=user_name, color='g')

plt.title("Top 10 Users with Positive Sentiment", fontsize=15)

plt.xlabel("Followers", fontsize=15)

plt.ylabel("User Name", fontsize=15)

plt.show()

B. Top 10 Users with Neutral Sentiment

In [None]:
# Among tweets labelled Neutral: sort by follower count, keep one row per user
# (the first after sorting, i.e. the highest follower count) and take the top 10.
# The pipeline is evaluated once instead of once per plotted column.
top_neutral_users = (
    vaccine[vaccine['sentiment']=='Neutral']
    .sort_values('user_followers', ascending=False)
    .drop_duplicates(subset=['user_name'])
    .head(10)
)

followers_count = top_neutral_users['user_followers'].tolist()
user_name = top_neutral_users['user_name'].tolist()

plt.figure(figsize=(16,6))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=followers_count, y=user_name, color='y')

plt.title("Top 10 Users with Neutral Sentiment", fontsize=15)

plt.xlabel("Followers", fontsize=15)

plt.ylabel("User Name", fontsize=15)

plt.show()

C. Top 10 Users with Negative Sentiment

In [None]:
# Among tweets labelled Negative: sort by follower count, keep one row per user
# (the first after sorting, i.e. the highest follower count) and take the top 10.
# The pipeline is evaluated once instead of once per plotted column.
top_negative_users = (
    vaccine[vaccine['sentiment']=='Negative']
    .sort_values('user_followers', ascending=False)
    .drop_duplicates(subset=['user_name'])
    .head(10)
)

followers_count = top_negative_users['user_followers'].tolist()
user_name = top_negative_users['user_name'].tolist()

plt.figure(figsize=(16,6))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
sns.barplot(x=followers_count, y=user_name, color='r')

plt.title("Top 10 Users with Negative Sentiment", fontsize=15)

plt.xlabel("Followers", fontsize=15)

plt.ylabel("User Name", fontsize=15)

plt.show()

In [None]:
def _verified_followers_for(sentiment_label):
    """Sum the followers of verified users having a tweet with the given sentiment.

    Rows are sorted by follower count and de-duplicated on user name, so each
    user contributes the follower count of their first row after sorting.
    """
    selected = vaccine[(vaccine['sentiment']==sentiment_label) & vaccine['user_verified']]
    one_row_per_user = (
        selected.sort_values('user_followers', ascending=False)
                .drop_duplicates(subset=['user_name'])
    )
    return one_row_per_user.user_followers.sum()


total_positive_followers = _verified_followers_for('Positive')
total_neutral_followers = _verified_followers_for('Neutral')
total_negative_followers = _verified_followers_for('Negative')

total_followers = total_positive_followers + total_neutral_followers + total_negative_followers

verified_labels = ['Positive', 'Neutral', 'Negative']

# Each sentiment's share of the combined follower total, in percent
verified_count = [
    sentiment_total / total_followers * 100
    for sentiment_total in (total_positive_followers,
                            total_neutral_followers,
                            total_negative_followers)
]

plt.figure(figsize=(6,6))
plt.title("Proportion of Users Following")
plt.pie(verified_count, labels=verified_labels, autopct='%1.2f%%')
plt.show()

**Yes, there are duplicate followers in the above pie chart, since a user can be following multiple Twitter accounts.**

**And we can see that the sources of negative sentiment do not have many followers.**

# Thank You