# Importing Modules & Data

In [None]:
import os
import numpy as np 
import pandas as pd 
import re
import matplotlib.pyplot as plt
import plotly.express as px
!pip install neattext
import neattext as ntx
# Print the full path of every file available under the Kaggle input directory
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
path='/kaggle/input/all-covid19-vaccines-tweets/vaccination_all_tweets.csv'
data = pd.read_csv(path)

# Understanding & Preprocessing Data

In [None]:
data.head()

In [None]:
display(data.shape, str(data.shape[0])+" tweets in dataset")   

In [None]:
data.info()

In [None]:
data.isna().sum()

We can see that a lot of values are missing in the `user_location`, `user_description` and `source` columns.

In [None]:
data['date'] = pd.to_datetime(data['date']).dt.date  #converting date column to date format
data.head()

In [None]:
# Visualizing tweet count for the 20 most frequent user locations
fig, ax = plt.subplots(figsize=(15, 10))
top_locations = data['user_location'].value_counts().nlargest(20)
top_locations.plot(kind='bar', ax=ax)
plt.xticks(rotation=60)

In [None]:
data=data.drop_duplicates('text')             #dropping duplicate tweets
data.shape

In [None]:
data.source.value_counts()

In [None]:
# Visualizing how tweets are distributed over the 6 most used platforms
fig, ax = plt.subplots(figsize=(15, 10))
top_sources = data['source'].value_counts().nlargest(6)
top_sources.plot(kind='bar', ax=ax)
plt.xticks(rotation=80)

In [None]:
len(data['date'].unique())  #Number of days considered

In [None]:
data.sort_values(by=['date'], ascending=[True]).head(2)

The earliest rows show that the dataset contains COVID-19 vaccine-related tweets starting from 12 December 2020.

In [None]:
# Drop the user / engagement metadata columns that the sentiment analysis does not use.
# The labels are passed as a list (the original used a set literal) and
# errors='ignore' lets this cell be re-run without a KeyError once the columns are gone.
unused_columns = [
    "id", "user_name", "user_description", "user_created", "user_followers",
    "user_friends", "user_favourites", "user_verified", "hashtags", "source",
    "retweets", "favorites", "is_retweet",
]
data = data.drop(columns=unused_columns, errors='ignore')

In [None]:
pd.set_option('display.max_colwidth', 700)
data.head()

In [None]:
# Clean the raw tweet text with the neattext library. The cleaners run in this exact order:
# hashtags -> URLs -> user handles -> repeated spaces -> special characters.
cleaning_steps = (
    ntx.remove_hashtags,
    ntx.remove_urls,
    ntx.remove_userhandles,
    ntx.remove_multiple_spaces,
    ntx.remove_special_characters,
)
cleaned_text = data['text']
for cleaning_step in cleaning_steps:
    cleaned_text = cleaned_text.apply(cleaning_step)
data['clean_data'] = cleaned_text

In [None]:
data[['clean_data','text']].head()

# Assigning Polarity & Subjectivity to tweets

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
#stopwords are the words which won't bring about any changes to the polarity of the tweet
stop_words = stopwords.words('english')   
len(stop_words),stop_words[5:10]

In [None]:
# function to remove stopwords
def stopWords(tweet):
  """Return `tweet` with every word found in `stop_words` removed.

  The tweet is split on whitespace, words present in the module-level
  `stop_words` list are dropped (the comparison is case-sensitive) and the
  remaining words are joined back with single spaces. Lemmatization is not
  applied here.
  """
  kept_words = []
  for word in tweet.split():
    if word in stop_words:
      continue
    kept_words.append(word)
  return " ".join(kept_words)

In [None]:
# Strip stop words from every cleaned tweet
data['clean_data'] = data['clean_data'].apply(stopWords)

In [None]:
data.head(2)

In [None]:
from textblob import TextBlob
# Function to assign polarity and subjectivity to the tweets
def blob_fun(text):
  """Score a piece of text with TextBlob and label its sentiment.

  Args:
    text: the (cleaned) tweet text to analyse.

  Returns:
    dict with three keys:
      'polarity'     - TextBlob polarity score of the text,
      'subjectivity' - TextBlob subjectivity score of the text,
      'sentiment'    - 'Positive' for polarity > 0, 'Negative' for
                       polarity < 0, otherwise 'Neutral'.
  """
  # Evaluate the sentiment once and reuse it for both scores
  sentiment = TextBlob(text).sentiment
  polarity = sentiment.polarity

  if polarity > 0:
    label = 'Positive'
  elif polarity < 0:
    label = 'Negative'
  else:
    # Plain `else` instead of `elif polarity == 0`: the label is always bound,
    # even for a value (such as NaN) that matches none of the comparisons.
    label = "Neutral"

  return {'polarity': polarity, 'subjectivity': sentiment.subjectivity, 'sentiment': label}

In [None]:
# Positional lookup: the index has gaps after drop_duplicates(), so the label 5 may no longer exist
blob_fun(data['clean_data'].iloc[5])

In [None]:
# but this isn't always right as shown in the example below
blob_fun('thank god,i tested negative for covid')

In [None]:
data['results'] = data['clean_data'].apply(blob_fun)

In [None]:
data.drop(columns={"user_location",'text'},inplace=True)

In [None]:
data.head(2)

In [None]:
# Expand the per-tweet result dicts into 'polarity', 'subjectivity' and 'sentiment' columns.
# The expanded frame is built on data's own index: drop_duplicates() left gaps in that
# index, so a frame carrying a fresh 0..n-1 index would be joined onto the wrong rows
# (and rows whose label is >= n would get NaN).
data = data.join(pd.DataFrame(data['results'].tolist(), index=data.index))

In [None]:

data.head()

In [None]:
# Split the cleaned tweets into one Series per sentiment label
sentiment_labels = data['sentiment']
positive_tweet = data.loc[sentiment_labels == 'Positive', 'clean_data']
negative_tweet = data.loc[sentiment_labels == 'Negative', 'clean_data']
neutral_tweet = data.loc[sentiment_labels == 'Neutral', 'clean_data']

In [None]:
from wordcloud import WordCloud
# Function for creating WordClouds
def cloud_of_Words(tweet_cat, title, figsize=(10, 8)):
    """Render a word cloud for a collection of tweets.

    Args:
        tweet_cat: iterable of tweet strings (e.g. a pandas Series).
        title: title shown above the word cloud.
        figsize: (width, height) of the figure in inches.

    The figure is sized inside the function, so the result no longer depends
    on the caller (or a previous call) having opened a figure of the right
    size, and no empty figure is left behind after the last call.
    """
    forcloud = ' '.join(tweet_cat)
    if not forcloud.strip():
        # Nothing to draw; report it instead of failing inside WordCloud
        print(f"No text available for the '{title}' word cloud")
        return

    wordcloud = WordCloud(width=500, height=300, random_state=5, max_font_size=110).generate(forcloud)

    # Draw on the current figure (one is created if none is open) at the requested size
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

In [None]:
plt.figure(figsize = (10,8))
# Creating wordclouds for positive, negative, neutral tweets
cloud_of_Words(positive_tweet, 'Positive')
cloud_of_Words(negative_tweet, 'Negative')
cloud_of_Words(neutral_tweet, 'Neutral')

As expected, the three word clouds above share many of the same words, because every tweet in the dataset is about COVID-19 vaccines.

In [None]:
# Break the tweets of each sentiment category down into a flat list of words.
# Joining the tweets with a space and splitting on whitespace yields the same
# tokens, in the same order, as splitting every tweet individually.
positive_tokens = ' '.join(positive_tweet).split()
negative_tokens = ' '.join(negative_tweet).split()
neutral_tokens = ' '.join(neutral_tweet).split()

In [None]:
from collections import Counter
# to get most used words
def get_maxtoken(tweets,num=30):
  """Return the `num` most frequent items of `tweets` with their counts.

  Args:
    tweets: iterable of hashable tokens (words).
    num: how many of the most common tokens to keep.

  Returns:
    dict mapping token -> count, ordered from most to least frequent.
  """
  return dict(Counter(tweets).most_common(num))

In [None]:
def token_df_vis(x, title):
  """Show an interactive bar chart of the most frequent tokens in `x`.

  Args:
    x: iterable of tokens; the most common ones are taken from get_maxtoken()
      with its default limit.
    title: chart title.
  """
  top_tokens = get_maxtoken(x)
  df = pd.DataFrame({'words': list(top_tokens.keys()), 'count': list(top_tokens.values())})
  px.bar(df, x='words', y='count', title=title).show()

In [None]:
token_df_vis(positive_tokens,'Positive')
token_df_vis(negative_tokens,'Negative')
token_df_vis(neutral_tokens,'Neutral')

In [None]:
fig = px.scatter(data,x='polarity',y='subjectivity')
fig.show()

The scatter plot above shows how the tweets are spread across polarity and subjectivity.

In [None]:
def percent(x, y, total=None):
  """Print the share of tweets in `x` as a percentage of `total`.

  Args:
    x: sized collection holding the tweets of one sentiment category.
    y: category name inserted into the printed message.
    total: number of tweets used as the denominator. When omitted, the row
      count of the module-level `data` frame is used.

  Returns:
    None; the percentage (rounded to 3 decimals) is only printed.
  """
  if total is None:
    total = data.shape[0]
  # Guard against an empty dataset instead of raising ZeroDivisionError
  share = round(len(x) / total * 100, 3) if total else 0.0
  print("Percentage of "+y+" tweets :", share, "%")

In [None]:
percent(positive_tweet, 'positive')
percent(negative_tweet, 'negative')
percent(neutral_tweet, 'neutral')

In [None]:
data['sentiment'].value_counts().plot(kind='bar')

# Vaccine-wise analysis

In [None]:
data.columns

In [None]:
deep = data.drop(columns="results")
deep.head(2)

In [None]:
# Reference tags for 5 vaccines -> Pfizer, Covaxin (Bharat Biotech), Sputnik, AstraZeneca (Covishield), Moderna.
# The tags are matched as case-sensitive substrings of the cleaned tweet, so every
# spelling / capitalisation that should count has to be listed explicitly.
pfizer_refs = ["Pfizer","pfizer","Pfizer–BioNTech","pfizer-bioNtech","BioNTech","biontech"]
# NOTE(review): "covax"/"Covax" also match the COVAX initiative, not only Covaxin - confirm this is intended.
bbiotech_refs = ["covax","covaxin","Covax","Covaxin","Bharat Biotech","bharat biotech","BharatBiotech","bharatbiotech"]
# A bare "V" must not be listed: as a substring it matches every tweet containing a
# capital V (e.g. "COVID", "Vaccine"). "Sputnik V" is already covered by "Sputnik".
sputnik_refs = ["russia","sputnik","Sputnik"]
# The misspelled tags are kept for tweets that contain the same typos; the correctly
# spelled names are added because none of the original tags matched "AstraZeneca"
# or "Serum Institute" as they are normally written.
astra_refs = ['sii','SII','adar poonawalla','Covishield','covishield','astra','zenca','Oxford–AstraZeneca','astrazenca','oxford-astrazenca','serum institiute',
              'AstraZeneca','Astrazeneca','Astra Zeneca','astrazeneca','Serum Institute','serum institute']
moderna_refs = ['moderna','Moderna','mRNA-1273','Spikevax']

In [None]:
def refer(tweet, refs):
  """Flag whether `tweet` mentions any of the reference strings.

  Args:
    tweet: text to search.
    refs: iterable of reference strings; each is looked up as a
      case-sensitive substring of `tweet`.

  Returns:
    1 if at least one reference occurs in the tweet, otherwise 0.
  """
  return int(any(ref in tweet for ref in refs))

# Add one 0/1 indicator column per vaccine, set when the cleaned tweet mentions it
vaccine_refs = {
  'pfizer': pfizer_refs,
  'bbiotech': bbiotech_refs,
  'sputnik': sputnik_refs,
  'astra': astra_refs,
  'moderna': moderna_refs,
}
for vaccine, vaccine_tags in vaccine_refs.items():
  deep[vaccine] = deep['clean_data'].apply(refer, args=(vaccine_tags,))

In [None]:
display(deep.pfizer.value_counts(),deep.bbiotech.value_counts(),deep.sputnik.value_counts(),deep.astra.value_counts(),deep.moderna.value_counts())

In [None]:
deep[deep['bbiotech']==1].head()    #what the dataset looks like

In [None]:
deep[deep['pfizer']==1].head()

In [None]:
def stats(a,b,c,d,e):
  """Display polarity and subjectivity summary statistics for five vaccines.

  Args:
    a, b, c, d, e: names of 0/1 indicator columns in the module-level `deep`
      frame. For each column, the rows flagged with 1 are summarised.

  For every column a table with the mean, max, min and median of 'polarity'
  and 'subjectivity' is displayed.
  """
  for flag in (a, b, c, d, e):
    mentioned = deep.loc[deep[flag] == 1, [flag, 'polarity', 'subjectivity']]
    # Aggregations are named as strings: passing NumPy callables (np.mean, ...)
    # to agg() is deprecated in recent pandas releases.
    display(mentioned.groupby(flag).agg(['mean', 'max', 'min', 'median']))

In [None]:
stats('pfizer','bbiotech','sputnik','astra','moderna')

In [None]:
def _polarity_trend(flag, window):
  """Return date/polarity of the tweets flagged in `deep[flag]`, sorted by date,
  with an 'Avg Polarity' column holding the rolling mean over `window` tweets
  (at least 3 tweets are required before a value is produced)."""
  mentions = deep.loc[deep[flag] == 1, ['date', 'polarity']].sort_values(by='date', ascending=True)
  mentions['Avg Polarity'] = mentions.polarity.rolling(window, min_periods=3).mean()
  return mentions

pfizer = _polarity_trend('pfizer', 20)
bbiotech = _polarity_trend('bbiotech', 20)
sputnik = _polarity_trend('sputnik', 20)
# NOTE(review): astra uses a window of 5 while every other vaccine uses 20 - confirm this is intentional
astra = _polarity_trend('astra', 5)
moderna = _polarity_trend('moderna', 20)

In [None]:
bbiotech.head(10)

# Visualizing the vaccine Polarity (Moving Average) vs Time

In [None]:
# One interactive line chart per vaccine: rolling average polarity over time
vaccine_trends = [
    (pfizer, 'Pfizer'),
    (bbiotech, 'Bharat Biotech'),
    (sputnik, 'Sputnik'),
    (astra, 'AstraZeneca/Covishield'),  # chart title previously misspelled as "AstraZence"
    (moderna, 'Moderna'),
]
for trend, trend_title in vaccine_trends:
    fig = px.line(trend, x="date", y="Avg Polarity", title=trend_title)
    fig.show()

# Visualizing Overall Vaccine Polarity

In [None]:
# Mean polarity of all tweets per day. A single groupby computes every day's mean
# (with the dates sorted ascending) instead of re-filtering `deep` once per date.
total = (
    deep.groupby('date', as_index=False)['polarity']
    .mean()
    .rename(columns={'polarity': 'Sentiment'})
)
fig = px.line(total, x="date", y="Sentiment", title='Overall Sentiment around Vaccines')
fig.show()