In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only Kaggle input tree and print every file path found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import pandas as pd
import numpy as np
import warnings
import re
from textblob import TextBlob
import nltk
# Download only the corpora TextBlob documents as its requirements instead of
# nltk.download('all'), which pulls the entire multi-gigabyte NLTK collection
# on every fresh kernel. Both the classic and the newer resource names are
# listed; an id unknown to the installed NLTK index is reported and skipped.
# NOTE(review): extend this tuple if other cells need further NLTK resources.
for nltk_resource in (
    "punkt",
    "punkt_tab",
    "brown",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "conll2000",
    "movie_reviews",
):
    nltk.download(nltk_resource, quiet=True)

# NOTE(review): this hides every warning for the rest of the notebook,
# including pandas deprecation notices; narrow the filter if that is unwanted.
warnings.filterwarnings("ignore")

# Raw tweet table; later cells read the columns id, user_location,
# user_verified and text from it.
tweets = pd.read_csv("/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv")

### Let us have a look at the data

In [None]:
# Preview the first five rows to get a feel for the available columns.
tweets.head(n=5)

In [None]:
# Each row of the frame is one tweet, so the row count is the tweet count.
print("There are total {} tweets in our data".format(len(tweets)))

# id

### Has any user tweeted twice??

In [None]:
# dropna=False keeps a missing id (if any) in the count, exactly like
# taking the length of unique().
print("Number of unique twitter ids is {}".format(tweets["id"].nunique(dropna=False)))

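### Since the number of tweets is the same as the number of unique ids, every row is a distinct tweet (there are no duplicates). Note that `id` appears to identify a tweet rather than a user, so this check alone cannot tell us whether any user has tweeted more than once.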

# user_location	

In [None]:
# nunique() ignores missing values, so tweets without a location are not
# counted as one extra "location" (unique() would include NaN as a value).
print("There are total of {} unique locations from where these tweets have been made".format(tweets["user_location"].nunique()))

In [None]:
# value_counts() orders locations from most to least frequent, so the first
# ten index labels are the ten most common locations.
most_common_loc = tweets["user_location"].value_counts().iloc[:10].index.tolist()
print("The top 10 places from where most tweets have been made are :-")
print(most_common_loc)

### Custom function for text preprocessing

In [None]:
def text_preprocessing(text):
    """Normalise a raw tweet before sentiment scoring.

    Steps, in order: lower-case, drop URLs, turn newlines into spaces, drop
    '#' signs (the hashtag word is kept), drop @mentions, drop characters in
    the U+263A..U+1F645 range (emoji and other symbols) and drop digits.

    Args:
        text: Raw tweet text. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing value) is treated as empty.

    Returns:
        The cleaned text. Whitespace is not collapsed, so removed tokens may
        leave double spaces behind.
    """
    # Missing values arrive as float NaN and have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Remove only the URL token itself (http or https). A greedy ".*" would
    # also delete every word that follows the link on the same line.
    text = re.sub(r"https?://\S+","",text)
    # Replace newlines with a space so the words on either side stay separate.
    text = re.sub(r"\n"," ",text)
    text = re.sub(r"#","",text)
    text = re.sub(r"@\w+","",text)
    # Strip the symbol/emoji characters themselves; the text between a number
    # and an emoji must be kept for the sentiment analysis.
    # NOTE(review): this code-point range also covers CJK and other scripts.
    text = re.sub(r'[\u263a-\U0001f645]',"",text)
    text = re.sub(r'\d+','',text)
    return text

### Let us analyze the tweets of verified accounts and find out whether they are positive or negative in nature

In [None]:
# Boolean mask of rows whose account is flagged as verified.
is_verified = tweets["user_verified"].eq(True)
verified_user_tweets = tweets.loc[is_verified, "text"].values

In [None]:
# Boolean mask of rows whose account is explicitly flagged as not verified.
is_unverified = tweets["user_verified"].eq(False)
unverified_user_tweets = tweets.loc[is_unverified, "text"].values

### Custom function for getting the polarity and subjectivity

In [None]:
def get_sentiment_for_tweets(tw):
    """Print the average TextBlob polarity and subjectivity of some tweets.

    Every tweet is cleaned with ``text_preprocessing`` before it is scored.

    Args:
        tw: Sized iterable (list, NumPy array, ...) of raw tweet texts.

    Returns:
        None. The two averages are printed. When ``tw`` is empty a short
        notice is printed instead, because an average over zero tweets is
        undefined.
    """
    tweet_count = len(tw)
    # Guard against dividing by zero when the selection contains no tweets.
    if tweet_count == 0:
        print("No tweets to analyse")
        return
    total_polarity = 0
    total_subjectivity = 0
    # Iterate over the values directly rather than indexing by position.
    for raw_text in tw:
        blob = TextBlob(text_preprocessing(raw_text))
        total_polarity += blob.polarity
        total_subjectivity += blob.subjectivity
    avg_pol = total_polarity/tweet_count
    avg_sub = total_subjectivity/tweet_count
    print("Polarity : {} | Subjectivity : {}".format(avg_pol,avg_sub))

### Polarity and Subjectivity of tweets of verified accounts

In [None]:
# Average sentiment across all tweets posted by verified accounts.
get_sentiment_for_tweets(tw=verified_user_tweets)

### Polarity and Subjectivity of tweets of unverified accounts

In [None]:
# Average sentiment across all tweets posted by unverified accounts.
get_sentiment_for_tweets(tw=unverified_user_tweets)

### Thus we see that for both verified and unverified Twitter accounts the tweets have, on average, a positive sentiment regarding the Covid-19 vaccine, and the low subjectivity suggests that the tweets are mostly based on facts and do not represent much personal opinion.

### Let us see the locations from which positive and negative tweets were posted, along with the subjectivity of those tweets

### Custom made function for fetching location based on sentiment

In [None]:
def find_location_for_sentiment(dataset,sentiment,threshold):
    """Print the locations whose average tweet polarity passes a threshold.

    Tweets are grouped by ``user_location``; each tweet is cleaned with
    ``text_preprocessing`` and scored with TextBlob, and the scores are
    averaged per location. Rows with a missing location are ignored.

    Args:
        dataset: DataFrame with ``user_location`` and ``text`` columns.
        sentiment: ``'pos'`` to report locations whose average polarity is
            above ``threshold``, ``'neg'`` for those below it
            (case-insensitive).
        threshold: Polarity cut-off; the comparison is strict.

    Returns:
        None. One line is printed per matching location, in order of first
        appearance in ``dataset``.

    Raises:
        ValueError: If ``sentiment`` is neither ``'pos'`` nor ``'neg'``.
    """
    # Validate once up front: a typo such as 'positive' would otherwise score
    # every tweet and then silently print nothing.
    mode = sentiment.lower() if isinstance(sentiment, str) else None
    if mode not in ("pos", "neg"):
        raise ValueError(
            "sentiment must be 'pos' or 'neg', got {!r}".format(sentiment)
        )

    # A single groupby pass replaces rebuilding a full-frame boolean mask for
    # every unique location. sort=False keeps first-appearance order, and
    # groupby skips rows whose location is missing.
    for location, texts in dataset.groupby("user_location", sort=False)["text"]:
        total_polarity = 0
        total_subjectivity = 0
        try:
            for raw_text in texts.values:
                blob = TextBlob(text_preprocessing(raw_text))
                total_polarity += blob.polarity
                total_subjectivity += blob.subjectivity
        except Exception as e:
            # Keep the scan best-effort: report the failing location and carry
            # on, rather than aborting every remaining location.
            print("Skipping location {!r}: {}".format(location, e))
            continue
        avg_pol = total_polarity/len(texts)
        avg_sub = total_subjectivity/len(texts)
        if mode == "pos":
            matches = avg_pol > threshold
        else:
            matches = avg_pol < threshold
        if matches:
            print("Location : {} | Polarity : {} | Subjectivity : {}".format(location,avg_pol,avg_sub))

### Locations from where highly positive tweets regarding the Covid-19 vaccine have been tweeted

### Highly positive means polarity > 0.9. A positive tweet has a polarity between 0 and 1; we select 0.9 as the threshold so that only the most positive tweets are reported.

In [None]:
# Report locations whose average polarity is strictly above 0.9.
find_location_for_sentiment(dataset=tweets, sentiment='pos', threshold=0.9)

### Locations from where negative tweets regarding the Covid-19 vaccine have been tweeted

### Negative tweets here are those with polarity < -0.5 (negative tweets have a polarity between -1 and 0)

In [None]:
# Report locations whose average polarity is strictly below -0.5.
find_location_for_sentiment(dataset=tweets, sentiment='neg', threshold=-0.5)

### Let us have a look at a positive tweet

In [None]:
# Show the first tweet recorded for this location.
tweets.loc[tweets["user_location"].eq("British Columbia"), "text"].iloc[0]

### Indeed the tweet has the word happy with a happy smiley. No wonder it has got a positive sentiment

### Let us have a look at a negative tweet

In [None]:
# Show the first tweet recorded for this location.
tweets.loc[tweets["user_location"].eq("Hampshire"), "text"].iloc[0]

### The tweet contains words like "I don't agree", which justifies it being classified as having a negative sentiment.