In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

 # linear algebra
# data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# A Sentiment Analysis of Pfizer Vaccine via Tweets

1. Introduction
2. Data analysis;
3. Method for sentiment analysis;
4. Results;
5. Comments and conclusions.



# 1. Introduction
Here we will perform a sentiment analysis on the Pfizer vaccine tweets.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
import os
!ls ../input/monkeylearn-pfizer-sentiment-analysis-images

In [None]:
# Load the raw Pfizer vaccine tweets CSV from the Kaggle input directory into a DataFrame.
pfizerTweets_df = pd.read_csv('/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv')

In [None]:
pfizerTweets_df.head()

In [None]:
pfizerTweets_df['is_retweet'].value_counts()

In [None]:
# There are no retweets.

In [None]:
pfizerTweets_df.info()

I will remove the columns that I don't need for the sentiment analysis:
1. user_name
2. user_description
3. user_created
4. user_followers
5. user_friends
6. user_favourites
7. user_verified
8. hashtags
9. retweets
10. favorites
11. is_retweet

In [None]:
# Columns that play no role in the sentiment analysis.
columns_to_drop = [
    'user_name',
    'user_description',
    'user_created',
    'user_followers',
    'user_friends',
    'user_favourites',
    'user_verified',
    'hashtags',
    'retweets',
    'favorites',
    'is_retweet',
]
# Use the `columns=` keyword instead of the positional axis argument (`, 1`):
# passing the axis positionally was deprecated and is rejected by pandas 2.x.
pfizerTweets_df = pfizerTweets_df.drop(columns=columns_to_drop)

In [None]:
list(pfizerTweets_df)

Check for duplicate tweets

In [None]:
# Number of rows whose tweet id repeats an earlier row's id.
pfizerTweets_df['id'].duplicated().sum()

In [None]:
# Number of rows whose tweet text repeats an earlier row's text.
pfizerTweets_df['text'].duplicated().sum()

Remove HTML anchor tags (`<a ...>` and `</a>`) from the `text` and `source` columns so that they contain plain text.

In [None]:
# Strip opening (<a ...>) and closing (</a>) anchor tags, keeping the text between them.
ANCHOR_TAG_PATTERN = r'<(?:a\b[^>]*>|/a>)'
for column in ("source", "text"):
    # regex=True must be explicit: since pandas 2.0 `str.replace` treats the
    # pattern as a literal string by default, which would leave the tags in place.
    pfizerTweets_df[column] = pfizerTweets_df[column].str.replace(ANCHOR_TAG_PATTERN, '', regex=True)

In [None]:
pfizerTweets_df['date'].dtypes

In [None]:
# Parse the date column into pandas datetimes so the `.dt` accessors used in the analysis below work.
pfizerTweets_df['date'] = pd.to_datetime(pfizerTweets_df['date'])

In [None]:
pfizerTweets_df.info()

The date column is now stored as datetime64 instead of as a generic object column.

# 2. Data analysis;


In [None]:
# Count tweets (non-null ids) per (year, month) of their timestamp.
tweet_dates = pfizerTweets_df["date"]
da_df = pfizerTweets_df["id"].groupby(
    [tweet_dates.dt.year.rename('Year'), tweet_dates.dt.month.rename('Month')]
).count()

# Draw on an explicit figure/axes pair. The (Year, Month) index of the Series
# supplies the bar categories, so no `x=` argument is needed.
fig, ax = plt.subplots(figsize=(20, 10))
da_df.plot(kind='bar', ax=ax, grid=True, fontsize=18, title='Tweet mentions of Pfizer Vaccine')
ax.set_ylabel('Number of tweets')
plt.show()

From this graph we can see that the data contains tweets from December 2020 and January 2021.

# 3. Method for sentiment analysis;

For the sentiment analysis I used MonkeyLearn. https://monkeylearn.com/

Now we will export this dataframe.

In [None]:
# Write the cleaned tweets to a UTF-8 CSV in the current directory, without the row index.
pfizerTweets_df.to_csv('cleaned_pfizer_vaccine_tweets.csv', index = False, encoding = 'utf-8')

## The Sentiment analysis was done using MonkeyLearn.
82 texts have been used to train the model.

The result can be found in the ../input/monkeylearn-pfizer-vaccine-300-tweets/processed_batch 300 tweets.csv

This is the overall Confidence level of the model.

In [None]:
Image("../input/monkeylearn-pfizer-sentiment-analysis-images/Overall.png")

monkeylearn-pfizer-sentiment-analysis-images

This is the Negative Confidence level of the model.

In [None]:
Image("../input/monkeylearn-pfizer-sentiment-analysis-images/Negative.png")

This is the Neutral Confidence level of the model.

In [None]:
Image("../input/monkeylearn-pfizer-sentiment-analysis-images/Neutral.png")

This is the Positive Confidence level of the model.

In [None]:
Image("../input/monkeylearn-pfizer-sentiment-analysis-images/Positive.png")

# 4. Results;
These are the results.

In [None]:
# Load the batch of tweets that was classified with MonkeyLearn.
sentiment_df = pd.read_csv('../input/monkeylearn-pfizer-vaccine-300-tweets/processed_batch 300 tweets.csv')

In [None]:
# Total number of tweets that carry one of the three sentiment labels.
no_tweets = sum(
    len(sentiment_df.query(f'Classification == "{label}"'))
    for label in ("Negative", "Neutral", "Positive")
)
no_tweets

In [None]:
labels = 'Negative', 'Neutral', 'Positive'
# Share of each sentiment label among the classified tweets.
sizes = [len(sentiment_df.query(f'Classification == "{label}"')) / no_tweets for label in labels]
explode = (0, 0, 0)

fig1, ax1 = plt.subplots()
# Title the pie's own figure. Calling plt.suptitle() before plt.subplots()
# put the title on a separate, otherwise empty figure and left the pie untitled.
fig1.suptitle('Percentage of tweets', fontsize=24)
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

As shown in the pie chart, most of the tweets were neutral (62.3%),
followed by positive (29.3%) and negative (8.3%).

In [None]:
# Replace a leading run of lowercase letters in the location with the placeholder 'None'.
# regex=True must be explicit: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, so '^[a-z]+' would never match.
# NOTE(review): this also rewrites ordinary locations that merely start with a
# lowercase letter - confirm that is intended.
sentiment_df['user_location'] = sentiment_df['user_location'].str.replace('^[a-z]+', 'None', regex=True)

# Cast to str so that missing locations become the string 'nan'.
sentiment_df['user_location'] = sentiment_df['user_location'].astype(str)

# Strip HTML anchor tags from the locations. This is applied to sentiment_df:
# sentiment_relation is only created in a later cell, so referencing it here
# raised a NameError on a fresh kernel.
sentiment_df['user_location'] = sentiment_df['user_location'].str.replace(r'<(?:a\b[^>]*>|/a>)', '', regex=True)
sentiment_df.info()

In [None]:
# One row per (user_location, Classification) combination that occurs in the data.
location_keys = ['user_location', 'Classification']
sentiment_relation = sentiment_df.groupby(location_keys, as_index=False).count()[location_keys]

# Wide, short canvas: locations along x, sentiment label along y.
plt.figure(figsize=(30, 3))
plt.suptitle('Relationship between user location and sentiment ', fontsize=24)
plt.xlabel('User Location', fontsize=18)
plt.xticks(rotation=90)
plt.ylabel('Sentiment', fontsize=18)
plt.scatter(sentiment_relation['user_location'], sentiment_relation['Classification']);

Here the relationship between the user location and sentiment is displayed.

In [None]:
def sentiment_confidence(x):
    """Clamp a confidence score to the closed interval [0, 1].

    Values above 1 are returned as 1, values below 0 are returned as 0,
    and every other value is returned unchanged.
    """
    lower_bound, upper_bound = 0, 1
    return min(max(x, lower_bound), upper_bound)
# Clamp every confidence score with sentiment_confidence before it is used as the colour value.
sentiment_df["Confidence"] = sentiment_df["Confidence"].apply(sentiment_confidence)

# Same location/sentiment scatter, with each point coloured by its confidence.
plt.figure(figsize=(40, 3))
plt.suptitle('Relationship between user location and sentiment with confidence level of sentiment', fontsize=24)
plt.xlabel('User Location', fontsize=18)
plt.xticks(rotation=90)
plt.ylabel('Sentiment', fontsize=18)
plt.scatter(
    x=sentiment_df["user_location"],
    y=sentiment_df["Classification"],
    c=sentiment_df["Confidence"],
    alpha=.9,
)
colorBar = plt.colorbar()
colorBar.set_label("Confidence", fontsize=18, labelpad=2)
plt.show()

This graph shows the sentiment of the tweets including the confidence level of each sentiment.


# 5. Comments and conclusions.

Most of the tweets analyzed were neutral, as they were reporting facts and had neither a positive nor a negative sentiment.
The sentiment analysis was done for only 300 tweets, as that is the limit imposed by my account at MonkeyLearn.