In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Related Imports

In [None]:
# Install the third-party packages imported in the next cell:
# tweet-preprocessor (`preprocessor`), pyjanitor (`janitor`) and geopy.
!pip install tweet-preprocessor
!pip install pyjanitor
!pip install geopy

In [None]:
import seaborn as sns
import preprocessor as pre_process
#important libraries for preprocessing using NLTK
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download the NLTK resources used below: WordNet (lemmatizer), the stop-word
# list, and the VADER lexicon (sentiment scoring).
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('vader_lexicon')
from sklearn.cluster import KMeans
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import TweetTokenizer
from janitor import then
import re
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [None]:
# Load the vaccination tweets CSV from the Kaggle input directory.
tweets_dataframe = pd.read_csv('../input/pfizer-vaccine-tweets/vaccination_tweets.csv')


In [None]:
# Convert the `hashtags` column from text into real Python lists of tags.
# NOTE(review): assumes each value is the text of a Python list such as
# "['tag_a', 'tag_b']" -- confirm against the CSV.
# Besides removing the surrounding brackets and splitting on ', ', the quote
# characters around every tag are stripped so that tags read `tag_a`, not
# `'tag_a'`. Missing values still become ['nan'].
tweets_dataframe['hashtags'] = tweets_dataframe['hashtags'].apply(
    lambda raw: [tag.strip('\'"') for tag in str(raw).strip('][').split(', ')]
)

# **EDA on the tweet data**

In [None]:
# Preview the first five rows of the dataset.
tweets_dataframe.head(5)

In [None]:
# List the column names available in the dataset.
tweets_dataframe.columns

In [None]:
# Summary statistics for the `user_name` column.
tweets_dataframe['user_name'].describe()

In [None]:
# Number of rows for each distinct value of `user_verified`.
tweets_dataframe['user_verified'].value_counts()

In [None]:
# Bar chart of the number of rows per `user_verified` value.
sns.countplot(x='user_verified',data=tweets_dataframe)

In [None]:
# Number of rows for each distinct value of the `source` column.
print('Different sources of tweeting')
tweets_dataframe['source'].value_counts()

    It is very clear that most of the users tweeting about the vaccine aren't verified by Twitter.



In [None]:
# Preview the first five rows again.
tweets_dataframe.head(5)

We begin by cleaning the tweet text column. The usual noise in tweets is handled by the tweet-preprocessor library, which takes care of URLs, mentions, reserved words, hashtags, emojis, and smileys. Please note that we have a hashtags column, but the hashtags aren't removed from the tweets.

In [None]:
def lower(tweet):
    """Return ``tweet`` converted to lowercase."""
    return tweet.lower()

In [None]:
def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is
    removed from each element of ``words``. The remaining non-empty strings
    are returned as a new list, in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [token for token in stripped if token != '']

In [None]:
# English stop words from NLTK, held in a set for membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(word_list):
    """Return the items of ``word_list`` that are not in ``stop_words``, in order."""
    return list(filter(lambda word: word not in stop_words, word_list))
    

In [None]:
# Shared lemmatizer and tweet-aware tokenizer, created once and reused per call.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = TweetTokenizer()


def lemmatize_text(text):
    """Tokenize ``text`` with ``w_tokenizer`` and return the lemmatized tokens as a list."""
    tokens = w_tokenizer.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:

def clean_tweets(tweet):
    """Return ``tweet`` after passing it through ``pre_process.clean``."""
    return pre_process.clean(tweet)

In [None]:
# Display the raw tweet text column.
tweets_dataframe['text']

In [None]:
# Clean each tweet's text, lower-case the result, and store it in a new column.
preprocessed_tweets = tweets_dataframe.apply(
    lambda row: lower(clean_tweets(row['text'])),
    axis=1,
)
tweets_dataframe['processed_tweets'] = preprocessed_tweets

In [None]:
# Display the cleaned, lower-cased tweet text.
preprocessed_tweets


In [None]:
!pip install wordcloud
from wordcloud import WordCloud

In [None]:
def create_word_cloud(list_of_words, title, max_words=100):
    """Draw a word cloud sized by how often each item occurs in ``list_of_words``.

    Args:
        list_of_words: Iterable of strings (e.g. hashtags) to count.
        title: Title shown above the plot.
        max_words: Maximum number of distinct words drawn in the cloud.
    """
    # Count the words that were passed in, not a same-named global list.
    fdist = FreqDist(list_of_words)
    # 'nan' is the placeholder left by rows that had no hashtags; drop it so it
    # does not show up in the cloud.
    del fdist['nan']
    wc = WordCloud(width=700, height=300, max_words=max_words).generate_from_frequencies(fdist)
    plt.figure(figsize=(12,10))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()
    

In [None]:
# Flatten the per-tweet hashtag lists into one list and plot the most common tags.
list_of_hashtags = [tag for tags in tweets_dataframe['hashtags'] for tag in tags]

create_word_cloud(list_of_hashtags, 'common hashtags')



We will be using the below function to extract the country details from the user location details given in the dataframe. Refer to [Geopy Documentation](https://geopy.readthedocs.io/en/stable/#) for more details on the use of the module.

In [None]:

# Nominatim geocoding client, identified by a custom user-agent string.
geocoder=Nominatim(user_agent='tweet_analysis')
# Rate-limited wrappers that wait at least 1.1 seconds between calls:
# geocoder_1 wraps forward geocoding (`geocode`), geocoder_2 wraps reverse
# geocoding (`reverse`).
geocoder_1 = RateLimiter(geocoder.geocode, min_delay_seconds=1.1)
geocoder_2 = RateLimiter(geocoder.reverse, min_delay_seconds=1.1)

The function below uses the geopy module to get country data from the given user locations.

In [None]:
def get_country(location):
    """Return the country name for a free-text user location, or NaN if unknown.

    The location is first geocoded to coordinates with ``geocoder_1`` and the
    coordinates are then reverse-geocoded with ``geocoder_2``; the country is
    read from the reverse result's address details.

    Args:
        location: User-supplied location text; may be NaN/missing.

    Returns:
        The country string from the reverse lookup, or ``np.nan`` when the
        location is missing or blank, either lookup yields no result, or the
        address carries no country.
    """
    query = str(location)
    # Missing values stringify to 'nan'; blank text cannot be geocoded either,
    # so skip the API calls for both.
    if query == 'nan' or not query.strip():
        return np.nan

    match = geocoder_1(query)
    if match is None:
        return np.nan

    # Pass the coordinates as a (latitude, longitude) pair.
    details = geocoder_2((match.latitude, match.longitude))
    # The reverse lookup can also come back empty; treat it like "not found".
    if details is None:
        return np.nan

    # Not every reverse result has an address with a country entry.
    address = details.raw.get('address') or {}
    return address.get('country', np.nan)

Please note that running the block below takes a lot of time due to multiple API requests. I have run it once and saved the dataset for further analysis.

In [None]:
# # code to get the country tags for the dataset
# tweets_dataframe_test = tweets_dataframe.head(100)
# tweets_dataframe['country'] = tweets_dataframe.apply(lambda row: get_country(row['user_location']), axis=1)
# tweets_dataframe.to_csv('data_with_countries.csv')

I will be directly importing the data with country tags

In [None]:
# Load the previously saved copy of the dataset that already has a `country` column.
tweets_with_countries = pd.read_csv('../input/pfizer-tweet-dataset/data_with_countries.csv')
# Copy the country column over.
# NOTE(review): pandas aligns this assignment on the index, so it assumes both
# frames hold the same rows in the same order -- confirm.
tweets_dataframe['country'] = tweets_with_countries['country']

In [None]:
# Tweet counts for the ten most frequent countries.
top = tweets_dataframe['country'].value_counts().head(10)

In [None]:
# Show the ten most frequent countries and their tweet counts.
top

In [None]:
# Labels and counts for the pie chart: the top countries plus an 'Others'
# bucket holding every remaining row of the dataframe.
top_tweets = list(top)
others_count = len(tweets_dataframe) - sum(top_tweets)
top_countries = [*top.index, 'Others']
top_tweets = [*top_tweets, others_count]

In [None]:
# Pie chart of each top country's share of tweets, drawn on the axes created
# here rather than through pyplot's implicit "current axes".
pie, ax = plt.subplots(figsize=[12,12])
labels = top_countries
ax.pie(x=top_tweets, autopct="%.1f%%", explode=[0.07]*len(top_tweets), labels=labels, pctdistance=0.5)
ax.set_title("Country Share in tweets", fontsize=14);

In [None]:
# List the columns again, now including the ones added above.
tweets_dataframe.columns

This is a time-based analysis to show the varying sentiments over time. A very intriguing and detailed time-based analysis can be found in [this](https://www.kaggle.com/thomaskonstantin/pfizer-vaccine-sentiment-and-time-series-analysis/notebook#notebook-container) notebook.

In [None]:
# Score every cleaned tweet with VADER. The analyzer is built once and applied
# directly to the text column; each result is a dict of scores.
sid = SIA()
sentiments = tweets_dataframe['processed_tweets'].apply(sid.polarity_scores)
tweets_dataframe['sentiments'] = sentiments

# Split the positive / neutral / negative scores into their own columns.
# NOTE(review): a small constant is added to every score -- presumably to keep
# the values strictly positive; confirm the intent.
epsilon = 10 ** -6
tweets_dataframe['Positive Sentiment'] = tweets_dataframe['sentiments'].apply(lambda scores: scores['pos'] + epsilon)
tweets_dataframe['Neutral Sentiment'] = tweets_dataframe['sentiments'].apply(lambda scores: scores['neu'] + epsilon)
tweets_dataframe['Negative Sentiment'] = tweets_dataframe['sentiments'].apply(lambda scores: scores['neg'] + epsilon)



In [None]:
# Kernel density estimate of each sentiment score across all tweets.
# `bw_method` is used instead of the deprecated `bw` argument, and every curve
# is labelled so the legend tells the three distributions apart.
plt.title('Distribution Of Sentiments Across Our Tweets',fontsize=19,fontweight='bold')
sns.kdeplot(tweets_dataframe['Negative Sentiment'], bw_method=0.1, label='Negative Sentiment')
sns.kdeplot(tweets_dataframe['Positive Sentiment'], bw_method=0.1, label='Positive Sentiment')
sns.kdeplot(tweets_dataframe['Neutral Sentiment'], bw_method=0.1, label='Neutral Sentiment')
plt.xlabel('Sentiment Value',fontsize=19)
plt.legend()
plt.show()


In [None]:
# Cumulative distribution of each sentiment score across all tweets.
# `bw_method` is used instead of the deprecated `bw` argument, and every curve
# is labelled so the legend tells the three distributions apart.
plt.subplot(2,1,2)
plt.title('CDF Of Sentiments Across Our Tweets',fontsize=19,fontweight='bold')
sns.kdeplot(tweets_dataframe['Negative Sentiment'], bw_method=0.1, cumulative=True, label='Negative Sentiment')
sns.kdeplot(tweets_dataframe['Positive Sentiment'], bw_method=0.1, cumulative=True, label='Positive Sentiment')
sns.kdeplot(tweets_dataframe['Neutral Sentiment'], bw_method=0.1, cumulative=True, label='Neutral Sentiment')
plt.xlabel('Sentiment Value',fontsize=19)
plt.legend()
plt.show()

For further analysis let's assign an overall sentiment to a tweet which will be the max of the three sentiments given by Vader

In [None]:
# Label each tweet with whichever of the three VADER scores (neg / neu / pos)
# is largest. The scores dict also carries a 'compound' entry; it is excluded
# here so that it can never be chosen as the label. Ties go to the first key
# in the order neg, neu, pos.
tweets_dataframe['overall_sentiment'] = tweets_dataframe['sentiments'].apply(
    lambda scores: max(('neg', 'neu', 'pos'), key=scores.__getitem__)
)
    

Country wise share in sentiments.

In [None]:
# Rows whose overall sentiment label is 'pos'.
tweets_dataframe[tweets_dataframe['overall_sentiment']=='pos']

In [None]:
def country_wise_sentiment_plot(sentiment, top_n=5):
    """Draw a pie chart of country shares among tweets with the given sentiment.

    Args:
        sentiment: Value of ``overall_sentiment`` to filter on (e.g. 'pos').
        top_n: Number of most frequent countries shown as their own slice;
            every other row of the filtered data goes into 'Others'.
    """
    subset = tweets_dataframe[tweets_dataframe['overall_sentiment'] == sentiment]
    top = subset['country'].value_counts().head(top_n)

    labels = [*top.index, 'Others']
    counts = list(top)
    # 'Others' is everything in the subset that is not in a top-country slice.
    counts.append(len(subset) - sum(counts))

    fig, ax = plt.subplots(figsize=[12, 12])
    ax.pie(x=counts, autopct="%.1f%%", explode=[0.07] * len(counts), labels=labels, pctdistance=0.5)
    # Name the sentiment in the title so the per-sentiment charts can be told apart.
    ax.set_title(f"Country Share in '{sentiment}' tweets", fontsize=14)
    plt.show()

In [None]:
# Country share among tweets labelled 'pos'.
country_wise_sentiment_plot('pos')

In [None]:
# Country share among tweets labelled 'neg'.
country_wise_sentiment_plot('neg')

In [None]:
# Country share among tweets labelled 'neu'.
country_wise_sentiment_plot('neu')

In [None]:
# Flatten the hashtag lists of the tweets labelled 'pos' and plot the most common tags.
is_positive = tweets_dataframe['overall_sentiment'] == 'pos'
list_of_hashtags = [tag for tags in tweets_dataframe.loc[is_positive, 'hashtags'] for tag in tags]

create_word_cloud(list_of_hashtags, 'common hashtags for positive sentiment')

In [None]:
# Flatten the hashtag lists of the tweets labelled 'neg' and plot the most common tags.
is_negative = tweets_dataframe['overall_sentiment'] == 'neg'
list_of_hashtags = [tag for tags in tweets_dataframe.loc[is_negative, 'hashtags'] for tag in tags]
create_word_cloud(list_of_hashtags, 'common hashtags for negative sentiment')