# **Pfizer Vaccine Tweets**

The data is collected from recent tweets about the Pfizer & BioNTech vaccine, using the tweepy Python package to access the Twitter API.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide settings: silence all Python warnings, then choose the seaborn
# colour palette and axes style used by every figure below.
warnings.filterwarnings(action='ignore')
sns.set_palette(palette='Set2')
sns.set_style(style='darkgrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Load the data**

In [None]:
# Read the tweets CSV from the Kaggle input directory, report its
# (rows, columns) shape and preview the first rows.
tweets_csv_path = "/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv"
df = pd.read_csv(tweets_csv_path)
print(df.shape)
df.head()

# **EDA**

## **Types of Twitter accounts**

In [None]:
# Share of rows coming from verified vs. non-verified accounts.
# The boolean flag is mapped to display labels before counting, so a dataset
# that lacks one of the two account types no longer fails with a KeyError
# (which popping the True/False keys out of a dict would do).
account_type_counts = (
    df['user_verified']
    .map({True: 'Verified', False: 'Not-Verified'})
    .value_counts()
    .reindex(['Verified', 'Not-Verified'], fill_value=0)
)
# Drop an absent class instead of drawing an empty wedge with a "0.0%" label.
account_type_counts = account_type_counts[account_type_counts > 0]

pie_labels = account_type_counts.index.tolist()
# Offset only the "Verified" wedge; built per label so its length always
# matches the number of wedges actually drawn.
pie_explode = [0.1 if label == 'Verified' else 0 for label in pie_labels]

plt.figure(figsize=(7,7))
# Plain lists are passed (not dict views) so the call does not rely on
# matplotlib converting view objects for keyword arguments.
plt.pie(
    x=account_type_counts.tolist(),
    labels=pie_labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
    explode=pie_explode,
)
plt.show()

## **Top 5 Sources (Platform) of tweets**

In [None]:
from collections import Counter
# The five most frequent values of the `source` column as (source, count) pairs,
# most common first.
top_sources = pd.DataFrame(
    Counter(df['source'].tolist()).most_common(5),
    columns=['Source', 'Count'],
)

# Percentages on the pie are relative to these five sources only,
# not to the whole dataset.
fig, ax = plt.subplots(figsize=(7,7))
ax.pie(
    x=top_sources['Count'],
    labels=top_sources['Source'],
    autopct='%1.1f%%',
    shadow=True,
    startangle=0,
)
plt.show()

## **Top 15 users by number of tweets**

In [None]:
# Horizontal bar chart of row counts for the 15 most frequent user names.
# value_counts() sorts by frequency, so its first 15 index labels give both
# the selection and the bar order.
most_active_users = df['user_name'].value_counts().head(15).index

fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(y='user_name', data=df, order=most_active_users, ax=ax)
ax.set_xlabel('Number of Tweets', weight='bold')
ax.set_ylabel('User Name', weight='bold')
plt.show()

## **Top 15 locations by number of users**

Note: the location field also contains noise and non-location entries.

In [None]:
# Horizontal bar chart of row counts for the 15 most frequent `user_location`
# values; value_counts() orders them from most to least common.
most_common_locations = df['user_location'].value_counts().head(15).index

fig, ax = plt.subplots(figsize=(12,10))
sns.countplot(y='user_location', data=df, order=most_common_locations, ax=ax)
ax.set_xlabel('Number of User', weight='bold')
ax.set_ylabel('Location', weight='bold')
plt.show()

## **Number of Followers and Retweets for each type of user**

In [None]:
# Box plots of follower and retweet counts, split by verification status.
# Only the three columns that are plotted are taken (rather than copying the
# whole frame), and the boolean flag is relabelled as 'Yes'/'No' for the axis.
verified_labels = (
    df['user_verified']
    .astype('str')
    .replace({'False': 'No', 'True': 'Yes'})
)
df_temp = df[['user_followers', 'retweets']].assign(user_verified=verified_labels)

# (column to plot, x-axis label) for each panel, top to bottom.
panels = [
    ('user_followers', "Number of Followers"),
    ('retweets', "Number of ReTweets"),
]

fig, ax = plt.subplots(2,1,figsize=(20,10))
for panel_ax, (column, x_label) in zip(ax, panels):
    sns.boxplot(y='user_verified', x=column, data=df_temp, ax=panel_ax)
    # 'symlog' instead of 'log': counts can be 0, which a pure logarithmic
    # axis cannot place; symlog is linear around zero and logarithmic beyond.
    panel_ax.set_xscale('symlog')
    panel_ax.set_xlabel(x_label, weight='bold')
    panel_ax.set_ylabel('Verified User?', weight='bold')
plt.show()

## **Number of Hashtags and Length of Text per Tweet**

In [None]:
# Left panel: number of hashtags per tweet; right panel: tweet text length.
# A hashtag count is the number of comma-separated items in the `hashtags`
# string; rows without hashtags are skipped.
hashtags_per_tweet = [len(tags.split(',')) for tags in df['hashtags'].dropna()]
tweet_lengths = df['text'].str.len().tolist()

# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot when upgrading (bin selection may differ).
fig, (ax_hashtags, ax_length) = plt.subplots(1,2, figsize=(18, 10))

sns.distplot(hashtags_per_tweet, kde=False, ax=ax_hashtags, color='red')
ax_hashtags.set_xlabel("Number of Hashtags", weight='bold')
ax_hashtags.set_ylabel('Number of Tweets', weight='bold')

sns.distplot(tweet_lengths, kde=False, ax=ax_length, color='green')
ax_length.set_xlabel("Length of Tweet", weight='bold')
ax_length.set_ylabel('Number of Tweets', weight='bold')
# Only the text-length panel uses a logarithmic count axis.
ax_length.set_yscale('log')
plt.show()

## **Heatmap (correlation) among various numerical features**

In [None]:
# Pairwise correlation between the numeric/boolean features.
# `id` and `is_retweet` are excluded as in the original analysis. The remaining
# columns are restricted to numeric and boolean dtypes explicitly: the frame
# also holds text columns, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it as older versions did.
numeric_features = (
    df.drop(columns=['id','is_retweet'])
    .select_dtypes(include=['number', 'bool'])
)

plt.figure(figsize=(8,8))
sns.heatmap(numeric_features.corr(), square=True, annot=True)
plt.show()

## **WordCloud of Hashtags used**

In [None]:
import itertools
from wordcloud import WordCloud

# Each non-null `hashtags` value is a string such as "['a', 'b']": strip the
# surrounding brackets and the quotes, then split on ', ' to get one list of
# tags per tweet.
hashtags_per_tweet = (
    df['hashtags']
    .dropna()
    .str.lstrip('[')
    .str.rstrip(']')
    .str.replace("'", "")
    .str.split(', ')
    .tolist()
)
# Flatten the per-tweet lists into a single sequence of tags.
all_hashtags = list(itertools.chain.from_iterable(hashtags_per_tweet))

hashtag_cloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(' '.join(all_hashtags))

fig, ax = plt.subplots(figsize=(14,10))
ax.imshow(hashtag_cloud, interpolation="bilinear")
ax.axis("off")
plt.show()

## **WordCloud of text in Tweets**

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import string

# English stop words plus every single punctuation character.
list_stopwords = set(stopwords.words('english') + list(punctuation))
# Translation table that deletes all punctuation characters; built once here
# instead of once per token.
punctuation_table = str.maketrans('', '', string.punctuation)


def _clean_tokens(text):
    """Turn one lower-cased tweet into a list of cleaned tokens.

    Steps, in order: tokenize, drop stop words and pure-punctuation tokens,
    strip punctuation characters from the remaining tokens, and keep only
    tokens longer than one character.
    """
    kept = (token for token in word_tokenize(text) if token not in list_stopwords)
    stripped = (token.translate(punctuation_table) for token in kept)
    return [token for token in stripped if len(token) > 1]


# Work on an explicit copy: assigning columns on a slice of `df` is chained
# assignment, which triggers SettingWithCopyWarning and is not guaranteed to
# behave consistently.
df_temp = df[['text', 'hashtags']].copy()
df_temp['text'] = df_temp['text'].str.lower().apply(_clean_tokens)

In [None]:
# Flatten the per-tweet token lists into one sequence of words and render the
# 100 most prominent ones as a word cloud.
all_tokens = list(itertools.chain.from_iterable(df_temp['text'].tolist()))

text_cloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(' '.join(all_tokens))

fig, ax = plt.subplots(figsize=(14,10))
ax.imshow(text_cloud, interpolation="bilinear")
ax.axis("off")
plt.show()

# **This work is in progress. Feel free to Upvote and give Feedback.**