![](https://images.livemint.com/img/2020/12/02/600x338/AFP_8WB4P7_1606897752999_1606897771548.jpg)

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:gold; border:0' role="tab" aria-controls="home" color=black><center>Quick navigation</center></h3>

* [1. Required Libraries](#1)
* [2. Dataset Quick Overview](#2)
* [3. Features exploration](#3)
* [4. Tweets text analysis](#4)   
* [5. References](#5)

    Kindly, Upvote the notebook!

<a id="1"></a>
<h2 style='background:gold; border:0; color:black'><center>Required Libraries</center></h2>

In [None]:
import numpy as np 
import pandas as pd 
import os
import itertools

#plots
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer

from PIL import Image
from nltk.corpus import stopwords
stop=set(stopwords.words('english'))
from nltk.util import ngrams

import missingno as mno

import re
from collections import Counter

import nltk
from nltk.corpus import stopwords

import requests
import json

import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})

import warnings
warnings.filterwarnings("ignore")

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the vaccination tweets CSV into the main working DataFrame.
tweets_df = pd.read_csv("../input/pfizer-vaccine-tweets/vaccination_tweets.csv")

<a id="2"></a>
<h2 style='background:gold; border:0; color:black'><center>Dataset Quick Overview</center></h2>

# Let's take a quick peek of how the data looks!


In [None]:
# Print the column names, non-null counts and dtypes of the loaded tweets.
tweets_df.info()

* The dataset has around 6821 records with 16 columns.
* The dataset consists of 6 integer variables, 8 object variables and 2 boolean variables


In [None]:
# NOTE: DataFrame.describe() summarises only the numeric columns by default.
# The horizontal box plot below shows how the values of the plottable columns
# are distributed (presumably the int64 and bool columns -- confirm on the output).
print(tweets_df.describe())
sns.boxplot(data=tweets_df, orient="h", palette="Set2")

### Missing values:

In [None]:
# The missingno library gives an overall visualization of where values are
# missing in the data, one column per feature.
mno.matrix(tweets_df)

In [None]:
# Build a two-column table: column name and percentage of missing values.
missed = pd.DataFrame()
missed['column'] = tweets_df.columns

# Percentage of null entries per column, rounded to two decimals.
missed['percent'] = [round(100* tweets_df[col].isnull().sum() / len(tweets_df), 2) for col in tweets_df.columns]
# Largest share first; columns without any missing value are dropped.
missed = missed.sort_values('percent',ascending=False)
missed = missed[missed['percent']>0]

# NOTE: `fig` receives the return value of set_title(), not the Axes itself.
fig = sns.barplot(
    x=missed['percent'], 
    y=missed["column"], 
    orientation='horizontal'
).set_title('Missed values percent for every column')

# Reasons for missing values!

Sometimes a user doesn't add a description to their bio, or posts a tweet without any hashtags!

### Let's Look into the unique values 

In [None]:
def unique_values(data):
    """Summarise, per column, the non-null count and the number of distinct values.

    Returns a transposed frame with one column per input column and the rows
    'Total' (non-null count), 'Uniques' (distinct non-null values) and
    'Percentage' (Uniques / Total, i.e. a ratio that is not multiplied by 100).
    """
    summary = pd.DataFrame(data.count())
    summary.columns = ['Total']
    summary['Uniques'] = [data[name].nunique() for name in data.columns]
    summary['Percentage'] = summary['Uniques'] / summary['Total']
    return summary.T

In [None]:
# Display the per-column totals, distinct counts and their ratio.
unique_values(tweets_df)

### Nearly 66% of the user name, user description are unique and around 37% locations are unique.

## Most frequent values

In [None]:
def most_frequent_values(data):
    """Report the most frequent value of every column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Transposed summary with one column per input column and the rows
        'Total' (non-null count), 'Most frequent item', 'Frequence' (how often
        that item occurs) and 'Percent from total' (Frequence / Total * 100,
        rounded to 3 decimals).

    A column without any non-null value gets NaN as its item, a frequency of 0
    and a NaN percentage instead of raising IndexError.
    """
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        # value_counts() is sorted by descending frequency; compute it once.
        counts = data[col].value_counts()
        if counts.empty:
            items.append(np.nan)
            vals.append(0)
        else:
            items.append(counts.index[0])
            vals.append(counts.iloc[0])
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    tt['Percent from total'] = np.round(vals / total * 100, 3)
    return np.transpose(tt)

In [None]:
# Display the most frequent value of every column and its share of the non-null rows.
most_frequent_values(tweets_df)

In [None]:
# Show the 5th to 7th tweets of the user 'New Straits Times': the same message
# is reused while tagging different people, presumably for higher reach.
tweets_df[tweets_df['user_name']=='New Straits Times']['text'].iloc[4:7]

<a id="3"></a>
<h2 style='background:gold; border:0; color:black'><center>Features exploration</center></h2>

## One of the key aspect is to understand the data thoroughly! Let's start with understanding the distribution of some of the key features !

### Digging into the distribution of the user_name feature!

In [None]:
def plot_count(feature, title, df, size=1, ordered=True):
    """Draw a count plot of ``df[feature]`` with a percentage label above each bar.

    Parameters
    ----------
    feature : str
        Name of the column in ``df`` whose values are counted.
    title : str
        Human-readable name inserted into the plot title.
    df : pandas.DataFrame
        Frame holding the data; its length is the 100% reference.
    size : int or float, default 1
        Width multiplier: the figure is ``4 * size`` inches wide, and the x tick
        labels are rotated when ``size > 2``.
    ordered : bool, default True
        When True only the 20 most frequent values are shown, most frequent
        first; otherwise every value is shown in seaborn's default order.
    """
    f, ax = plt.subplots(1, 1, figsize=(4 * size, 4))
    total = float(len(df))
    # order=None is seaborn's default, i.e. no restriction and no reordering.
    order = df[feature].value_counts().index[:20] if ordered else None
    # Pass the Series as `x=` (the data parameters are keyword-only in current
    # seaborn) and draw explicitly on the Axes that is annotated below.
    g = sns.countplot(x=df[feature], order=order, palette='Set3', ax=ax)
    g.set_title("Number and percentage of {}".format(title))
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Label each bar with its share of all rows, slightly above the bar top.
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

# Count plot of the 20 most frequent user names (size=4 gives a 16-inch-wide figure).
plot_count("user_name", "User name", tweets_df,4)

### Let's visualize the top 20 users by number of tweets


In [None]:
# Number of tweets per user name, most active users first.
ds = tweets_df['user_name'].value_counts().reset_index()
ds.columns = ['user_name', 'tweets_count']
ds = ds.sort_values(['tweets_count'],ascending=False)
# Attach each user's total tweet count to every tweet row; the 'tweets_count'
# column is reused by the hashtag scatter plot later in the notebook.
tweets_df = pd.merge(tweets_df, ds, on='user_name')

# NOTE: `fig` receives the return value of set_title(), not the Axes itself.
fig = sns.barplot( 
    x=ds.head(20)["tweets_count"], 
    y=ds.head(20)["user_name"], 
    orientation='horizontal'
).set_title('Top 20 users by number of tweets') 



### Similarly, let's see how the user_location feature is distributed!

In [None]:
# Count plot of the 20 most frequent user locations.
plot_count("user_location", "User location", tweets_df,4)

### Most of the tweets are from UK(London), Malaysia, India, London and Canada

### Let's proceed with the tweet Source attribute distribution

In [None]:
# Number of tweets per posting source (client application), largest first.
ds = tweets_df['source'].value_counts().reset_index()
ds.columns = ['source', 'count']
ds = ds.sort_values(['count'],ascending=False)

# Only the ten most used sources are plotted.
# NOTE: `fig` receives the return value of set_title(), not the Axes itself.
fig = sns.barplot(
    x=ds.head(10)["count"], 
    y=ds.head(10)["source"], 
    orientation='horizontal', 
).set_title('Top 10 user sources by number of tweets')

### Most of the users prefer twitter ios app compared to android app and web app.

In [None]:
# Parse the account creation timestamp and keep its year.
tweets_df['user_created'] = pd.to_datetime(tweets_df['user_created'])
tweets_df['year_created'] = tweets_df['user_created'].dt.year
# One row per user name so that every account is counted once.
data = tweets_df.drop_duplicates(subset='user_name', keep="first")
# Keep only years after 1970 (presumably filters placeholder/epoch dates -- confirm).
data = data[data['year_created']>1970]
data = data['year_created'].value_counts().reset_index()
data.columns = ['year', 'number']

# NOTE: `fig` receives the return value of set_title(), not the Axes itself.
fig = sns.barplot( 
    x=data["year"], 
    y=data["number"], 
    orientation='vertical'
).set_title('User created year by year')


* 2009 has the highest number of users followed by the year 2020
* The number of users created in 2007 and 2008 is very low.
* A gradual linear decrease is observed from the year 2011 till 2016 and increases significantly in the year 2019 and 2020.

### Total number of tweets for users and number of hashtags in every tweet

In [None]:
# Missing hashtag lists become the literal string '[]'.
tweets_df['hashtags'] = tweets_df['hashtags'].fillna('[]')
# The hashtags are stored as a string; counting comma-separated pieces gives
# the number of hashtags per tweet ...
tweets_df['hashtags_count'] = tweets_df['hashtags'].apply(lambda x: len(x.split(',')))
# ... except for '[]', which splits into one piece but means zero hashtags.
tweets_df.loc[tweets_df['hashtags'] == '[]', 'hashtags_count'] = 0
# 'tweets_count' is the per-user total merged into the frame in the top-users cell.
fig = sns.scatterplot( 
    x=tweets_df['hashtags_count'], 
    y=tweets_df['tweets_count']
).set_title('Total number of tweets for users and number of hashtags in every tweet')

* As the number of tweets per user increases from 0 to 20, there is a substantial decrease in the number of hashtags used.
* Users who post fewer than 10 tweets use anywhere from 0 to a maximum of 11 hashtags!

### Number of hashtags used in each tweet

In [None]:
# How many tweets carry 0, 1, 2, ... hashtags, most common count first.
ds = tweets_df['hashtags_count'].value_counts().reset_index()
ds.columns = ['hashtags_count', 'count']
ds = ds.sort_values(['count'],ascending=False)
# Turn the numeric count into a label such as '2 tags' for the y axis.
ds['hashtags_count'] = ds['hashtags_count'].astype(str) + ' tags'
fig = sns.barplot( 
    x=ds["count"], 
    y=ds["hashtags_count"], 
    orientation='horizontal'
).set_title('Distribution of number of hashtags in tweets')

### Most people who tweeted about the Pfizer COVID vaccine used 0 or 1 hashtag!

## Number of unique users each day!

In [None]:
# Parse the tweet timestamp and work on a date-sorted copy.
tweets_df['date'] = pd.to_datetime(tweets_df['date']) 
df = tweets_df.sort_values(['date'])
# Split the stringified timestamp at the space: first part is the day,
# second part is the time of day.
df['day'] = df['date'].astype(str).str.split(' ', expand=True)[0]
df['time'] = df['date'].astype(str).str.split(' ', expand=True)[1]

# First one row per (day, user) pair, then count the rows per day:
# the result is the number of distinct users tweeting on each day.
ds = df.groupby(['day', 'user_name'])['hashtags_count'].count().reset_index()
ds = ds.groupby(['day'])['user_name'].count().reset_index()
ds.columns = ['day', 'number_of_users']
ds['day'] = ds['day'].astype(str)
fig = sns.barplot( 
    x=ds['day'], 
    y=ds["number_of_users"], 
    orientation='vertical',
).set_title('Number of unique users per day')
# Day labels are long; rotate them so they do not overlap.
plt.xticks(rotation=90)

## Number of tweets per day!

In [None]:
# Calendar day of every tweet.
tweets_df['tweet_date'] = pd.to_datetime(tweets_df['date']).dt.date

# Tweets per day. groupby().size() yields the same ('date', 'count') columns on
# every pandas version, whereas value_counts().to_frame().reset_index() names
# its columns differently from pandas 2.0 on and broke the former rename.
tweet_date = (
    tweets_df.groupby('tweet_date').size()
    .reset_index(name='count')
    .rename(columns={'tweet_date': 'date'})
)
tweet_date['date'] = pd.to_datetime(tweet_date['date'])
tweet_date = tweet_date.sort_values('date', ascending=False)

fig = go.Figure(go.Scatter(x=tweet_date['date'],
                           y=tweet_date['count'],
                           mode='markers+lines',
                           name="Submissions",
                           marker_color='dodgerblue'))

# Use min()/max() for the covered period: the former `sort_values()[0]` was a
# lookup of index label 0, not of the earliest date.
first_day = tweet_date['date'].min()
last_day = tweet_date['date'].max()

fig.update_layout(
    title_text='Tweets per Day : ({} - {})'.format(first_day.strftime("%d/%m/%Y"),
                                                   last_day.strftime("%d/%m/%Y")),
    template="plotly_dark",
    title_x=0.5)

fig.show()

### The highest number of tweets were recorded on Jan 8, 2021!

## Tweet distribution -Hourly

In [None]:
# Hour of day (0-23) taken from the parsed tweet timestamp.
tweets_df['hour'] = tweets_df['date'].dt.hour
# Tweets per hour; value_counts() orders by frequency, so the bars are not
# in chronological order.
ds = tweets_df['hour'].value_counts().reset_index()
ds.columns = ['hour', 'count']
ds['hour'] = 'Hour ' + ds['hour'].astype(str)
fig = sns.barplot( 
    x=ds["hour"], 
    y=ds["count"], 
    orientation='vertical', 
).set_title('Tweets distribution over hours')
plt.xticks(rotation='vertical')


## Top 10 hashtags used!

In [None]:
def split_hashtags(x):
    """Turn the string form of a hashtag list into its comma-separated pieces.

    Square brackets are dropped; quotes and spaces around each piece are kept.
    """
    without_brackets = str(x).translate(str.maketrans('', '', '[]'))
    return without_brackets.split(',')

# Explode the hashtags in a separate frame. Exploding `tweets_df` itself would
# repeat every tweet once per hashtag and distort all later per-tweet analyses
# (word clouds, text length, n-grams).
hashtag_df = tweets_df[['hashtags']].copy()
hashtag_df['hashtag'] = hashtag_df['hashtags'].apply(split_hashtags)
hashtag_df = hashtag_df.explode('hashtag')
# Normalise each hashtag: lower case, without quotes and spaces.
hashtag_df['hashtag'] = (
    hashtag_df['hashtag'].astype(str).str.lower()
    .str.replace("'", '', regex=False)
    .str.replace(" ", '', regex=False)
)
# Tweets without any hashtag end up as an empty string; give them a readable label.
hashtag_df.loc[hashtag_df['hashtag'] == '', 'hashtag'] = 'NO HASHTAG'

# Occurrences per hashtag, most used first.
ds = hashtag_df['hashtag'].value_counts().reset_index()
ds.columns = ['hashtag', 'count']
ds = ds.sort_values(['count'], ascending=False)
top_hashtags = ds.head(10)
fig = sns.barplot(
    x=top_hashtags["count"],
    y=top_hashtags['hashtag'],
    orientation='horizontal',
).set_title('Top 10 hashtags')

<a id="4"></a>
<h2 style='background:gold; border:0; color:black'><center>Tweets text analysis</center></h2>

In [None]:
# NOTE: this rebinds the name `stopwords` that was imported from nltk.corpus
# at the top of the notebook; from here on it is the wordcloud stop-word set.
stopwords = set(STOPWORDS)


def show_wordcloud(data, title = None):
    """Render a word cloud of the words found in ``data``.

    Parameters
    ----------
    data : str or iterable
        A single text, or an iterable (e.g. a pandas Series) whose items are
        converted with ``str`` and joined with spaces. Previously ``str(data)``
        was used, which for a Series is only its truncated printed form (first
        and last rows plus index numbers and the ``Name:``/``dtype:`` footer).
    title : str, optional
        Figure title; omitted when falsy.
    """
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ' '.join(str(item) for item in data)
        except TypeError:
            # Not iterable: fall back to the plain string form.
            text = str(data)

    if not text.strip():
        # WordCloud cannot be generated without any word (e.g. empty selection).
        print('No text available for the word cloud.')
        return

    wordcloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=50,
        max_font_size=40,
        scale=5,
        random_state=1
    ).generate(text)

    fig = plt.figure(1, figsize=(10,10))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

# Let's identify the prevalent words in tweets!!

In [None]:
# Word cloud built from the text column of all tweets.
show_wordcloud(tweets_df['text'], title = 'Prevalent words in tweets')

## Let's identify the prevalent words in tweets from London!

In [None]:
# Tweets whose user_location is exactly "London".
# NOTE: despite its name, `india_df` holds the London tweets here.
india_df = tweets_df.loc[tweets_df.user_location=="London"]
show_wordcloud(india_df['text'], title = 'Prevalent words in tweets from London')

## Let's identify the prevalent words in tweets from UK!

In [None]:
# Tweets whose user_location is exactly "UK".
# NOTE: despite its name, `us_df` holds the UK tweets here.
us_df = tweets_df.loc[tweets_df.user_location=="UK"]
show_wordcloud(us_df['text'], title = 'Prevalent words in tweets from UK')

## Let's identify the prevalent words in tweets from India!

In [None]:
# Tweets whose user_location is exactly "India".
india_df = tweets_df.loc[tweets_df.user_location=="India"]
show_wordcloud(india_df['text'], title = 'Prevalent words in tweets from India')

## Let's identify the prevalent words in tweets from Canada!

In [None]:
# Tweets whose user_location is exactly "Canada".
# NOTE: despite its name, `india_df` holds the Canada tweets here.
india_df = tweets_df.loc[tweets_df.user_location=="Canada"]
show_wordcloud(india_df['text'], title = 'Prevalent words in tweets from Canada')

## Let's identify the prevalent words in tweets from USA!

In [None]:
# Tweets whose user_location is exactly "United States".
us_df = tweets_df.loc[tweets_df.user_location=="United States"]
show_wordcloud(us_df['text'], title = 'Prevalent words in tweets from US')

# Hashtag analysis is crucial, as every user tries to use one or more hashtags in order to reach a wider audience!

# Let's identify the prevalent words in hashtags!

In [None]:
from itertools import chain

# Missing hashtag lists become the placeholder "['None']"; literal '\N'
# markers are removed from the stored strings.
tweets_df['hashtags'] = tweets_df['hashtags'].replace(np.nan, "['None']", regex=True)
tweets_df['hashtags'] = tweets_df['hashtags'].apply(lambda x: x.replace('\\N',''))


def _clean_hashtag_tokens(raw):
    """Split a stringified hashtag list into bare hashtags.

    Brackets, quotes and surrounding spaces are stripped from every piece so
    that "['a'", " 'a'" and "'a']" all count as the same hashtag; empty
    pieces (e.g. from '[]') are dropped. Letter case is preserved.
    """
    pieces = (piece.strip(" []'\"") for piece in raw.split(','))
    return [piece for piece in pieces if piece]


tweets_df['hashtags_individual'] = tweets_df['hashtags'].apply(_clean_hashtag_tokens)
# Distinct hashtags over all tweets.
all_hashtags = set(chain.from_iterable(tweets_df['hashtags_individual']))
print(f"There are totally: {len(all_hashtags)}")


show_wordcloud(tweets_df['hashtags_individual'], title = 'Prevalent words in hashtags')

## Refining the text(Important step) and visualizing with violin plot to understand the distribution of the text

In [None]:
def remove_tag(string):
    """Delete every ``<...>`` markup tag (shortest match, within one line)."""
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
def remove_mention(text):
    """Delete every ``@name`` mention (an ``@`` followed by word characters)."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)
def remove_hash(text):
    """Delete every ``#tag`` hashtag (a ``#`` followed by word characters)."""
    hashtag_pattern = re.compile(r'#\w+')
    return hashtag_pattern.sub('', text)

def remove_newline(string):
    """Replace each run of line breaks with a single space.

    Deleting the line break outright would glue the last word of one line to
    the first word of the next ("first\\nsecond" -> "firstsecond") and corrupt
    later word counts; superfluous spaces are collapsed later by text_strip.
    """
    text = re.sub(r'[\r\n]+', ' ', string)
    return text
def remove_url(string):
    """Delete every http:// or https:// URL from ``string``.

    The pattern is written as a raw string: in a normal string literal the
    sequences ``\\(`` and ``\\)`` are invalid escapes, which Python reports
    with a warning. The regular expression itself is unchanged.
    """
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',string)
    return text
def remove_number(text):
    """Delete every run of ASCII digits from ``text``."""
    digits_pattern = re.compile(r'[0-9]+')
    return digits_pattern.sub('', text)
def remove_punct(text):
    """Delete the punctuation characters listed in the character class.

    Uses ``+`` instead of ``*``: with ``*`` the pattern also matched the empty
    string at every position, causing needless replacements without changing
    the result. ``<`` and ``>`` are not part of the class and are kept.
    """
    line = re.sub(r'[!"\$%&\'()*+,\-.\/:;=#@?\[\\\]^_`{|}~]+','',text)
    return line
def text_strip(string):
    """Trim ``string`` and collapse every run of two or more whitespace
    characters into a single space.

    The pattern is a raw string: ``\\s`` in a normal string literal is an
    invalid escape sequence, which Python reports with a warning.
    """
    line=re.sub(r'\s{2,}', ' ', string.strip())
    return line
def remove_thi_amp_ha_words(string):
    """Replace the stand-alone words 'amp', 'thi' and 'ha' with a space."""
    leftover_words = re.compile(r'\bamp\b|\bthi\b|\bha\b')
    return leftover_words.sub(' ', string)

In [None]:
# Build the cleaned text column step by step; the order matters (mentions and
# hashtags are removed before punctuation deletes their '@' / '#' markers,
# and URLs before digits and punctuation break them apart).
tweets_df['refine_text']=tweets_df['text'].str.lower()
# str(x) guards the first steps against non-string entries such as NaN.
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_tag(str(x)))
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_mention(str(x)))
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_hash(str(x)))
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_newline(x))
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_url(x))
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_number(x))
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_punct(x))
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:remove_thi_amp_ha_words(x))
# Last step: trim and collapse the gaps left behind by the removals above.
tweets_df['refine_text']=tweets_df['refine_text'].apply(lambda x:text_strip(x))

# Text length is measured in whitespace-separated words, not characters.
tweets_df['text_length']=tweets_df['refine_text'].str.split().map(lambda x: len(x))

In [None]:
# Violin plot of the number of words per cleaned tweet, with the embedded box
# plot and the mean line switched on.
fig = go.Figure(data=go.Violin(y=tweets_df['text_length'], box_visible=True, line_color='black',
                               meanline_visible=True, fillcolor='royalblue', opacity=0.6,
                               x0='Tweet Text Length'))

fig.update_layout(yaxis_zeroline=False,title="Distribution of Text length",template='ggplot2')
fig.show()

* Average length of the Covid vaccine tweet: 10.93
* Median length of the Covid vaccine tweet:11
* Interquartile range lies between: 7 and 15
* Min: 1
* Max: 27

## Listing below the top N-gram sequential words used in Covid Vaccine tweets

In [None]:
def ngram_df(corpus,nrange,n=None):
    """Return the most frequent n-grams of ``corpus`` as a DataFrame.

    ``nrange`` is handed to CountVectorizer as ``ngram_range`` and English stop
    words are excluded. The result has the columns 'text' and 'count', sorted
    by descending count and cut to the first ``n`` rows (all rows when ``n``
    is None).
    """
    vectorizer = CountVectorizer(stop_words = 'english',ngram_range=nrange).fit(corpus)
    # Column-wise sum of the document-term matrix = total count per n-gram.
    column_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(term, column_totals[0, col]) for term, col in vectorizer.vocabulary_.items()],
        key = lambda pair: pair[1],
        reverse=True,
    )
    return pd.DataFrame(ranked[:n],columns=['text','count'])
# Top 20 unigrams, bigrams and trigrams of the cleaned tweet text.
unigram_df=ngram_df(tweets_df['refine_text'],(1,1),20)
bigram_df=ngram_df(tweets_df['refine_text'],(2,2),20)
trigram_df=ngram_df(tweets_df['refine_text'],(3,3),20)

fig = make_subplots(
    rows=3, cols=1,subplot_titles=("Unigram","Bigram",'Trigram'),
    specs=[[{"type": "scatter"}],
           [{"type": "scatter"}],
           [{"type": "scatter"}]
          ])

# One horizontal bar chart per n-gram size, one subplot row each.
ngram_frames = (("Unigram", unigram_df), ("Bigram", bigram_df), ("Trigram", trigram_df))
for row, (label, frame) in enumerate(ngram_frames, start=1):
    # Reverse the rows so the most frequent n-gram is drawn at the top. The bar
    # labels come from the same reversed frame; previously the un-reversed
    # counts were used, so every bar showed the count of another n-gram.
    ordered = frame[::-1]
    fig.add_trace(go.Bar(
        y=ordered['text'],
        x=ordered['count'],
        marker={'color': "blue"},
        text=ordered['count'],
        textposition = "outside",
        orientation="h",
        name=label,
    ),row=row,col=1)

fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_layout(title_text='Top N Grams',xaxis_title=" ",yaxis_title=" ",
                  showlegend=False,title_x=0.5,height=1200,template="plotly_dark")
fig.show()

* Vaccine, covid and dose are the most used unigrams
* covid vaccine, second dose, dose vaccine are the most used bi-grams
* pfizer covid vaccine, pfizer bio and tech covid vaccine, and red crescent announced were the most used tri-grams!

<a id='5'></a>

## References:

* https://www.kaggle.com/kaushiksuresh147/covid-vaccine-eda
* https://www.kaggle.com/gpreda/coronavirus-covid-19-tweets
* https://www.kaggle.com/kaushiksuresh147/ipl2020-twitter-analysis-eda


<h2 style='background:black; border:0; color:gold'><center> Post your suggestions in the comments. Thank you!!</center></h2>