In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# Import Data & Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Location of the tweets CSV, relative to the notebook's working directory.
url='../input/pfizer-vaccine-tweets/vaccination_tweets.csv'
# Load the CSV into `df`, the frame the preprocessing cells below extend.
df=pd.read_csv(url)
# Preview the first rows.
df.head()

In [None]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

In [None]:
# List the raw column names available for feature engineering.
df.columns

# Data Preprocessing

## Verified Account or Not 

In [None]:
# Replace the flag with a readable label; every value that does not equal True
# is labelled 'not_verified'.
is_verified = df['user_verified'] == True  # noqa: E712 - element-wise comparison
df['user_verified'] = np.where(is_verified, 'verified', 'not_verified')

## Account Age 

In [None]:
from datetime import date

# Account age in whole calendar years, measured against the year the notebook runs.
#
# The creation year is kept in a local Series instead of overwriting
# `user_created`: once that column holds integer years, parsing it again with
# `pd.to_datetime` would treat the integers as epoch offsets and silently
# produce wrong ages when the cell is re-run.
created_year = pd.to_datetime(df['user_created']).dt.year
df['today'] = date.today().year
df['acc_age'] = df['today'] - created_year

## Days of December

In [None]:
# Show the latest and then the earliest value of the 'date' column.
# The column has not been converted with pd.to_datetime at this point.
for pick in (max, min):
    print(pick(df['date']))

In [None]:
# Convert the 'date' column to datetimes so the `.dt` accessor can be used below.
df['date']=pd.to_datetime(df['date'])

In [None]:
# Calendar components derived from the tweet timestamp, in output column order.
L = ['year', 'month', 'day', 'dayofweek', 'dayofyear', 'weekofyear', 'quarter']


def _date_part(timestamps, part):
    """Return one calendar component of a datetime Series.

    `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0, so
    the ISO week number is read from `dt.isocalendar()` instead (it comes back
    as a nullable unsigned integer Series). Every other component is read
    straight from the `.dt` accessor.
    """
    if part == 'weekofyear':
        return timestamps.dt.isocalendar().week
    return getattr(timestamps.dt, part)


# `assign` overwrites existing columns, so re-running the cell is safe
# (`df.join` raises on the overlapping column names the second time).
df = df.assign(**{part: _date_part(df['date'], part) for part in L})

## Total Engagement

In [None]:
# Engagement of a tweet = number of retweets plus number of favorites.
df['total_engagement'] = df['retweets'].add(df['favorites'])

## Tweet Length

A tweet can be no longer than 280 characters,
so let's check the tweet lengths.

In [None]:
# Look at the raw text of the first two tweets.
df['text'].values[:2]

In [None]:
# Number of characters in each tweet (the column keeps its original spelling).
character_counts = df['text'].map(len)
df['tweet_lenght'] = character_counts
# Summary statistics of the character counts.
df['tweet_lenght'].describe()

## Tweet Length Class
**short < mean < long**

In [None]:
# Tweets of at most 130 characters are 'short', everything else is 'long'.
is_short = df['text'].map(len) <= 130
df['tweet_length'] = np.where(is_short, 'short', 'long')

## Country location

In [None]:
 df['user_location'].value_counts()

# Fix Country location

In [None]:
# Split each location on commas into separate columns and give the first two
# pieces descriptive names; any further pieces keep their integer labels.
loc_df = (
    df['user_location']
    .str.split(',', expand=True)
    .rename(columns={0: 'first_loc', 1: 'second_loc'})
)

### Fixing States with countries Shortcuts

In [None]:
# Remove the whitespace left around the second location piece by the comma split.
loc_df['second_loc'] = loc_df['second_loc'].str.strip()

# Map states, provinces, cities and long-form names onto one label per country.
state_fix = {
    # United States
    'TX': 'USA', 'NY': 'USA', 'FL': 'USA', 'GA': 'USA', 'IL': 'USA',
    'WA': 'USA', 'NC': 'USA', 'MA': 'USA', 'OH': 'USA', 'MO': 'USA',
    'AZ': 'USA', 'NJ': 'USA', 'CA': 'USA', 'DC': 'USA', 'PA': 'USA',
    'SC': 'USA', 'VA': 'USA', 'TN': 'USA', 'CO': 'USA', 'New York': 'USA',
    # Canada ('AB' is the abbreviation of Alberta; it was previously mapped to USA)
    'Ontario': 'Canada', 'ON': 'Canada', 'Alberta': 'Canada', 'AB': 'Canada',
    'British Columbia': 'Canada',
    # United Kingdom
    'England': 'UK', 'Watford': 'UK',
    # United Arab Emirates
    'United Arab Emirates': 'UAE', 'Dubai': 'UAE',
}
loc_df = loc_df.replace({"second_loc": state_fix})

# The 20 most frequent normalised locations.
loc_df['second_loc'].value_counts().head(20)

## Hashtags Count

In [None]:
# Number of '#' characters in each tweet.
df['Hash'] = [tweet.count('#') for tweet in df['text']]

## Mentions Count

In [None]:
# Number of '@' characters in each tweet.
df['Men'] = [tweet.count('@') for tweet in df['text']]

 ## Does tweet have media or not 

In [None]:
# A tweet is labelled 'Media' when its text contains at least one
# 'https://t.co/' link, and 'No Media' otherwise.
# NOTE(review): t.co links presumably also cover ordinary web links, not only
# photos/videos - confirm before interpreting this as "has media".
contains_link = df['text'].map(lambda tweet: 'https://t.co/' in tweet)
df['med'] = np.where(contains_link, 'Media', 'No Media')

# Account Followers Class

In [None]:
# Distribution of follower counts, used to pick the class boundaries below.
df['user_followers'].value_counts()

In [None]:
# Bucket accounts by follower count:
#   <= 100 'weak', <= 1000 'norm', <= 10000 'strong', anything else 'influencer'.
# np.select takes the first matching condition, so the upper bounds are enough.
followers = df['user_followers']
df['acc_class'] = np.select(
    [followers <= 100, followers <= 1000, followers <= 10000],
    ['weak', 'norm', 'strong'],
    default='influencer',
)
df.head()

# Transform Data Frame

In [None]:
# Column names after feature engineering, before the frame is narrowed down.
df.columns

In [None]:
# Columns kept for the visualisation and sentiment sections.
analysis_columns = [
    'user_name', 'text', 'date', 'acc_age', 'user_verified',
    'retweets', 'favorites', 'total_engagement', 'day', 'tweet_length',
    'Hash', 'Men', 'med', 'acc_class', 'month',
]
df = df[analysis_columns]
# Independent copy: `df` is reassigned in the NLP section, `df_copy` is not.
df_copy = df.copy()
df.head()

# Data Visualization

In [None]:
# Correlate the numeric columns only. Since pandas 2.0 `DataFrame.corr()` no
# longer drops non-numeric columns on its own and raises on the string and
# datetime columns of `df`; selecting the numeric dtypes first works on every
# pandas version.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10,7))
sns.heatmap(corr, annot=True);

## Length Class 

In [None]:
# Number of tweets in each length class.
length_fig, length_ax = plt.subplots(figsize=(7, 7))
sns.countplot(x='tweet_length', data=df, ax=length_ax);

## Tweet length / Hashtags

In [None]:
# Hashtag count (x) against tweet length class (y).
hash_fig, hash_ax = plt.subplots(figsize=(7, 7))
sns.barplot(x='Hash', y='tweet_length', data=df, ax=hash_ax);

## Tweet length / Mentions

In [None]:
# Mention count (y) for each tweet length class (x).
mention_fig, mention_ax = plt.subplots(figsize=(7, 7))
sns.barplot(x='tweet_length', y='Men', data=df, ax=mention_ax);

# Verified Accounts

In [None]:
# Number of tweets per verification label.
df['user_verified'].value_counts()

In [None]:
labels = 'not_verified', 'verified'
# Take the slice sizes from the data instead of hard-coding them, so the chart
# stays correct when the dataset is refreshed. A label that does not occur
# contributes a zero-sized slice.
verification_counts = df['user_verified'].value_counts()
sizes = [int(verification_counts.get(label, 0)) for label in labels]
explode = (0.1, 0)  # pull the first slice slightly out of the pie
plt.figure(figsize=(10,5))
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90);
plt.axis('equal');

**Only 14.5% of the tweets come from verified accounts.**

# Account Class

In [None]:
# Total engagement per account class, with `acc_class` kept as a regular column.
class_eng = df.groupby('acc_class', as_index=False)['total_engagement'].sum()

In [None]:
# Bar chart of the summed engagement per account class; bar colour encodes the
# same total on the Rainbow scale.
fig = px.bar(
    class_eng,
    x='acc_class',
    y='total_engagement',
    title='Engagement By Account_Class',
    color='total_engagement',
    color_continuous_scale='Rainbow',
)
fig.show()

## Media / No Media

In [None]:
# Count the tweets carrying each media label.
media_labels = df['med']
Media = int((media_labels == 'Media').sum())
No_Media = int((media_labels == 'No Media').sum())
Platform = ['Media', 'No Media']
Count = [Media, No_Media]

# Pie chart of the two counts, with percentage and label drawn inside each slice.
fig = px.pie(
    names=Platform,
    values=Count,
    title='Media/No Media',
    color_discrete_sequence=px.colors.sequential.Rainbow,
)
fig.update_traces(textposition='inside', textinfo='percent+label')

## Engagement with respect to Date

In [None]:
# Summed engagement for every distinct value of 'date', drawn as a line with markers.
line = df.groupby('date', as_index=False)['total_engagement'].sum()
engagement_trace = go.Scatter(
    x=line['date'],
    y=line['total_engagement'],
    mode='lines+markers',
)
fig = go.Figure()
fig.add_trace(engagement_trace)

## Days of December

In [None]:
# Restrict to tweets whose month is 12 and total the engagement per day of month.
december = df.loc[df['month'].eq(12)]
day_december = december.groupby('day', as_index=False)['total_engagement'].sum()

# Marker size and colour both encode the day's total engagement.
fig = px.scatter(
    day_december,
    x='day',
    y='total_engagement',
    size='total_engagement',
    color='total_engagement',
    color_continuous_scale='Rainbow',
    title='Most engaged days in December',
)
fig.show()


## Most of the engagements were from 12 to 14 December.

## Accounts per Engagements

In [None]:
def _top_accounts(frame, metric, limit=10):
    """Sum `metric` per `user_name` and return the `limit` largest totals, highest first."""
    totals = frame.groupby('user_name', as_index=False).agg({metric: 'sum'})
    return totals.sort_values(metric, ascending=False).head(limit)


# Top-10 accounts by retweets, favorites and combined engagement.
ret = _top_accounts(df, 'retweets')
like = _top_accounts(df, 'favorites')
tot_eng = _top_accounts(df, 'total_engagement')

# Only the combined engagement ranking is plotted.
fig = px.bar(
    tot_eng,
    x='user_name',
    y='total_engagement',
    title='Accounts per Engagements',
    color='total_engagement',
    color_continuous_scale='Viridis',
)
fig.show()

## Account age per Engagement

In [None]:
# Total engagement for each account age (in years).
age = df.groupby('acc_age', as_index=False).agg({'total_engagement': 'sum'})
# `labels` is keyed by column name; the previous keys 'x' and 'y' matched no
# column, so the axis titles were never renamed.
px.line(age, x='acc_age', y='total_engagement',
        labels={'acc_age': 'age', 'total_engagement': 'engagement'})

In [None]:
import seaborn as sns 
# Number of tweets per account age, taken from the untouched copy of the frame.
age_fig, age_ax = plt.subplots(figsize=(14, 7))
sns.countplot(x='acc_age', data=df_copy, ax=age_ax);

In [None]:
# The 20 most frequent normalised locations as a two-column frame:
#   'index'      -> location name
#   'second_loc' -> number of tweets
# The columns are named explicitly because the names produced by
# `value_counts().reset_index()` changed in pandas 2.0 ('second_loc'/'count'
# instead of 'index'/'second_loc'), which breaks the `df3['index']` lookup in
# the choropleth cell.
top_locations = loc_df['second_loc'].value_counts().head(20)
df3 = pd.DataFrame({
    'index': top_locations.index,
    'second_loc': top_locations.values,
})
df3

## Location/Tweets

In [None]:
# World map coloured by the number of tweets per location.
# `df3['index']` holds the location names and `df3['second_loc']` the counts.
fig = px.choropleth(df3, locations = df3['index'],
                    color = df3['second_loc'],locationmode='country names',
                    # Show the country name on hover; the count is already the colour value.
                    hover_name = df3['index'],
                    color_continuous_scale = px.colors.sequential.Inferno)
# The previous title, 'Sales tracking', was unrelated to this chart.
fig.update_layout(title='Tweets by user location')
fig.show()

# Tweets NLP Analysis

## Get keywords out of tweets

In [None]:
# Keep a handle on the tweet texts; `df` itself is reassigned later in this section.
tweets = df['text']

In [None]:
# Every tweet text, in order.
all_sentences = list(tweets)

# Flatten the tweets into one list of whitespace-separated tokens.
lines = [token for sentence in all_sentences for token in sentence.split()]

## Removing Punctuation

In [None]:
import re

# Delete every run of characters that is not an ASCII letter or digit.
non_alphanumeric = re.compile(r'[^A-Za-z0-9]+')
lines = [non_alphanumeric.sub('', token) for token in lines]

# Drop the tokens that became empty after the clean-up.
lines2 = [token for token in lines if token != '']

## Getting Words roots

In [None]:
# Reduce every cleaned token to its stem.
from nltk.stem.snowball import SnowballStemmer

# The Snowball stemmer is constructed for one specific language.
s_stemmer = SnowballStemmer(language='english')

stem = list(map(s_stemmer.stem, lines2))

## Top Mention Keywords


In [None]:
import spacy
# Load the large English pipeline; the next cell reads its stop-word list
# through `nlp.Defaults.stop_words`.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Keep only the stems that are not in spaCy's default stop-word collection.
stop_words = nlp.Defaults.stop_words
stem2 = [token for token in stem if token not in stop_words]

In [None]:
# Frequency of every remaining stem, most common first.
# Note: this rebinds `df`; from here on it is no longer the tweets frame
# (use `df_copy` or `tweets` for that).
keyword_frame = pd.DataFrame(stem2)
df = keyword_frame[0].value_counts()

In [None]:
# Plot the 20 most frequent stems (counts on x, stems on y).
df = df.head(20)
px.bar(df, x=df.values, y=df.index, color=df.index, height=500)

## Top Mention Organizations

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Rebind `nlp` to the small English pipeline (it is loaded again, by name, two cells below).
nlp = en_core_web_sm.load()

In [None]:
def show_ents(doc):
    """Print one line per entity in ``doc.ents``: text, label and label explanation.

    Nothing is printed when ``doc.ents`` is empty. The explanation comes from
    ``spacy.explain`` and is converted with ``str`` before printing.
    """
    if not doc.ents:
        return
    for entity in doc.ents:
        explanation = str(spacy.explain(entity.label_))
        print(entity.text + ' - ' + entity.label_ + ' - ' + explanation)

In [None]:
# Load the small English pipeline by name.
nlp = spacy.load('en_core_web_sm') 
# Raise the pipeline's text-length limit; the next cell passes all tokens
# joined into one string to `nlp`.
nlp.max_length = 2000000000000

In [None]:
# Join all cleaned tokens into one space-separated text and run it through the pipeline.
str1 = " "
stem2 = nlp(str1.join(lines2))

# (text, label) for every entity found in the text.
label = [(entity.text, entity.label_) for entity in stem2.ents]
df6 = pd.DataFrame(label, columns=['Word', 'Entity'])

# Frequency of every entity text labelled 'ORG'.
is_org = df6['Entity'] == 'ORG'
df7 = df6.loc[is_org, 'Word'].value_counts()

In [None]:
# Plot the 20 most frequent organisation entities (counts on x, names on y).
# The chart is drawn by plotly, so no matplotlib figure is opened: the earlier
# `plt.figure(...)` call only produced an empty extra figure in the output.
df = df7.head(20)
px.bar(df, x=df.values, y=df.index, color=df.index, height=500)


# Top Mention People 

In [None]:
# Reload the small English pipeline and raise its text-length limit.
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 2000000000000

# Join all cleaned tokens into one space-separated text and run it through the pipeline.
str1 = " "
stem2 = nlp(str1.join(lines2))

# (text, label) for every entity found in the text.
label = [(entity.text, entity.label_) for entity in stem2.ents]
entity_frame = pd.DataFrame(label, columns=['Word', 'Entity'])

# Blank out every row that is not a 'PERSON' entity, then count the remaining texts.
df10 = entity_frame.where(entity_frame['Entity'] == 'PERSON')
df11 = df10['Word'].value_counts()

In [None]:
# Plot the 20 most frequent person entities (counts on x, names on y).
# The slice was previously taken twice and two empty matplotlib figures were
# opened; the chart itself is drawn by plotly, so neither is needed.
df = df11.head(20)
px.bar(df, x=df.values, y=df.index, color=df.index, height=500)


# Sentiments Analysis

## Removing all characters from text 

In [None]:
# Raw tweet texts as an array; this is the input of the cleaning step below.
features=tweets.values
features

In [None]:
def _clean_tweet(raw_text):
    """Normalise one tweet for sentiment scoring.

    Steps, in order: drop http(s) URLs, replace every non-word character with a
    space, remove single letters standing alone between whitespace, remove a
    single letter at the very start, collapse whitespace runs to one space,
    strip a leading standalone 'b', and lowercase the result.

    Parameters
    ----------
    raw_text : object
        The tweet; it is converted with ``str`` before processing.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # Raw string: '\S' inside a normal string literal is an invalid escape sequence.
    text = re.sub(r'https?://\S+', '', str(raw_text))

    # Remove all the special characters
    text = re.sub(r'\W', ' ', text)

    # Remove all single characters surrounded by whitespace
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Remove a single character from the start. The pattern used to be
    # r'\^[a-zA-Z]\s+', which looks for a literal '^' and therefore never
    # matched once the special characters had been replaced.
    text = re.sub(r'^[a-zA-Z]\s+', '', text)

    # Substitute multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove a prefixed 'b'
    text = re.sub(r'^b\s+', '', text)

    # Convert to lowercase
    return text.lower()


# Cleaned text for every tweet, in the same order as `features`.
processed_features = [_clean_tweet(tweet) for tweet in features]

In [None]:
# First five raw tweets, for comparison with the cleaned versions.
features[:5]

In [None]:
# The same five tweets after cleaning.
processed_features[:5]

# Adding Subjectivity & Polarity

In [None]:
# Start a new frame for the sentiment analysis with one cleaned tweet per row.
# Note: this rebinds `df3`, which held the location counts before.
df3 = pd.DataFrame({'Tweets': processed_features})

In [None]:
from textblob import TextBlob
from wordcloud import WordCloud
def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity


def getPolarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


# Score every cleaned tweet and store the results in two new columns.
cleaned_tweets = df3['Tweets']
df3['Subjectivity'] = cleaned_tweets.map(getSubjectivity)
df3['Polarity'] = cleaned_tweets.map(getPolarity)

# Show the frame with the new 'Subjectivity' and 'Polarity' columns.
df3

# Create Sentiment Analysis 

In [None]:
def getAnalysis(score):
    """Map a polarity score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero and
    'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'
# Label every tweet from its polarity score and show the result.
polarity_scores = df3['Polarity']
df3['Analysis'] = polarity_scores.map(getAnalysis)
df3

In [None]:
# Number of tweets per sentiment label.
sentiment_labels = df3['Analysis']
Neutral = int((sentiment_labels == 'Neutral').sum())
Negative = int((sentiment_labels == 'Negative').sum())
Positive = int((sentiment_labels == 'Positive').sum())
labels = ['Negative', 'Positive', 'Neutral']
values = [Negative, Positive, Neutral]

import plotly.graph_objects as go
# One colour per entry of `labels`, in the same order.
colors = ['darkred', 'green', 'darkblue']

sentiment_trace = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[sentiment_trace])
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='percent',
    textfont_size=20,
    textposition='inside',
    marker=dict(colors=colors, line=dict(color='grey', width=1)),
)
fig.show()

In [None]:
# Attach the sentiment label to the tweets frame (the assignment aligns on the index).
df_copy['sentiment'] = df3['Analysis']

# Tweet count per account class, split by sentiment.
plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")
sns.countplot(x='acc_class', hue='sentiment', data=df_copy, palette="rocket")

**The norm account class has the highest counts of both positive and negative tweets, followed by the strong class.**

In [None]:
# Total engagement per sentiment label, drawn as horizontal bars.
sen_eng = df_copy.groupby('sentiment', as_index=False)['total_engagement'].sum()
fig = px.bar(
    sen_eng,
    x='total_engagement',
    y='sentiment',
    color='total_engagement',
    hover_data=['total_engagement'],
    height=400,
)
fig.show()

**Tweets with positive sentiment account for close to 40k total engagements, followed by neutral and then negative tweets.**

# Conclusion 

* Most of the tweets are long
* 85% of the accounts aren't verified
* Short tweets have more hashtags and fewer mentions
* Influencer accounts get more engagement
* Pfizer BioNTech has the highest count among organizations, followed by COVID 19
* Moderna has the highest count among entities tagged as people
* 94% of tweets have media
* December 12 is the most engaged day
* Accounts that have been on Twitter for almost 12 years have the highest engagement, at 20.752k
* Anita Anand gets the highest engagement
* One year and three years are the most common account ages
* The USA is the country with the most tweets, followed by Canada
* Only 10% of tweets are negative
* Positive and neutral tweets get high engagement