In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from wordcloud import WordCloud,STOPWORDS

plt.rc('figure',figsize=(17,13))
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots

In [None]:
!pip install vaderSentiment
!pip install twython

In [None]:
# Load the tweets CSV from the Kaggle input directory into `data`.
data = pd.read_csv('/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv')
# Preview the first rows to check that the file parsed as expected.
data.head()

In [None]:
def clean(text):
    """Strip URLs, bracketed spans, digit-bearing words and HTML-like tags.

    The substitutions run in this order:

    1. remove ``http(s)://...`` and ``www....`` URLs,
    2. collapse every whitespace run (newlines included) into one space,
    3. remove ``[...]`` spans,
    4. remove every word that contains a digit,
    5. remove ``<...>`` tags.

    Whitespace is collapsed *before* steps 3-5, so those removals can leave
    consecutive spaces behind.

    Args:
        text: The string to clean. ``re.sub`` raises ``TypeError`` if this is
            not a ``str`` (for example a missing value).

    Returns:
        The cleaned string.
    """
    # Raw strings are required here: sequences such as '\S', '\[' or '\w' are
    # invalid escapes in a normal string literal and trigger a warning on
    # current Python versions.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # This also removes every newline, so no separate '\n' pass is needed.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'<.*?>+', '', text)
    return text
    


data['text'] = data['text'].apply(clean)
# Imported here rather than in the first cell because the package is only
# installed by the `pip install` cell that runs after the import cell.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()


def label_sentiment(compound, threshold=0.05):
    """Map a VADER compound score to 'Positive', 'Negative' or 'Neutral'.

    Args:
        compound: The 'compound' value returned by ``polarity_scores``.
        threshold: Scores at or above ``threshold`` are positive, scores at or
            below ``-threshold`` are negative, everything in between is neutral.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.
    """
    if compound >= threshold:
        return 'Positive'
    if compound <= -threshold:
        return 'Negative'
    return 'Neutral'


# Iterate over the values themselves: `data['text'][i]` is a label lookup and
# only works while the index happens to be 0..n-1.
scores = [analyser.polarity_scores(tweet)['compound'] for tweet in data['text']]
sentiment = [label_sentiment(score) for score in scores]
# Assigning a plain list is positional, so every label lands on the row it was
# computed from regardless of the frame's index.
data['sentiment'] = sentiment

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for `data`.
data.describe()

In [None]:
# Build one comma-separated corpus from the tweets, skipping any tweet that
# contains one of these substrings (the match is case-sensitive).
excluded_terms = ('COVID', 'https', 'Covid')
text = ",".join(
    review for review in data.text
    if not any(term in review for term in excluded_terms)
)
wordcloud = WordCloud(max_words=200, colormap='Set2', background_color="black").generate(text)

# Draw and title the cloud on one explicitly created figure. The previous
# second `plt.figure(1, figsize=...)` call re-selected figure number 1, which
# is only the word-cloud figure when no other figure is open, and its figsize
# was never applied to the image.
wc_fig, wc_ax = plt.subplots(figsize=(15, 10))
wc_ax.imshow(wordcloud, interpolation='bilinear')
wc_ax.axis("off")
wc_ax.set_title('Prevalent words in tweets', fontsize=19)
plt.show()

In [None]:
# Bar chart: number of tweets from verified vs. unverified accounts.
plt.figure(figsize=(10, 10))
verified_ax = sns.countplot(data=data, x="user_verified", palette="Set1")
verified_ax.set_title("Verified user accounts or not")
# The booleans double as tick positions: False == 0 and True == 1.
plt.xticks(ticks=[False, True], labels=['Unverified', 'Verified'])
plt.show()

In [None]:
# Frequency of each distinct value of the `hashtags` column, most common first.
tags = data['hashtags'].value_counts().sort_values(ascending=False)
top_tags = tags.head(5)
plt.figure(figsize=(8, 8))
# matplotlib requires exactly one explode offset per wedge, so trim the
# offsets when the data holds fewer than five distinct values.
explode = (0, 0.1, 0, 0, 0.01)[:len(top_tags)]
top_tags.plot(kind='pie', title='Top 5 hashtags', autopct='%1.1f%%', shadow=True, explode=explode)
# Render the figure explicitly instead of echoing the Axes repr.
plt.show()

In [None]:
# Correlation heatmap over the numeric and boolean columns only.
# `DataFrame.corr()` no longer drops non-numeric columns silently in
# pandas >= 2.0 and raises on the text columns instead, so select the usable
# columns explicitly. This works on older and newer pandas alike.
numeric_data = (
    data.drop(columns=['id', 'is_retweet'])
    .select_dtypes(include=['number', 'bool'])
)
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_data.corr(), square=True, annot=True)
plt.show()

In [None]:
def clean_text(text):
    """Lower-case ``text`` and delete every ASCII punctuation character.

    Args:
        text: Any object; it is converted with ``str`` first, so non-string
            values are processed as their string form.

    Returns:
        The lower-cased string with all ``string.punctuation`` characters removed.
    """
    lowered = str(text).lower()
    # Character class matching any single punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, '', lowered)
# Normalise every tweet in place: lower-case and strip punctuation.
data['text'] = data['text'].apply(clean_text)

In [None]:
# Display the text column after lower-casing and punctuation removal.
data['text']

In [None]:
# Working frame for the token-level preprocessing; starts with the text column only.
df = pd.DataFrame({'text': data['text']})
def tokenization(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    ``re.split`` produces an empty string whenever the input starts or ends
    with a separator (or is empty). Those empty tokens are dropped so they do
    not show up as a "word" in later frequency counts.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the non-empty tokens, in their original order.
    """
    # Raw string: '\W' is an invalid escape in a normal string literal.
    return [token for token in re.split(r'\W+', text) if token]
df['tokenized'] = df['text'].apply(lambda x: tokenization(x.lower()))
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally;
# this notebook only downloads 'vader_lexicon' — confirm the environment ships it.
stopword = nltk.corpus.stopwords.words('english')
# Set copy of the stop-word list so each membership test is a hash lookup
# instead of a scan of the whole list.
_stopword_set = frozenset(stopword)
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Args:
        text: A list of word tokens.

    Returns:
        A new list holding the tokens that are absent from ``stopword``,
        in their original order.
    """
    return [word for word in text if word not in _stopword_set]

df['No_stopwords'] = df['tokenized'].apply(remove_stopwords)
ps = nltk.PorterStemmer()
def stemming1(text):
    """Return the Porter stem of every token in ``text`` (a list of words)."""
    return list(map(ps.stem, text))
df['stemmed_porter'] = df['No_stopwords'].apply(stemming1)
from nltk.stem.snowball import SnowballStemmer
s_stemmer = SnowballStemmer(language='english')
def stemming2(text):
    """Return the English Snowball stem of every token in ``text`` (a list of words)."""
    return list(map(s_stemmer.stem, text))
df['stemmed_snowball'] = df['No_stopwords'].apply(stemming2)
# NOTE(review): WordNetLemmatizer needs the NLTK 'wordnet' corpus at call time;
# this notebook does not download it — confirm the environment ships it.
wn = nltk.WordNetLemmatizer()
def lemmatizer(text):
    """Return the WordNet lemma of every token in ``text`` (a list of words)."""
    return list(map(wn.lemmatize, text))

df['lemmatized'] = df['No_stopwords'].apply(lemmatizer)

In [None]:
# Compare the raw text with its tokenized, stop-word-free, stemmed and lemmatized forms.
df.head()

In [None]:
# Replace the text column with the lemmatized token lists; from this point on
# each `data['text']` entry is a list of words rather than a single string.
data['text']=df['lemmatized']
data.head()

In [None]:
# Number of tweets per sentiment label, largest group first.
tweets_per_sentiment = data.groupby('sentiment').count()['text']
temp = tweets_per_sentiment.reset_index().sort_values(by='text', ascending=False)
# Shade the table cells by magnitude for a quick visual comparison.
temp.style.background_gradient(cmap='Purples')

In [None]:
# Static bar chart of the sentiment label counts.
plt.figure(figsize=(12, 6))
sns.countplot(data=data, x='sentiment')

# Interactive funnel-area chart of the same distribution, built from `temp`
# (the per-sentiment counts computed in the previous cell).
funnel_title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
funnel_trace = go.Funnelarea(
    text=temp['sentiment'],
    values=temp['text'],
    title=funnel_title,
)
fig = go.Figure(funnel_trace)
fig.show()

In [None]:
# Flatten the per-tweet token lists into one Series of words.
# Iterating over the column values avoids the label-based `data['text'][i]`
# lookup (which only works while the index is 0..n-1) and the inner loop that
# reused the outer loop variable `i`.
all_words = pd.Series(
    [word for tokens in data['text'] for word in tokens],
    dtype=object,
)

# The 30 most frequent words as a two-column frame for plotting.
common_words = (
    all_words.value_counts()
    .head(30)
    .rename_axis('Common Words')
    .reset_index(name='count')
)

fig = px.treemap(common_words, path=['Common Words'], values='count',title='30 Most Common Words In Tweets')
fig.show()

In [None]:
# Tweets without hashtags have a missing value; represent them as an empty list string.
data['hashtags'] = data['hashtags'].fillna('[]')

NO_HASHTAG = 'No Hashtag'
all_hashtags = []
# NOTE(review): assumes each entry is the string form of a Python list,
# e.g. "['a', 'b']" — confirm against the raw CSV.
for raw_tags in data['hashtags']:
    for tag in raw_tags.strip('][').split(', '):
        # Drop the quote characters that surround each list element so the
        # chart labels show the bare hashtag.
        tag = tag.strip("'\"")
        all_hashtags.append(tag if tag else NO_HASHTAG)

all_hashtags = pd.Series(all_hashtags, dtype=object)
hashtag_counts = all_hashtags.value_counts()
# Look the count up by label. The previous positional `[1]` reported the count
# of whichever entry happened to be second most common.
print('There are {} instances of tweets in which No Hashtags were used'.format(hashtag_counts.get(NO_HASHTAG, 0)))

# errors='ignore' keeps this working when every tweet has at least one hashtag.
common_hashtags = (
    hashtag_counts.drop(labels=NO_HASHTAG, errors='ignore')
    .head(30)
    .rename_axis('Common Hashtags')
    .reset_index(name='count')
)
fig = px.treemap(common_hashtags, path=['Common Hashtags'], values='count',title='30 Most Common Hashtags')
fig.show()

In [None]:
# Tweets per client application, most common first. The columns are named
# explicitly because the names produced by `value_counts().reset_index()`
# differ between pandas versions.
data_ = data['source'].value_counts().rename_axis('source').reset_index(name='count')

trace1 = go.Bar(
    # Take the labels from the data itself so every bar is paired with its own
    # count, instead of relying on a hand-typed list staying in the same order
    # and length as the computed counts.
    x=data_['source'],
    y=data_['count'],
    marker=dict(
        color='rgb(250,13,92)',
        line=dict(color='rgb(0,0,0)', width=1.5),
    ),
    text=data_['count'],
    textposition='outside',
)
layout = go.Layout(template= "plotly_dark",title = 'SOURCE DISTRIBUTION OF TWEETS' , xaxis = dict(title = 'SOURCE'), yaxis = dict(title = 'Count'), height=650)
fig = go.Figure(data = [trace1], layout = layout)
fig.show()

In [None]:
# Missing locations are replaced by the literal string 'NaN' so they are counted too.
data['user_location'] = data['user_location'].fillna('NaN')
Positive_tweet = data[data['sentiment']=='Positive'].reset_index()
Negative_tweet = data[data['sentiment']=='Negative'].reset_index()
Neutral_tweet = data[data['sentiment']=='Neutral'].reset_index()
pos_location=Positive_tweet['user_location']
neg_location=Negative_tweet['user_location']
neu_location=Neutral_tweet['user_location']

# Locations that occur in all three sentiment groups; they are excluded below
# so each chart only shows locations that are not shared by every group.
common = set(pos_location) & set(neg_location) & set(neu_location)
common_list = list(common)


def _plot_unique_locations(locations, sentiment_label, top_n=10):
    """Build the treemap of the most frequent non-shared locations for one sentiment.

    Args:
        locations: Series of user locations for tweets of one sentiment.
        sentiment_label: 'Negative', 'Positive' or 'Neutral'; used in the
            column name and the chart title.
        top_n: Number of locations to keep.

    Returns:
        A ``(counts_frame, figure)`` tuple: the two-column frame that was
        plotted and the plotly treemap figure.
    """
    column = 'Common {} Locations'.format(sentiment_label)
    counts = (
        locations.value_counts()
        .drop(labels=common_list)
        .head(top_n)
        .rename_axis(column)
        .reset_index(name='count')
    )
    title = '{} Top Unique {} Tweets Locations'.format(top_n, sentiment_label)
    return counts, px.treemap(counts, path=[column], values='count', title=title)


# Same chart order as before; `common_words` and `fig` end up holding the
# Neutral results.
for sentiment_label, locations in (
    ('Negative', neg_location),
    ('Positive', pos_location),
    ('Neutral', neu_location),
):
    common_words, fig = _plot_unique_locations(locations, sentiment_label)
    fig.show()


In [None]:
data["date"] = pd.to_datetime(data.date)
# Daily number of tweets per sentiment, reshaped to long form
# (one row per date/sentiment pair) for seaborn.
timeline = (
    data.resample('D', on='date')["sentiment"]
    .value_counts()
    .unstack(1)
    .reset_index()
    .melt("date", var_name='sentiment', value_name='vals')
)

sns.set_style("whitegrid")
# Create the wide figure BEFORE plotting so the line plot is drawn on it.
# Calling plt.figure() after the plot left the chart at the default size and
# opened a second, empty figure.
plt.figure(figsize=(40,10))
sns.lineplot(x="date", y="vals", hue="sentiment", data=timeline, palette=["r", "g","b"])