In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then echo them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Imports and Installs

In [None]:
# Imports
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import re
from wordcloud import WordCloud, STOPWORDS
import ast
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from IPython.display import Image


# Read Data

In [None]:
# CSV with the vaccination tweets, as mounted under the Kaggle input directory.
TWEETS_CSV_PATH = '/kaggle/input/pfizer-vaccine-tweets/vaccination_tweets.csv'

# Raw tweets; the cells below add derived columns to this frame.
tweet_df = pd.read_csv(TWEETS_CSV_PATH)


In [None]:
# Patterns used by `_clean_tweet`, compiled once. All are raw strings: the
# handle pattern used to be written as '@[^\s]+' in a normal string literal,
# which relies on an invalid "\s" escape sequence.
_HANDLE_RE = re.compile(r'@[^\s]+')               # twitter handles
_HASHTAG_RE = re.compile(r'\B#\S+')               # hashtags
_URL_RE = re.compile(r"http\S+")                  # URLs
_WORD_RE = re.compile(r'\w+')                     # runs of word characters
_SINGLE_CHAR_RE = re.compile(r'\s+[a-zA-Z]\s+')   # a lone letter between spaces
_SPACES_RE = re.compile(r'\s+')                   # any run of whitespace


def _clean_tweet(raw_text):
    """Return a lower-cased, stripped-down copy of one tweet.

    Steps, in order: lower-case, drop @handles, drop #hashtags, drop URLs,
    keep only word characters (joined by single spaces), drop isolated single
    letters, collapse repeated whitespace.

    Parameters
    ----------
    raw_text : str
        Original tweet text.

    Returns
    -------
    str
        Cleaned text; may be empty.
    """
    cleaned = raw_text.lower()
    cleaned = _HANDLE_RE.sub('', cleaned)
    cleaned = _HASHTAG_RE.sub('', cleaned)
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = ' '.join(_WORD_RE.findall(cleaned))
    # Replace with a space, not '': the match swallows the whitespace on both
    # sides of the letter, so an empty replacement glued the neighbouring
    # words together ("got a jab" -> "gotjab").
    cleaned = _SINGLE_CHAR_RE.sub(' ', cleaned)
    return _SPACES_RE.sub(' ', cleaned)


# Datetime Conversion: keep only the calendar date so tweets can be grouped per day.
tweet_df["date"] = pd.to_datetime(tweet_df["date"]).dt.date

# Cleaned text used for sentiment scoring. Missing tweet texts become empty
# strings instead of raising inside the regex calls.
tweet_df["ctext"] = tweet_df["text"].fillna("").astype(str).apply(_clean_tweet)

# Chronological order; reassigning (rather than inplace=True) keeps the cell
# safe to re-run.
tweet_df = tweet_df.sort_values("date")
tweet_df.head()

# Data Visualization


In [None]:
# Number of tweets per calendar day, as a frame with columns "date" and "text".
tweets_by_day = tweet_df.groupby("date").agg({"text": "count"})
date_wise_count = tweets_by_day.sort_values("date").reset_index()

# Default figure size for the plots created from here on.
plt.rcParams["figure.figsize"] = (15,5)

days = date_wise_count["date"]
tweets_on_day = date_wise_count["text"]
plt.plot(days, tweets_on_day, '--X', color='green')
plt.title("Frequency of Tweets")
plt.legend(["Number of Tweets"])
plt.show()

In [None]:
# Ranked bar chart of the most frequent `user_location` values.
# The previous version drew the counts of *every* distinct free-text location
# as one line plot, which cannot show which locations tweet the most; a top-N
# bar chart matches the title. `value_counts()` already sorts descending and
# ignores missing locations.
TOP_LOCATIONS = 10
locationWise = (
    tweet_df.user_location
    .value_counts()
    .head(TOP_LOCATIONS)
    .plot(kind='bar', title='Location with most Tweets')
)
locationWise.set_xlabel("Location")
locationWise.set_ylabel("Number of Tweets");

### Popular Hashtags

In [None]:
# Collect every hashtag occurrence across all tweets.
# - Iterates all non-missing rows rather than `.unique()` values: de-duplicating
#   the stored list strings counted each distinct hashtag combination once, so
#   the word cloud under-weighted tags that many tweets share.
# - `dropna()` replaces the `tags is np.nan` identity test, which only works
#   when the missing value happens to be the very same NaN object.
hashtags = []
for tag_list_literal in tweet_df.hashtags.dropna():
    # Each value is parsed as a Python literal and iterated, i.e. it is
    # expected to be the text of a list such as "['a', 'b']".
    hashtags.extend(ast.literal_eval(tag_list_literal))

# Corpus string for the word cloud; every tag is preceded by one space,
# exactly as before, but built in a single pass.
text = "".join(" " + tag for tag in hashtags)

In [None]:
# Define a function to plot word cloud
def plot_cloud(wordcloud, title="Popular HashTags", figsize=(40, 30)):
    """Draw a word-cloud image on a new figure with the axes hidden.

    Parameters
    ----------
    wordcloud : image-like
        Whatever ``plt.imshow`` accepts; the notebook passes generated
        ``WordCloud`` objects.
    title : str, optional
        Figure title. Defaults to "Popular HashTags" so existing one-argument
        calls render exactly as before; pass another title when the cloud does
        not show hashtags.
    figsize : tuple, optional
        Figure size handed to ``plt.figure``; defaults to the previous
        hard-coded (40, 30).

    Returns
    -------
    None
        The figure is left as the current pyplot figure.
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    plt.imshow(wordcloud)
    plt.axis("off")


In [None]:
# Build the hashtag word cloud from the `text` corpus and draw it.
hashtag_cloud_options = dict(
    width=3000,
    height=2000,
    random_state=1,  # fixed seed
    background_color='black',
    colormap='Set2',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = WordCloud(**hashtag_cloud_options).generate(text)
plot_cloud(wordcloud)

### Popular @'s

In [None]:
# All "@'s"
# Every @-mention found in the distinct tweet texts (duplicates of the same
# tweet text are counted once; missing texts are skipped).
# The loop variable is deliberately NOT called `text`: the previous version
# rebound the notebook-level `text` (the hashtag corpus) to a list of mentions,
# so re-running the hashtag word-cloud cell afterwards failed.
MENTION_RE = re.compile(r'@[^\s]+')  # raw string; '\s' is not a valid str escape

at = []
for tweet_text in tweet_df.text.dropna().unique():
    at.extend(MENTION_RE.findall(tweet_text))

# Corpus string for the word cloud; each mention is followed by one space,
# exactly as before, but built in a single pass.
ats = "".join(mention + " " for mention in at)

In [None]:
# Build the @-mention word cloud from the `ats` corpus and draw it.
mention_cloud_options = dict(
    width=3000,
    height=2000,
    random_state=1,  # fixed seed
    background_color='black',
    colormap='Set2',
    collocations=False,
    stopwords=STOPWORDS,
)
wordcloud = WordCloud(**mention_cloud_options).generate(ats)
plot_cloud(wordcloud)
# plot_cloud() titles its figure "Popular HashTags" by default, which is wrong
# for this cloud; relabel the figure it just created.
plt.title("Popular @'s");

## Sentiment Analysis

In [None]:
# One shared VADER analyser, reused for every tweet.
sid = SentimentIntensityAnalyzer()


def sentiments(sentence):
    """Return the VADER ``(pos, neg)`` scores of `sentence` as a tuple."""
    scores = sid.polarity_scores(sentence)
    positive, negative = scores["pos"], scores["neg"]
    return (positive, negative)


def _textblob_polarity(sentence):
    """Return TextBlob's polarity score for `sentence`."""
    return TextBlob(sentence).sentiment.polarity


# (pos, neg) tuple per cleaned tweet, plus the TextBlob polarity.
tweet_df["sentiments"] = tweet_df["ctext"].apply(sentiments)
tweet_df["text_blob_sentiment"] = tweet_df["ctext"].apply(_textblob_polarity)

# Split the tuple into two float columns.
for score_column, tuple_position in (("pos_sentiments", 0), ("neg_sentiments", 1)):
    tweet_df[score_column] = tweet_df["sentiments"].apply(
        lambda pair, position=tuple_position: float(pair[position])
    )

In [None]:
# Per-day sentiment summary. Named aggregation yields the flat column names
# directly, in the same order as before:
# date, ps_max, ps_mean, ns_min, ns_mean, tb_senti_mean.
# NOTE(review): the negative score is summarised with "min" while the positive
# one uses "max" — confirm that asymmetry is intended.
gtf = (
    tweet_df
    .groupby("date")
    .agg(
        ps_max=("pos_sentiments", "max"),
        ps_mean=("pos_sentiments", "mean"),
        ns_min=("neg_sentiments", "min"),
        ns_mean=("neg_sentiments", "mean"),
        tb_senti_mean=("text_blob_sentiment", "mean"),
    )
    .reset_index()
)

In [None]:
# Daily mean VADER scores: positive drawn in green, negative in red.
for mean_column, line_colour in (("ps_mean", 'green'), ("ns_mean", 'red')):
    plt.plot(gtf["date"], gtf[mean_column], line_colour)
plt.title("Positive and Negative Sentiments of Tweets")
plt.legend(["Positive Sentiment","Negative Sentiment"])
plt.show()

In [None]:
# Net VADER score (mean positive minus mean negative) against the mean
# TextBlob polarity, both per day.
days = gtf["date"]
vader_net_mean = gtf["ps_mean"] - gtf["ns_mean"]
plt.plot(days, vader_net_mean, "blue")
plt.plot(days, gtf["tb_senti_mean"], 'green')
plt.title("Average Sentiments of Tweets By TextBlob and Vader")
plt.legend(["Vader Avg Sentiment", "Text Blob Avg Sentiment"])
plt.show()

The two series, **Sentiment By Vader** and **Sentiment By TextBlob**, are almost parallel. We may improve the accuracy of the sentiment scores by **training a BERT sentiment classifier** or any other neural-network model; this is left for future improvement.

## **What happened after 15 Jan?**
The news below might be the reason for the negative sentiment towards the Pfizer vaccine.
![News screenshot](../input/newscreenshot/NewsScreenshot.png)


In [None]:
# Show the news screenshot inline: the Image object is the cell's last
# expression, so the notebook renders it as the cell output.
Image("../input/newscreenshot/NewsScreenshot.png")

# Fetch some information from Tweets

In [None]:
!pip install transformers==3.0 

In [None]:
# Getting a model
# NOTE(review): despite the heading, this cell only creates a BertTokenizer;
# no BERT model object is created here.
from transformers import BertTokenizer
# Checkpoint name handed to `from_pretrained` below.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
from transformers import BertTokenizer

In [None]:
# Free-text question whose most similar tweets are searched for further down.
query="Is Covid Vaccine Work "
# NOTE(review): this is the raw output of `tokenizer.encode`, which presumably
# is a list of vocabulary ids rather than a dense sentence embedding — confirm
# before using it in a cosine-similarity search.
query_embedding = tokenizer.encode(query)

In [None]:
# NOTE(review): `encode` is handed the whole `ctext` Series here, whereas the
# query above is encoded from a single string — presumably this has to be
# applied per tweet (and yield one fixed-size vector per tweet) for the
# similarity search below to work; confirm.
embeddings = tokenizer.encode(tweet_df['ctext'])

In [None]:
# torch is imported here because the visible import cells do not provide it.
import torch

top_k=5

# Cosine similarity between the query vector and every tweet vector.
# The previous code called `util.pytorch_cos_sim`, but `util` is not defined
# in the visible notebook; the same computation is done with plain torch:
# L2-normalise each row, then take dot products.
# NOTE(review): assumes `query_embedding` is one numeric vector and
# `embeddings` holds one vector of the same length per row of `tweet_df`, in
# row order — confirm against the cells that create them.
query_vec = torch.as_tensor(query_embedding, dtype=torch.float32)
corpus_vecs = torch.as_tensor(embeddings, dtype=torch.float32)
if query_vec.dim() == 1:
    query_vec = query_vec.unsqueeze(0)      # shape (1, dim)
if corpus_vecs.dim() == 1:
    corpus_vecs = corpus_vecs.unsqueeze(0)  # a single corpus row
query_unit = torch.nn.functional.normalize(query_vec, p=2, dim=1)
corpus_unit = torch.nn.functional.normalize(corpus_vecs, p=2, dim=1)
cos_scores = torch.mm(query_unit, corpus_unit.transpose(0, 1))[0]
cos_scores = cos_scores.cpu()

# torch.topk raises when k exceeds the number of scores, so clamp it.
top_results = torch.topk(cos_scores, k=min(top_k, cos_scores.numel()))

print("\n\n======================\n\n")
print("Query:", query)
# Report the number of hits actually returned instead of a hard-coded 5.
print("\nTop %d most similar sentences in corpus:" % len(top_results[0]))

for score, idx in zip(top_results[0], top_results[1]):
    # `data` was never defined; the tweets live in `tweet_df`. The lookup is
    # positional, matching the row order assumed for `embeddings`.
    print(tweet_df['text'].values[int(idx)], "(Score: %.4f)" % float(score))

In [None]:
# Preview a few cleaned tweets; listing the entire column floods the cell
# output and bloats the saved notebook.
tweet_df['ctext'].head(10).tolist()