In [120]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import csv
from math import sqrt
from matplotlib import rc,rcParams
import re
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy 
import numpy as np
import emoji
from langdetect import detect
from matplotlib.pyplot import figure

In [131]:
#!pip install emoji

In [132]:
#!pip install langdetect

**Pfizer Tweet Data begins on 2020-12-12 and (available data) ends on 2021-11-14. Pfizer Stock Data is available from 1972-06-01 and ends on 2021-11-12. To ensure that we are looking at the same timeframe for both datasets, the Pfizer Stock Data was filtered to dates between 2020-12-12 to 2021-11-12, and Pfizer Tweet Data was filtered to be from 2020-12-12 to 2021-11-12.**

In [235]:
pfizer_tweets = pd.read_csv("vaccination_tweets.csv")

In [236]:
pfizer_tweets = pfizer_tweets[~(pfizer_tweets['date'] > '2021-11-13')]
#pfizer_tweets

In [237]:
pfizer_stock_data = pd.read_csv("PFE.csv")
#pfizer_stock_data

In [253]:
pfizer_stock_data['Date'] = pd.to_datetime(pfizer_stock_data['Date'])

pfizer_stock_data = pfizer_stock_data[~(pfizer_stock_data['Date'] < '2020-12-12')]
pfizer_stock_data = pfizer_stock_data[~(pfizer_stock_data['Date'] > '2021-11-12')]
pfizer_stocks_data = pfizer_stock_data.reset_index()
pfizer_stocks_data

Unnamed: 0,index,Date,Open,High,Low,Close,Adj Close,Volume
0,12241,2020-12-14,41.619999,41.660000,39.070000,39.209999,37.733589,94809700
1,12242,2020-12-15,39.060001,39.180000,38.209999,38.709999,37.252419,65712800
2,12243,2020-12-16,38.180000,38.470001,37.740002,37.840000,36.415180,56515300
3,12244,2020-12-17,37.830002,38.119999,37.310001,38.029999,36.598022,52036400
4,12245,2020-12-18,37.990002,38.090000,37.500000,37.680000,36.261200,60259200
...,...,...,...,...,...,...,...,...
227,12468,2021-11-08,48.610001,48.790001,47.599998,48.330002,48.330002,57423300
228,12469,2021-11-09,48.330002,48.380001,47.150002,47.299999,47.299999,29920400
229,12470,2021-11-10,47.400002,49.209999,47.400002,49.020000,49.020000,42696200
230,12471,2021-11-11,49.230000,50.500000,48.730000,50.180000,50.180000,42370400


## Tweet Preprocessing

*Not necessary to remove non-English tweets from the dataframe as Vader can analyze non-English tweets, and non-English speaking individuals can tweet about the vaccine AND participate in the stock market*.

**Converting all emojis in tweets to text**

In [239]:
#!pip install vaderSentiment

In [240]:
#Insert String
#Return String

sentim_analyzer = SentimentIntensityAnalyzer()

def emoji_to_text(text):
    edicts = emoji.emoji_lis(text)
    
    for e in edicts:
        replacement = " "+emoji.demojize(e["emoji"]).replace(":","").replace("_"," ")
        text=text.replace(e["emoji"],replacement)
    return text

In [241]:
#Test for check_english_tweets
tweet_emoji = 'Catch utf-8 emoji such as 💘 and 💋 and 😁'

print(emoji_to_text(tweet_emoji))

Catch utf-8 emoji such as  heart with arrow and  kiss mark and  beaming face with smiling eyes


In [242]:
pfizer_tweets['text'] = pfizer_tweets['text'].apply(emoji_to_text)

**Cleaning tweets of mentions, URL links, and special characters because this not meaningful data for the Vader Sentiment Analyzer**

In [243]:
#Insert two strings
#String

def remove_pattern(text, pattern):
    r = re.findall(pattern, text)
    for item in r:
        text = re.sub(item, '', text)        
    return text

In [244]:
#Test for remove_pattern
print(remove_pattern("I do not like @United_Airlines", r'@United_Airlines'))

I do not like 


In [245]:
#Insert String
#Return String
def clean_tweets(tweet):
    #remove twitter handles (@xxx)
    tweet = remove_pattern(tweet, "@[\w]*")
    
    #remove URL links (httpxxx)
    tweet = remove_pattern(tweet, "https?://[A-Za-z0-9./]*")
    
    #remove special characters, numbers, punctuations (except for #)
    tweet = re.sub("[^a-zA-Z]", " ", tweet)
    
    return tweet

In [246]:
#Test for clean_tweets
print(clean_tweets('COVID Vaccine Updates, @POTUS: https://www.whitehouse.gov/ ***'))

COVID Vaccine Updates        


In [247]:
pfizer_tweets['text'] = pfizer_tweets['text'].apply(clean_tweets)

**Finding polarity score of each tweet and extracting the polarity score to put in a column**

In [248]:
#Insert a String
#Returns a Float
def extract_compound_score(text):
    polarity_scores = sentim_analyzer.polarity_scores(text)
    compound_score = polarity_scores['compound']
    return compound_score

In [249]:
#Test for extract_compound_score
print(extract_compound_score('Catch utf-8 emoji such as  heart with arrow and  kiss mark and  beaming face with smiling eyes'))

0.7003


In [250]:
pfizer_tweets['compound_score'] = pfizer_tweets['text'].apply(extract_compound_score)

In [251]:
pfizer_tweets.loc[10945]['text']

' PfizerBioNTech booster today   syringeThankful '

In [234]:
#DO NOT RUN AGAIN
pfizer_tweets.loc[10945]['text']

'#PfizerBioNTech booster today. 💉Thankful.'

In [233]:
sentim_analyzer.polarity_scores('PfizerBioNTech booster today. 💉Thankful.')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [252]:
sentim_analyzer.polarity_scores(' PfizerBioNTech booster today   syringeThankful ')

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [224]:
pfizer_tweets[pfizer_tweets['compound_score'] == 0.0000]

Unnamed: 0,id,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,retweets,favorites,is_retweet,compound_score
3,1337855739918835717,Charles Adler,"Vancouver, BC - Canada","Hosting ""CharlesAdlerTonight"" Global News Radi...",2008-09-10 11:28:53,49165,3933,21853,True,2020-12-12 20:23:59,Facts are immutable Senator even when you re...,,Twitter Web App,446,2129,False,0.0
4,1337854064604966912,Citizen News Channel,,Citizen News Channel bringing you an alternati...,2020-04-23 17:58:42,152,580,1473,False,2020-12-12 20:17:19,Explain to me again why we need a vaccine w...,"['whereareallthesickpeople', 'PfizerBioNTech']",Twitter for iPhone,0,0,False,0.0
8,1337850023531347969,Erin Despas,,Designing&selling on Teespring. Like 90s Disne...,2009-10-30 17:53:54,887,1515,9639,False,2020-12-12 20:01:16,Covid vaccine You getting it CovidVaccine...,"['CovidVaccine', 'covid19', 'PfizerBioNTech', ...",Twitter Web App,2,1,False,0.0
9,1337842295857623042,Ch.Amjad Ali,Islamabad,#ProudPakistani #LovePakArmy #PMIK @insafiansp...,2012-11-12 04:18:12,671,2368,20469,False,2020-12-12 19:30:33,CovidVaccine States will start getting COV...,"['CovidVaccine', 'COVID19Vaccine', 'US', 'paku...",Twitter Web App,0,0,False,0.0
10,1337841934170255365,Tamer Yazar,Turkey-Israel,"Im Market Analyst, also Editor... working (fre...",2009-09-17 16:45:16,1302,78,339,False,2020-12-12 19:29:07,while deaths are closing in on the mar...,"['PfizerBioNTech', 'Vaccine']",Twitter Web App,0,0,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10942,1458987047495032832,𝑯𝒐𝒘𝒂𝒓𝒅 𝑪𝒉𝒖 🇨🇦,Canada 🇨🇦,Nothing is too mundane in life.,2009-04-19 17:11:03,241,691,8353,False,2021-11-12 02:36:33,Not the case for the doses Moderna mRNA ...,['Moderna'],Twitter for Android,0,1,False,0.0
10944,1458953029726744578,Arnold V Ancheta,"Maryland, USA",Tabungao,2011-03-17 07:25:12,301,1990,21874,False,2021-11-12 00:21:22,Calix Ancheta COVID Vaccines Pfizer BioNTec...,"['Pfizer', 'pfizer1stdose', 'PfizerBioNTech', ...",Twitter for iPhone,0,0,False,0.0
10945,1458943411080163355,Carla 🇨🇦 Roncato,Earth,Data. Analytics. AI. Privacy. Identity. Security.,2014-09-25 07:11:48,229,304,1072,False,2021-11-11 23:43:09,PfizerBioNTech booster today syringeThankful,['PfizerBioNTech'],Twitter for iPhone,0,2,False,0.0
10947,1458932114418716676,Joseph Harrison,UK,"I blog about my life, feeling a bit 'robbed' b...",2012-04-06 13:51:19,4415,4715,38480,False,2021-11-11 22:58:16,BLOG I M JUST GOING FOR A WALK PART...,"['CovidVaccine', 'PfizerBionTech']",Twitter Web App,1,0,False,0.0
