In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import spacy
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from string import punctuation
import collections
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import en_core_web_sm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score

In [2]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shahzehan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shahzehan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [41]:
#Jan2020

In [24]:
df = pd.read_csv("..\\processed\\tweets\\Jan2020.csv", names=('text', 'label'))
df.head()

Unnamed: 0,text,label
0,Fast action will be key to containing new coro...,0
1,That CoronaVirus is about to turn into that Ri...,1
2,It be the ones who have the worst hygiene that...,1
3,This has been a day:\n\n• No new witnesses: ht...,1
4,y’all @itslbern really thinks i have the coron...,1


In [25]:
# remove the hashtags, mentions and unwanted characters.
def clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower() # Coverting to lowercase
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"[^A-Za-z0-9\s]+", "", elem)) # Removing special characters and punctuation
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"https?://\S+", "", elem)) # Removing links
    return df

df = clean_text(df, 'text')
df.head()

Unnamed: 0,text,label
0,fast action will be key to containing new coro...,0
1,that coronavirus is about to turn into that ri...,1
2,it be the ones who have the worst hygiene that...,1
3,this has been a day\n\n no new witnesses https...,1
4,yall itslbern really thinks i have the coronav...,1


In [26]:
nlp = en_core_web_sm.load() 
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation) #already taken care of with the cleaning function.
stop.update(punctuation)
w_tokenizer = WhitespaceTokenizer()

def furnished(text):
    final_text = []
    for i in w_tokenizer.tokenize(text):
        if i.lower() not in stop:
            word = lemmatizer.lemmatize(i)
            final_text.append(word.lower())
    return " ".join(final_text)

df.text = df.text.apply(furnished)
df.head()

Unnamed: 0,text,label
0,fast action key containing new coronavirus chi...,0
1,coronavirus turn riversidevirus real quick smh,1
2,one worst hygiene stay talkin coronavirus http...,1
3,day new witness httpstcodxbl1hmhjc coronavirus...,1
4,yall itslbern really think coronavirus,1


In [27]:
vaccine_words = '''antibiotic antibody antipoison antiseptic antiserum antivenin counteractant counteragent medicine neutralizer preventive serum 
vaccine booster booster dose booster injection immunization recall dose booster dose dram enema inoculation needle vaccine'''

mask_words = '''mask visor dust mask fencing mask gas mask iron mask oxygen mask ski mask protective mask safety goggles welding goggles
welding mask eye mask surgical mask goggles face shield kn95 kn'''

symptom_words = '''affection ailment complaint complex diagnostics disorder infirmity malady problem prognostics sickness sign symptoms  symptom 
temperature cough smell taste fever flu cold infection'''

quarantine_words = '''block off close off confine insulate island keep apart part quarantine seclude segregate separate sequester sever sunder 
isolate isolation aloneness confinement desolation detachment exile quarantine reclusion reclusiveness remoteness retreat seclusiveness segregation 
sequestration solitude withdrawal'''

lockdown_words = '''isolation confinement solitary confinement holding cell hole lockdown solitaries self-observer anomic hermit pokey anchorite
rogue elephant recluse wallflower lone hand stylite loner bullpen non-conformist jailhouse friar eremite pillarist jail troglodyte brooder gaol 
solitarian monk'''

education_words = '''education school schools academy academies university universities college colleges gce highers sat exams examination a-level
a-levels o-levels o-level gcse finals class classroom teacher teachers student students'''

treatment_words = '''aid alleviation antidote assistance catholicon corrective counteractant counteragent countermeasure drug elixir elixir vitae 
fix healing healing agent help medicament medicant medicine nostrum panacea pharmacon physic placebo proprietary quick fix recovery redress remedy
reparation restorative therapeutic treatment cure'''

science_words = '''analyst analyzer clinician experimenter investigator scientist tester scientists biologist biologists clinicians physician science 
biology study studies research researcher researchers'''

statistics_words = '''abstracts compilations conclusions documents dope dossier evidence experiments facts figures goods info input materials measurements 
memorandums notes picture proof reports results scoop score statistics testimony deaths cases mortality rate'''

health_words = '''dispensary hospital infirmary sick bay surgery center dispensaries hospitals infirmaries surgery centers nurse nurses doctor doctors nhs 
who health n.h.s w.h.o'''

economy_words = '''artisan collar breadwinner company craftsperson employee laborer trader tradesperson wage earner collar working employees business 
businessmen shops market stores store markets clerk economy economic profit loss inflation industry industrial agriculture farmer price prices gdp 
credit trading trade investment cost'''

legislation_words = '''administering agency application authority charge command conduct conducting control directing direction dispensation disposition 
distribution enforcement execution governing government guidance handling jurisdiction legislation order organization overseeing oversight performance
policy power provision regulation rule running strategy superintendence supervision surveillance police policies regulation'''

politics_words = '''Uncle Sam Washington authority bureaucracy command control direction domination dominion empire execution executive governance guidance 
influence jurisdiction law ministry patronage political politics presidency regency regime rule sovereignty state feds union government politics democrats 
democrat democratic republic republican snp tory labour president prime congress parliment congressmen congressman mp'''

travel_words = '''travel international airline airlines flight flights hotel hotels train trains bus vacation holiday traveler travelers cruise 
border borders emirates etihad airways ocean sea'''

testing_words = '''case cases positive negative test testing centre false tested tests rate'''


mask = furnished(mask_words)
vaccine = furnished(vaccine_words)
symptom = furnished(symptom_words)
quarantine = furnished(quarantine_words)
lockdown = furnished(lockdown_words)
education = furnished(education_words)
treatment = furnished(treatment_words)
science = furnished(science_words)
statistics = furnished(statistics_words)
health = furnished(health_words)
economy = furnished(economy_words)
legislation = furnished(legislation_words)
politics = furnished(politics_words)
travel = furnished(travel_words)
testing = furnished(testing_words)

In [28]:
string1 = mask
words = string1.split()
mask = " ".join(sorted(set(words), key=words.index))

string1 = vaccine
words = string1.split()
vaccine = " ".join(sorted(set(words), key=words.index))

string1 = symptom
words = string1.split()
symptom = " ".join(sorted(set(words), key=words.index))

string1 = quarantine
words = string1.split()
quarantine = " ".join(sorted(set(words), key=words.index))

string1 = lockdown
words = string1.split()
lockdown = " ".join(sorted(set(words), key=words.index))

string1 = education
words = string1.split()
education = " ".join(sorted(set(words), key=words.index))

string1 = treatment
words = string1.split()
treatment = " ".join(sorted(set(words), key=words.index))

string1 = science
words = string1.split()
science = " ".join(sorted(set(words), key=words.index))

string1 = statistics
words = string1.split()
statistics = " ".join(sorted(set(words), key=words.index))

string1 = health
words = string1.split()
health = " ".join(sorted(set(words), key=words.index))

string1 = economy
words = string1.split()
economy = " ".join(sorted(set(words), key=words.index))

string1 = legislation
words = string1.split()
legislation = " ".join(sorted(set(words), key=words.index))

string1 = politics
words = string1.split()
politics = " ".join(sorted(set(words), key=words.index))

string1 = travel
words = string1.split()
travel = " ".join(sorted(set(words), key=words.index))

string1 = testing
words = string1.split()
testing = " ".join(sorted(set(words), key=words.index))

In [29]:
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

def get_scores(group,tweets):
    scores = []
    for tweet in tweets:
        s = jaccard_similarity(group, tweet)
        scores.append(s)
    return scores

In [30]:
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

In [31]:
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores
df.head(10)

Unnamed: 0,text,label,Masks,Vaccine,Symptoms,Quarantine,Lockdown,Education,Treatment,Science,Statistics,Health,Economy,Legislation,Politics,Travel,Testing
0,fast action key containing new coronavirus chi...,0,0.777778,0.62963,0.703704,0.740741,0.714286,0.62963,0.740741,0.666667,0.666667,0.703704,0.769231,0.714286,0.678571,0.692308,0.625
1,coronavirus turn riversidevirus real quick smh,1,0.653846,0.625,0.708333,0.75,0.653846,0.695652,0.75,0.666667,0.666667,0.64,0.708333,0.592593,0.615385,0.695652,0.619048
2,one worst hygiene stay talkin coronavirus http...,1,0.740741,0.72,0.8,0.84,0.807692,0.653846,0.84,0.76,0.76,0.8,0.875,0.740741,0.769231,0.791667,0.583333
3,day new witness httpstcodxbl1hmhjc coronavirus...,1,0.727273,0.65625,0.71875,0.69697,0.727273,0.606061,0.69697,0.6875,0.6875,0.666667,0.71875,0.78125,0.75,0.65625,0.5
4,yall itslbern really think coronavirus,1,0.615385,0.652174,0.73913,0.64,0.68,0.652174,0.708333,0.695652,0.695652,0.666667,0.73913,0.615385,0.64,0.727273,0.571429
5,want something battle coronavirus either drop ...,1,0.741935,0.612903,0.677419,0.709677,0.6875,0.5625,0.65625,0.645161,0.645161,0.677419,0.733333,0.6875,0.65625,0.666667,0.551724
6,carnival cruise ship coronavirus scare trap 70...,1,0.62069,0.72,0.730769,0.642857,0.678571,0.653846,0.703704,0.76,0.76,0.607143,0.666667,0.740741,0.769231,0.653846,0.583333
7,due spread coronavirus one day cafe event supp...,1,0.785714,0.703704,0.777778,0.75,0.851852,0.703704,0.75,0.678571,0.740741,0.777778,0.846154,0.785714,0.814815,0.769231,0.64
8,starbucks shuts half china store coronavirus o...,1,0.678571,0.592593,0.730769,0.84,0.678571,0.592593,0.769231,0.692308,0.692308,0.666667,0.730769,0.740741,0.703704,0.653846,0.583333
9,according blog 6 case coronavirus infection qi...,1,0.758621,0.678571,0.75,0.851852,0.7,0.678571,0.785714,0.777778,0.777778,0.689655,0.75,0.821429,0.785714,0.740741,0.615385


In [33]:
df['Masks'] = pd.to_numeric(df['Masks'])
df['Vaccine'] = pd.to_numeric(df['Vaccine'])
df['Symptoms'] = pd.to_numeric(df['Symptoms'])
df['Quarantine'] = pd.to_numeric(df['Quarantine'])
df['Lockdown'] = pd.to_numeric(df['Lockdown'])
df['Education'] = pd.to_numeric(df['Education'])
df['Treatment'] = pd.to_numeric(df['Treatment'])
df['Science'] = pd.to_numeric(df['Science'])
df['Statistics'] = pd.to_numeric(df['Statistics'])
df['Health'] = pd.to_numeric(df['Health'])
df['Economy'] = pd.to_numeric(df['Economy'])
df['Legislation'] = pd.to_numeric(df['Legislation'])
df['Politics'] = pd.to_numeric(df['Politics'])
df['Travel'] = pd.to_numeric(df['Travel'])
df['Testing'] = pd.to_numeric(df['Testing'])

In [36]:
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)
df.head(10)

Unnamed: 0,text,label,Masks,Vaccine,Symptoms,Quarantine,Lockdown,Education,Treatment,Science,Statistics,Health,Economy,Legislation,Politics,Travel,Testing,Topic
0,fast action key containing new coronavirus chi...,0,0.777778,0.62963,0.703704,0.740741,0.714286,0.62963,0.740741,0.666667,0.666667,0.703704,0.769231,0.714286,0.678571,0.692308,0.625,Masks
1,coronavirus turn riversidevirus real quick smh,1,0.653846,0.625,0.708333,0.75,0.653846,0.695652,0.75,0.666667,0.666667,0.64,0.708333,0.592593,0.615385,0.695652,0.619048,Quarantine
2,one worst hygiene stay talkin coronavirus http...,1,0.740741,0.72,0.8,0.84,0.807692,0.653846,0.84,0.76,0.76,0.8,0.875,0.740741,0.769231,0.791667,0.583333,Economy
3,day new witness httpstcodxbl1hmhjc coronavirus...,1,0.727273,0.65625,0.71875,0.69697,0.727273,0.606061,0.69697,0.6875,0.6875,0.666667,0.71875,0.78125,0.75,0.65625,0.5,Legislation
4,yall itslbern really think coronavirus,1,0.615385,0.652174,0.73913,0.64,0.68,0.652174,0.708333,0.695652,0.695652,0.666667,0.73913,0.615385,0.64,0.727273,0.571429,Symptoms
5,want something battle coronavirus either drop ...,1,0.741935,0.612903,0.677419,0.709677,0.6875,0.5625,0.65625,0.645161,0.645161,0.677419,0.733333,0.6875,0.65625,0.666667,0.551724,Masks
6,carnival cruise ship coronavirus scare trap 70...,1,0.62069,0.72,0.730769,0.642857,0.678571,0.653846,0.703704,0.76,0.76,0.607143,0.666667,0.740741,0.769231,0.653846,0.583333,Politics
7,due spread coronavirus one day cafe event supp...,1,0.785714,0.703704,0.777778,0.75,0.851852,0.703704,0.75,0.678571,0.740741,0.777778,0.846154,0.785714,0.814815,0.769231,0.64,Lockdown
8,starbucks shuts half china store coronavirus o...,1,0.678571,0.592593,0.730769,0.84,0.678571,0.592593,0.769231,0.692308,0.692308,0.666667,0.730769,0.740741,0.703704,0.653846,0.583333,Quarantine
9,according blog 6 case coronavirus infection qi...,1,0.758621,0.678571,0.75,0.851852,0.7,0.678571,0.785714,0.777778,0.777778,0.689655,0.75,0.821429,0.785714,0.740741,0.615385,Quarantine


In [37]:
Jan2020 = pd.read_csv("..\\processed\\tweets\\Jan2020.csv", names=('text', 'label'))
Jan2020['topic'] = df['Topic']
Jan2020.head(10)


Unnamed: 0,text,label,topic
0,Fast action will be key to containing new coro...,0,Masks
1,That CoronaVirus is about to turn into that Ri...,1,Quarantine
2,It be the ones who have the worst hygiene that...,1,Economy
3,This has been a day:\n\n• No new witnesses: ht...,1,Legislation
4,y’all @itslbern really thinks i have the coron...,1,Symptoms
5,Want to do something to battle the #coronaviru...,1,Masks
6,#Carnival #Cruise Ship: #Coronavirus Scare Tra...,1,Politics
7,"Due to the spread of the coronavirus, the one ...",1,Lockdown
8,Starbucks shuts half of China stores on corona...,1,Quarantine
9,According this blog: 6 cases of #coronavirus i...,1,Quarantine


In [40]:
Jan2020.to_csv('..\\processed\\tweets\\Jan2020.csv', index = False, header = False)

In [42]:
#Apr2020

In [43]:
# Load the tweets
df = pd.read_csv("..\\processed\\tweets\\Apr2020.csv", names=('text', 'label'))

# Clean the text
df = clean_text(df, 'text')
df.text = df.text.apply(furnished)

# Get scores for the topics
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

# Add the scores to the dataframe
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores

# Find the topic of the tweets via their scores
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)

# Save the topics to the original file
Apr2020 = pd.read_csv("..\\processed\\tweets\\Apr2020.csv", names=('text', 'label'))
Apr2020['topic'] = df['Topic']
Apr2020.to_csv('..\\processed\\tweets\\Apr2020.csv', index = False, header = False)

In [44]:
#Jul2020

In [45]:
# Load the tweets
df = pd.read_csv("..\\processed\\tweets\\Jul2020.csv", names=('text', 'label'))

# Clean the text
df = clean_text(df, 'text')
df.text = df.text.apply(furnished)

# Get scores for the topics
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

# Add the scores to the dataframe
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores

# Find the topic of the tweets via their scores
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)

# Save the topics to the original file
Jul2020 = pd.read_csv("..\\processed\\tweets\\Jul2020.csv", names=('text', 'label'))
Jul2020['topic'] = df['Topic']
Jul2020.to_csv('..\\processed\\tweets\\Jul2020.csv', index = False, header = False)

In [46]:
#Oct2020

In [47]:
# Load the tweets
df = pd.read_csv("..\\processed\\tweets\\Oct2020.csv", names=('text', 'label'))

# Clean the text
df = clean_text(df, 'text')
df.text = df.text.apply(furnished)

# Get scores for the topics
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

# Add the scores to the dataframe
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores

# Find the topic of the tweets via their scores
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)

# Save the topics to the original file
Oct2020 = pd.read_csv("..\\processed\\tweets\\Oct2020.csv", names=('text', 'label'))
Oct2020['topic'] = df['Topic']
Oct2020.to_csv('..\\processed\\tweets\\Oct2020.csv', index = False, header = False)

In [48]:
#Jan2021

In [49]:
# Load the tweets
df = pd.read_csv("..\\processed\\tweets\\Jan2021.csv", names=('text', 'label'))

# Clean the text
df = clean_text(df, 'text')
df.text = df.text.apply(furnished)

# Get scores for the topics
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

# Add the scores to the dataframe
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores

# Find the topic of the tweets via their scores
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)

# Save the topics to the original file
Jan2021 = pd.read_csv("..\\processed\\tweets\\Jan2021.csv", names=('text', 'label'))
Jan2021['topic'] = df['Topic']
Jan2021.to_csv('..\\processed\\tweets\\Jan2021.csv', index = False, header = False)

In [None]:
#Apr2021

In [50]:
# Load the tweets
df = pd.read_csv("..\\processed\\tweets\\Apr2021.csv", names=('text', 'label'))

# Clean the text
df = clean_text(df, 'text')
df.text = df.text.apply(furnished)

# Get scores for the topics
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

# Add the scores to the dataframe
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores

# Find the topic of the tweets via their scores
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)

# Save the topics to the original file
Apr2021 = pd.read_csv("..\\processed\\tweets\\Apr2021.csv", names=('text', 'label'))
Apr2021['topic'] = df['Topic']
Apr2021.to_csv('..\\processed\\tweets\\Apr2021.csv', index = False, header = False)

In [None]:
#Jul2021

In [51]:
# Load the tweets
df = pd.read_csv("..\\processed\\tweets\\Jul2021.csv", names=('text', 'label'))

# Clean the text
df = clean_text(df, 'text')
df.text = df.text.apply(furnished)

# Get scores for the topics
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

# Add the scores to the dataframe
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores

# Find the topic of the tweets via their scores
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)

# Save the topics to the original file
Jul2021 = pd.read_csv("..\\processed\\tweets\\Jul2021.csv", names=('text', 'label'))
Jul2021['topic'] = df['Topic']
Jul2021.to_csv('..\\processed\\tweets\\Jul2021.csv', index = False, header = False)

In [None]:
#Oct2021

In [52]:
# Load the tweets
df = pd.read_csv("..\\processed\\tweets\\Oct2021.csv", names=('text', 'label'))

# Clean the text
df = clean_text(df, 'text')
df.text = df.text.apply(furnished)

# Get scores for the topics
mask_scores = get_scores(mask, df.text.to_list())
vaccine_scores = get_scores(vaccine, df.text.to_list())
symptom_scores = get_scores(symptom, df.text.to_list())
quarantine_scores = get_scores(quarantine, df.text.to_list())
lockdown_scores = get_scores(lockdown, df.text.to_list())
education_scores = get_scores(education, df.text.to_list())
science_scores = get_scores(science, df.text.to_list())
statistics_scores = get_scores(statistics, df.text.to_list())
health_scores = get_scores(health, df.text.to_list())
economy_scores = get_scores(economy, df.text.to_list())
legislation_scores = get_scores(legislation, df.text.to_list())
politics_scores = get_scores(politics, df.text.to_list())
travel_scores = get_scores(travel, df.text.to_list())
testing_scores = get_scores(testing, df.text.to_list())
treatment_scores = get_scores(treatment, df.text.to_list())

# Add the scores to the dataframe
df['Masks'] = mask_scores
df['Vaccine'] = vaccine_scores
df['Symptoms'] = symptom_scores
df['Quarantine'] = quarantine_scores
df['Lockdown'] = lockdown_scores
df['Education'] = education_scores
df['Treatment'] = treatment_scores
df['Science'] = science_scores
df['Statistics'] = statistics_scores
df['Health'] = health_scores
df['Economy'] = economy_scores
df['Legislation'] = legislation_scores
df['Politics'] = politics_scores
df['Travel'] = travel_scores
df['Testing'] = testing_scores

# Find the topic of the tweets via their scores
df['Topic'] = df[['Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing']].idxmax(axis=1)

# Save the topics to the original file
Oct2021 = pd.read_csv("..\\processed\\tweets\\Oct2021.csv", names=('text', 'label'))
Oct2021['topic'] = df['Topic']
Oct2021.to_csv('..\\processed\\tweets\\Oct2021.csv', index = False, header = False)

In [55]:
#Total number of tweets per topic per time period

def topic_sum(df_in, df_out, time):
    v1 = df_in['topic'].value_counts()['Masks']
    v2 = df_in['topic'].value_counts()['Vaccine']
    v3 = df_in['topic'].value_counts()['Symptoms']
    v4 = df_in['topic'].value_counts()['Quarantine']
    v5 = df_in['topic'].value_counts()['Lockdown']
    v6 = df_in['topic'].value_counts()['Education']
    v7 = df_in['topic'].value_counts()['Treatment']
    v8 = df_in['topic'].value_counts()['Science']
    v9 = df_in['topic'].value_counts()['Statistics']
    v10 = df_in['topic'].value_counts()['Health']
    v11 = df_in['topic'].value_counts()['Economy']
    v12 = df_in['topic'].value_counts()['Legislation']
    v13 = df_in['topic'].value_counts()['Politics']
    v14 = df_in['topic'].value_counts()['Travel']
    v15 = df_in['topic'].value_counts()['Testing']
    df_out.loc[len(df_out.index)] = [time, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15]
    return df_out

df_ts = pd.DataFrame(columns=['Time Period', 'Masks', 'Vaccine', 'Symptoms', 'Quarantine', 'Lockdown', 'Education', 'Treatment', 'Science', 'Statistics', 'Health', 'Economy', 'Legislation', 'Politics', 'Travel', 'Testing'])

df_ts = topic_sum(Jan2020, df_ts, "Jan2020")
df_ts = topic_sum(Apr2020, df_ts, "Apr2020")
df_ts = topic_sum(Jul2020, df_ts, "Jul2020")
df_ts = topic_sum(Oct2020, df_ts, "Oct2020")
df_ts = topic_sum(Jan2021, df_ts, "Jan2021")
df_ts = topic_sum(Apr2021, df_ts, "Apr2021")
df_ts = topic_sum(Jul2021, df_ts, "Jul2021")
df_ts = topic_sum(Oct2021, df_ts, "Oct2021")

df_ts.head(8)

Unnamed: 0,Time Period,Masks,Vaccine,Symptoms,Quarantine,Lockdown,Education,Treatment,Science,Statistics,Health,Economy,Legislation,Politics,Travel,Testing
0,Jan2020,10153,2049,10555,12887,10632,793,4505,5782,2312,1203,23595,7914,7645,2247,1629
1,Apr2020,24346,2004,7748,13338,12255,179,6033,6349,2184,2199,17200,8946,9524,532,1249
2,Jul2020,26928,2080,7662,13292,12113,147,6062,6516,2193,2287,17054,9100,10017,510,1212
3,Oct2020,27729,2236,7529,12974,12131,162,5853,6875,2290,2261,17291,8707,10236,528,1444
4,Jan2021,31459,2180,7353,13938,12424,147,6565,7131,2290,2074,16831,10218,10663,639,1567
5,Apr2021,34860,2064,6901,13967,12506,147,6954,7393,2306,1874,16077,10642,11250,512,1493
6,Jul2021,37395,1590,5308,10816,14876,108,5114,5364,2126,1545,12516,12083,18061,412,1109
7,Oct2021,39117,2042,7270,14741,13285,175,7244,7297,2391,1932,16720,11836,11784,552,1415


In [56]:
df_ts.to_csv("..\\processed\\topicNum.csv", index = False)