# Hashtags

In [1]:
from nltk.tokenize import TweetTokenizer
import os
import pandas as pd
import re
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from IPython.display import clear_output

In [2]:
def squeal(text=None):
    clear_output(wait=True)
    if not text is None: print(text)

In [3]:
DATADIR = "../data/text/"
ID_STR = "id_str"
TEXT = "text"
TOPICQUERY = "corona|covid|huisarts|mondkapje|rivm|blijfthuis|flattenthecurve|houvol"
PANDEMICQUERY = "|".join([TOPICQUERY, r'virus|besmet|ziekenhui|\bic\b|intensive.care|^zorg|vaccin|[^ad]arts|uitbraak|uitbrak|pandemie|ggd|'+
                                      r'mondkapje|quarantaine|\bwho\b|avondklok|variant|verple|sympto|e.golf|mutant|^omt$|umc|hcq|'+
                                      r'hydroxychloroquine|virolo|zkh|oversterfte|patiënt|patient|intensivist|🦠|ivermectin'])
DISTANCEQUERY = "1[.,]5[ -]*m|afstand.*hou|hou.*afstand|anderhalve[ -]*meter"
LOCKDOWNQUERY = "lock.down|lockdown"
VACCINQUERY = "vaccin|ingeënt|ingeent|inent|prik|spuit|bijwerking|-->|💉|pfizer|moderna|astrazeneca|astra|zeneca|novavax|biontech"
TESTQUERY = r'\btest|getest|sneltest|pcr'
QUERY = "|".join([PANDEMICQUERY, TESTQUERY, VACCINQUERY, LOCKDOWNQUERY, DISTANCEQUERY])
BASEQUERY = "corona|covid"
HAPPY_QUERY = r'\b(geluk|gelukkig|gelukkige|blij|happy)\b'
LONELY_QUERY = r'eenza|alleen.*voel|voel.*alleen|lonely|loneli'
IK_QUERY = r'\b(ik|mij|mijn|me|mn|m\'n|zelf|mezelf|mijzelf|i)\b'

In [4]:
def get_tweets(file_pattern, query, query2="", spy=False):
    tweets = []
    file_names = sorted(os.listdir(DATADIR))
    for file_name in file_names:
        if re.search('^' + file_pattern, file_name):
            if spy:
                squeal(file_name)
            df = pd.read_csv(DATADIR+file_name,index_col=ID_STR)
            if query2 == "":
                df_query = df[df[TEXT].str.contains(query, flags=re.IGNORECASE)]
            else:
                df_query = df[df[TEXT].str.contains(query, flags=re.IGNORECASE) & df[TEXT].str.contains(query2, flags=re.IGNORECASE)]
            tweets.extend(list(df_query[TEXT]))
    return(tweets)

In [5]:
def get_hashtags(tweet):
    hashtags = []
    for token in TweetTokenizer().tokenize(tweet):
        if re.search(r'#', token):
            hashtags.append(token)
    return(hashtags)

In [6]:
def process_month(month, query=BASEQUERY, query2=""):
    tweets = [re.sub(r'\\n', ' ', tweet) for tweet in get_tweets(month, query, query2=query2, spy=False)]
    hashtags = {}
    for tweet in tweets:
        if re.search(r'#', tweet):
            for hashtag in get_hashtags(tweet):
                if hashtag in hashtags:
                    hashtags[hashtag] += 1
                else:
                    hashtags[hashtag] = 1
    print(month, " ".join([hashtag for hashtag in sorted(hashtags.keys(), key=lambda hashtag:hashtags[hashtag], reverse=True)][:200]))
    return hashtags

In [7]:
pd.DataFrame([{"202105": "measures", "202106": "pandemic", "202107": "measures", 
               "202108": "pandemic", "202109": "entry pass", "202110": "entry pass"},
              {"202105": "pandemic", "202106": "measures", "202107": "pandemic", 
               "202108": "measures", "202109": "measures", "202110": "measures"},
              {"202105": "vaccination", "202106": "vaccination", "202107": "vaccination", 
               "202108": "vaccination obligation", "202109": "vaccination obligation", "202110": "pandemic"},
              {"202105": "entry pass", "202106": "FVD", "202107": "vaccination obligation", 
               "202108": "vaccination", "202109": "pandemic", "202110": "vaccination obligation"},
              {"202105": "Netherlands", "202106": "Netherlands", "202107": "FVD", 
               "202108": "Netherlands", "202109": "press conference", "202110": "press conference"},
              {"202105": "testing", "202106": "facemasks", "202107": "Netherlands", 
               "202108": "lockdown", "202109": "FVD", "202110": "unvaccinated"},
              {"202105": "FVD", "202106": "entry pass", "202107": "lockdown", 
               "202108": "press conference", "202109": "Hugo de Jonge", "202110": "3 October protest"},
              {"202105": "ivermectine", "202106": "app", "202107": "press conference", 
               "202108": "entry pass", "202109": "hospitality business", "202110": "Netherlands"},
              {"202105": "long covid", "202106": "variants", "202107": "long covid", 
               "202108": "FVD", "202109": "Mark Rutte", "202110": "Hugo de Jonge"},
              {"202105": "lockdown", "202106": "lab leak", "202107": "Hugo de Jonge", 
               "202108": "long covid", "202109": "Mona Keizer", "202110": "FVD"},
             ])

Unnamed: 0,202105,202106,202107,202108,202109,202110
0,measures,pandemic,measures,pandemic,entry pass,entry pass
1,pandemic,measures,pandemic,measures,measures,measures
2,vaccination,vaccination,vaccination,vaccination obligation,vaccination obligation,pandemic
3,entry pass,FVD,vaccination obligation,vaccination,pandemic,vaccination obligation
4,Netherlands,Netherlands,FVD,Netherlands,press conference,press conference
5,testing,facemasks,Netherlands,lockdown,FVD,unvaccinated
6,FVD,entry pass,lockdown,press conference,Hugo de Jonge,3 October protest
7,ivermectine,app,press conference,entry pass,hospitality business,Netherlands
8,long covid,variants,long covid,FVD,Mark Rutte,Hugo de Jonge
9,lockdown,lab leak,Hugo de Jonge,long covid,Mona Keizer,FVD


In [22]:
def save_hashtags(hashtags, month):
    hashtags_sorted = { hashtag: hashtags[hashtag] 
                        for hashtag in sorted(hashtags.keys(), key=lambda hashtag:hashtags[hashtag], reverse=True) }
    data_file = open(f"csv/hashtags_{month}.txt","w")
    for hashtag_data in list(hashtags_sorted.items())[0:200]:
        print(hashtag_data[1], hashtag_data[0], file=data_file)
    data_file.close()

In [25]:
for month in "202003 202004 202005 202006 202007 202008 202009 202010 202011 202012".split():
    hashtags = process_month(month)
    save_hashtags(hashtags, month)

202003 #coronavirus #corona #COVID19 #coronavirusNederland #Corona #coronanederland #Coronavirusnl #COVID19NL #coronadebat #CoronaCrisis #Covid_19 #coronacrisis #COVIDー19 #Coronavirus #COVID2019NL #covid19 #COVID2019 #lockdown #RIVM #coronahulp #PVV #blijfthuis #Rutte #covid19Nederland #samentegencorona #FVD #EU #COVID19BE #Wilders #coronanl #hamsteren #rivm #coronavirusnl #Covid19 #SocialDistancing #scholendicht #coronalied #dtv #coronavirusnetherlands #zorg #op1 #lockdownnl #Nieuwsuur #coronavirusnederland #covid19nl #COVID-19 #groepsimmuniteit #Nederland #persconferentie #Coronavid19 #samensterk #houdafstand # #thuiswerken #ikblijfthuis #CoronaVirusUpdate #Baudet #Coronalul #Op1 #CoronaVirus #StayAtHome #CoronaPandemie #jinek #applausvoordezorg #SocialDistance #Jinek #Rutte3 #mondkapjes #rutte #onderwijs #COVID19Belgium #vrtnws #VVD #Italië #covid_19 #blijfbinnen #NEXIT #buitenhof #quarantaine #CoronaVirusUpdates #coronalul #nieuwsuur #virus #terzaketv #Coronacrisis #covid #dwdd #Co

In [10]:
for month in "202105 202106 202107 202108 202109 202110".split():
    process_month(month)

202105 #coronamaatregelen #coronavirus #COVID19 #corona #coronavaccin #Corona #coronapaspoort #COVID19NL #covid19nl #testsamenleving #vaccinatiepaspoort #vaccinatie #covid19 #Vaccinatie #covid #coronadebat #FVD #testmaatschappij #coronavaccinatie #ikweiger #Ivermectine #Covid_19 #vaccineren #vaccin #Covid19 #LongCovid #lockdown #vaccinatieplicht #dtv #testbewijzen #coronahoax #coronacrisis #vaccinaties #persconferentie #RIVM #versoepelingen # #Covid #Rutte #stopdelockdown #Baudet #mondkapjes #vrijheid #CoronaCrisis #Eurovision #nederland #klaarmetRutte #COVID #Nieuwsuur #londonprotest #vaccins #hugodejongekanniks #volksgezondheid #testwet #AstraZeneca #plandemie #Wilders #Wuhan #WuhanLab #Pfizer #IkPrikHetNiet #debat #zorg #India #nederlands #mondkapjesplicht #nederlandsnews #WHO #fvd #peilingen #peiling #politiegeweld #rutte #WEF #OMT #wappie #vanangstnaarvertrouwen #ZeroCovid #vaccinatiebewijs #TeHoog #hugodejonge #worldwidefreedomrally #aerosolen #EU #PVV #bevrijdingsdag #longcovid 

In [11]:
for month in "202105".split():
    tweets = [re.sub(r'\\n', ' ', tweet) for tweet in get_tweets(month, LONELY_QUERY, query2=IK_QUERY, spy=False)]
    hashtags = {}
    for tweet in tweets:
        if re.search(r'#', tweet):
            for hashtag in get_hashtags(tweet):
                if hashtag in hashtags:
                    hashtags[hashtag] += 1
                else:
                    hashtags[hashtag] = 1
    print(month, " ".join([hashtag for hashtag in sorted(hashtags.keys(), key=lambda hashtag:hashtags[hashtag], reverse=True)][:200]))

202105 #maatjegezocht #eenzamejongeren #vrijheid #walgelijk #keeponmoving #durftevragen #eenzaamheid #dodenherdenking #hemels #Walgelijk #geile #kanker #bloomforjulie #mondkapje #rouw #AstraZeneca #Eurovision #liefde # #KunstvanhetSamenleven #Citaten #poems #poetry #Vaccinatie #mondneusmasker #faceshield #Jurgen #brommeropzee #eenzaam #wevergetenbekeniet #brabantmaatjes #corona #Arnhem #Baudet #GoVegan #nieuweburen #buuf #nieuwhuis #wereldibddag #breakthesilence #crohn #colitis #kiplekker #Janssen #Bedankt #WaarisWaldy #nieuwsenco #kutkaag #LonelyFans #moederdag #deugonderwijs #ikvaccineerniet #ikvaccineer #deverraders #Pfizer #zelf #bewonersinitiatief #buurthuis #ABCD #deochtend #leeftijdsdiscriminatie #Grondwet #esf21 #jurgenconings #NowPlaying #Corona #COVID19 #fvd #vaccinatiepaspoort #TolkenTerugInDeZorg #Stichting #songfestival #wandelenmetIddo #Eurovision2021 #samentegeneenzaamheid #vraaghetonzewetenschappers #vandaag #Vites #alleenwinsttelt #Mottenzedan #goedemorgen #NPO2 #Eenza

In [20]:
for month in "202002 202003 202004 202005 202006 202007 202008 202009 202010 202011 202012 201201".split():
    tweets = [re.sub(r'\\n', ' ', tweet) for tweet in get_tweets(month, BASEQUERY, spy=False)]
    hashtags = {}
    for tweet in tweets:
        if re.search(r'#', tweet):
            for hashtag in get_hashtags(tweet):
                if hashtag in hashtags:
                    hashtags[hashtag] += 1
                else:
                    hashtags[hashtag] = 1
    print(month, " ".join([hashtag for hashtag in sorted(hashtags.keys(), key=lambda hashtag:hashtags[hashtag], reverse=True)][:200]))

202002 #coronavirus #corona #Coronavirus #COVID19 #coronavragen #Corona #COVID2019 #COVIDー19 #coronavirusNederland #CoronavirusOutbreak #virus #covid19 #RIVM #China #Coronavirius #FVD #Iran #CoronaVirus #Wuhan #EU #Covid_19 #ncov #COVID #patiënt #schengen #coronavirusus #Tilburg #Covid19 #Coronavius #CoronaVirusUpdates #Nederland #Italie #rivm #covid #dtv #PVV #Italië #china #Nieuwsuur #wuhan #WHO #sarscov2 #jinek #vrtnws #CoronaOutbreak #quarantaine #op1 #handenwassen #nieuws #coronavavirus #dwdd #Jinek #Erdogan #tilburg #coronaviruschina #coronarovirus #coronacrisis #boeren #nepnieuws #nieuwsuur #SARSCoV2 # #GGD #GoogleAlerts #griep #fakenews #COVID_19 #COVID19NL #Coronarivus #opengrenzen #terzaketv #Schengengrenscode #stikstofcrisis #Coronavid19 #waarschuwing #WhatsApp #nepbericht #telegraafpremium #nos #NOS #WuhanVirus #Baudet #coronavrees #Op1 #corona-virus #Rusland #NWO #Europa #vragen #FvD #Diemen #nosjournaal #antwoorden #F1 #vtmnieuws #Westerdam #coronavirus-besmetting #AEX #c

In [7]:
for month in "202101 202102 202103 202104".split():
    tweets = [re.sub(r'\\n', ' ', tweet) for tweet in get_tweets(month, BASEQUERY, spy=False)]
    hashtags = {}
    for tweet in tweets:
        if re.search(r'#', tweet):
            for hashtag in get_hashtags(tweet):
                if hashtag in hashtags:
                    hashtags[hashtag] += 1
                else:
                    hashtags[hashtag] = 1
    print(month, " ".join([hashtag for hashtag in sorted(hashtags.keys(), key=lambda hashtag:hashtags[hashtag], reverse=True)][:200]))

202101 #coronamaatregelen #coronavirus #corona #avondklok #coronadebat #COVID19 #lockdown #Corona #FVD #coronavaccin #COVID19NL #coronaprotest #Rutte #covid19nl #vaccin #covid19 #avondklokrellen #persconferentie #vaccinatie #Covid19 #coronabeleid #Vaccinatie #rellen #museumplein #avondklokdebat #covid #stopdelockdown #Eindhoven #coronacrisis #toeslagenaffaire #Covid_19 #PVV #vaccinatiestrategie #Amsterdam #AlleenSamen #AvondklokProtest #dtv #CoronaCrisis #politie #Wilders #vaccineren #vrijheid #lockdown2021 #ikwildieprik #VVD #jinek #COVID #op1 #RIVM #OMT #coronarellen # #zorg #kabinet #rutte #mondkapjes #hugodejonge #Nederland #Urk #TheGreatReset #StemNederlandTerug #peiling #virus #Op1npo #ikdoenietmeermee #StemZeWeg #Ivermectine #samentegencorona #Nieuwsuur #coronavaccinatie #coronazwendel #opendebat #coronavirusnl #COVIDー19 #amsterdam #Pfizer #Coronavirus #Avondklok #D66 #coronanl #Vaccin #peilingen #ZeroCovid #onderwijs #blijfthuis #IkPrikHetNiet #LongCovid #vaccins #nosjournaal #