# Topic analysis

Compare the word frequencies in one group of tweets with the word frequencies in another group of tweets to find out the topics discussed in the tweets.

In [1]:
import csv
import datetime
import math
import os
import pandas as pd
import re
import sys
from IPython.display import clear_output
from nltk.tokenize import TweetTokenizer
sys.path.append("/home/erikt/projects/newsgac/fasttext-runs")
import tscore

In [13]:
# Directory with the hourly tweet files that are listed and read with pd.read_csv
DATADIRTEXT = "/home/erikt/projects/puregome/data/text/"
# Output directories for the per-date token counts (all tweets / topic tweets only)
DATADIRTOKENS = "../data/tokens/"
DATADIRTOKENSTOPIC = "../data/tokens-topic/"
# strptime/strftime format of the date strings used in this notebook
DATEFORMAT = "%Y%m%d"
# Suffix appended to a date to form a token count file name
FILESUFFIX = ".csv.gz"
# Column header of the counts in the token count files
FREQ = "freq"
# Name of the tweet id column, used as index when reading tweet files
IDSTR = "id_str"
# Default number of items returned by dictTopN
NBROFEXAMPLES = 20
# Regular expression selecting the tweets that count as on-topic
QUERYTOPIC = r"corona|covid|mondkapje|rivm|blijfthuis|houvol|huisarts|flattenthecurve"
# Name of the tweet text column
TEXT = "text"
# Index label of the token count files
TOKEN = "token"
# Only referenced from a commented-out line in the old code section
TOKENFILE = "tokens.csv"
# NOTE(review): not referenced anywhere in the visible part of this notebook
USER = "user"

In [3]:
def squeal(text=None):
    """Clear the cell output and print text as a status line, unless it is None."""
    clear_output(wait=True)
    if text is not None:
        print(text)
        
def tokenize(text):
    """Return the list of tokens produced by a fresh nltk TweetTokenizer for text."""
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(text)

def isInterestingToken(token):
    """Tell whether a token should be counted.

    A token qualifies when it contains at least one lower case letter a-z
    and does not contain the string "http".

    Returns:
        None when the token has no lower case letter, otherwise a bool that
        is True when "http" is absent from the token.
    """
    letterMatch = re.search(r"[a-z]", token)
    if not letterMatch:
        # re.search found nothing: hand back its None result unchanged
        return letterMatch
    return re.search("http", token) is None

def getTokenCountsFromTweets(datePattern,query="",baseDict=None):
    """Count in how many tweets each interesting token occurs.

    Every file in DATADIRTEXT whose name matches datePattern is read. A token
    is counted at most once per tweet, because the tokens of a tweet are
    reduced to a set before counting.

    Args:
        datePattern: regular expression matched against the file names.
        query: regular expression (case-insensitive); when not empty, only
            tweets whose text matches it are counted.
        baseDict: optional token -> count dictionary used as starting point;
            it is copied and never modified.

    Returns:
        Dictionary mapping token to the number of tweets containing it.
    """
    tokenCounts = {} if baseDict is None else dict(baseDict)
    for inFileName in sorted(os.listdir(DATADIRTEXT)):
        if not re.search(datePattern,inFileName):
            continue
        squeal(inFileName)
        df = pd.read_csv(DATADIRTEXT+inFileName,index_col=IDSTR)
        # Empty text cells are read as NaN; drop them so the string
        # operations below cannot fail on a float
        texts = df[TEXT].dropna().astype(str)
        if query != "":
            texts = texts[texts.str.contains(query,case=False)]
        # Iterate over the column itself: df.iloc[i][TEXT] builds a row
        # Series for every tweet
        for text in texts:
            # The files store line breaks as a literal backslash followed by n
            text = re.sub(r"\\n"," ",text)
            text = re.sub(r"\s+"," ",text)
            for token in set(tokenize(text.lower())):
                if isInterestingToken(token):
                    tokenCounts[token] = tokenCounts.get(token,0) + 1
    return(tokenCounts)

def dictTopN(dictionary,N=NBROFEXAMPLES):
    """Return the first N items of dictionary as (value, key) tuples.

    The items are taken in the dictionary's own iteration order; no sorting
    is done here.
    """
    swapped = []
    for key, value in dictionary.items():
        swapped.append((value, key))
    return swapped[:N]

In [4]:
# Keys of the dictionary built by makeTscoreData:
# sum of all kept token frequencies
NBROFTOKENS = "totalFreq"
# number of tokens with a non-NaN frequency
NBROFTYPES = "nbrOfWords"
# token -> frequency dictionary
WORDFREQS = "wordFreqs"

def makeTscoreData(tokenList):
    """Wrap token frequencies in the dictionary that is passed to tscore.computeTscore.

    Args:
        tokenList: dictionary mapping token to frequency; entries whose
            frequency is NaN are left out.

    Returns:
        Dictionary with the total frequency (NBROFTOKENS), the number of kept
        tokens (NBROFTYPES) and the kept token -> frequency map (WORDFREQS).
    """
    wordFreqs = {}
    totalFreq = 0
    for token in tokenList:
        freq = tokenList[token]
        if math.isnan(freq):
            continue
        wordFreqs[token] = freq
        totalFreq += freq
    return({ NBROFTOKENS:totalFreq, NBROFTYPES:len(wordFreqs), WORDFREQS:wordFreqs })

def sortTscores(tscores):
    """Return a copy of tscores with its items ordered by descending score.

    Tokens with equal scores keep their original relative order.
    """
    ranked = sorted(tscores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

In [5]:
def makeDateFromFileName(fileName):
    """Return the first eight characters of fileName, used as its date string."""
    dateLength = 8
    return fileName[:dateLength]

def sortDict(myDict):
    """Return a new dictionary with the items of myDict ordered by descending value.

    Keys with equal values keep their original relative order.
    """
    orderedKeys = sorted(myDict, key=myDict.get, reverse=True)
    sortedDict = {}
    for key in orderedKeys:
        sortedDict[key] = myDict[key]
    return sortedDict

def writeTokenCounts(tokenCounts,outFileName):
    """Write a token -> count dictionary to outFileName as a two-column csv file.

    The tokens become the index (labelled TOKEN) and the counts the single
    data column (headed FREQ).
    """
    # One row holding all counts, transposed to get one row per token
    singleRow = pd.DataFrame.from_dict([tokenCounts])
    perToken = singleRow.T
    perToken.to_csv(outFileName,header=[FREQ],index_label=TOKEN)
    
def readTokenCounts(date,dataDir=DATADIRTOKENS):
    """Read the token counts stored for date and return them as a token -> count dictionary.

    The file read is dataDir + date + FILESUFFIX.
    """
    inFileName = dataDir+date+FILESUFFIX
    df = pd.read_csv(inFileName,index_col=TOKEN)
    countsPerColumn = df.to_dict(orient="dict")
    return(countsPerColumn[FREQ])

def combineTokenCounts(tokenCountsList):
    """Add up several token -> count dictionaries.

    Args:
        tokenCountsList: list of dictionaries mapping token to count.

    Returns:
        A new dictionary with, for every token, the sum of its counts over
        all dictionaries in the list; an empty dictionary for an empty list.
        The input dictionaries are not modified.
    """
    tokenCountsOut = {}
    for tokenCounts in tokenCountsList:
        for token, count in tokenCounts.items():
            # Earlier code read the count from the undefined name
            # tokenCounts2 here; the count must come from the dictionary
            # that is being merged
            tokenCountsOut[token] = tokenCountsOut.get(token,0) + count
    return(tokenCountsOut)

## Store token counts for all tweets

In [None]:
FILEPATTERN = "20200[2-9]|20201[01]"

# Create one token count file per date for all tweets, skipping dates that
# were handled in this run or that already have an output file.
datesSeen = {}
inFileNames = sorted(os.listdir(DATADIRTEXT))
for inFileName in inFileNames:
    if not re.search(FILEPATTERN,inFileName):
        continue
    date = makeDateFromFileName(inFileName)
    outFileName = DATADIRTOKENS+date+FILESUFFIX
    if date in datesSeen or os.path.exists(outFileName):
        continue
    datesSeen[date] = True
    # An empty file is written before counting starts; presumably this marks
    # the date as taken while the slow counting runs
    writeTokenCounts({},outFileName)
    tokenCounts = sortDict(getTokenCountsFromTweets(date))
    writeTokenCounts(tokenCounts,outFileName)

## Store token counts for tweets with topic words

In [6]:
FILEPATTERN = "20200[89]|20201[01]"

# Create one token count file per date for the tweets matching QUERYTOPIC,
# skipping dates that were handled in this run or that already have an
# output file.
datesSeen = {}
inFileNames = sorted(os.listdir(DATADIRTEXT))
for inFileName in inFileNames:
    if not re.search(FILEPATTERN,inFileName):
        continue
    date = makeDateFromFileName(inFileName)
    outFileName = DATADIRTOKENSTOPIC+date+FILESUFFIX
    if date in datesSeen or os.path.exists(outFileName):
        continue
    datesSeen[date] = True
    # An empty file is written before counting starts; presumably this marks
    # the date as taken while the slow counting runs
    writeTokenCounts({},outFileName)
    tokenCounts = sortDict(getTokenCountsFromTweets(date,query=QUERYTOPIC))
    writeTokenCounts(tokenCounts,outFileName)

20201127-23.out.gz


## Compute topic words

In [7]:
# Directory with the sentiment score files read by getSentimentQuery
DATADIRSENT = "../data/sentiment/pattern/"
# Default file name pattern; the empty pattern matches every file name
DEFAULTFILEPATTERN = ""
# Keys of the per-file result dictionaries built in getSentimentQuery
SENTIMENT = "sentiment"
COUNT = "count"

def getSentimentQuery(query,filePattern=DEFAULTFILEPATTERN):
    """Compute the average sentiment score of the tweets matching query.

    For every file in DATADIRSENT whose name matches filePattern, the
    sentiment scores (first column: tweet id, second column: score) are
    joined with the tweets in the file of the same name in DATADIRTEXT.
    Files that cannot be read are skipped.

    Args:
        query: regular expression, matched case-insensitively against the
            tweet text.
        filePattern: regular expression selecting the files to process.

    Returns:
        The mean sentiment score over all matching tweets that have a score,
        or None when there are no such tweets.
    """
    queryRegex = re.compile(query,flags=re.IGNORECASE)
    sentimentPerHour = {}
    for inFileName in sorted(os.listdir(DATADIRSENT)):
        if not re.search(filePattern,inFileName):
            continue
        squeal(inFileName)
        try:
            dfSent = pd.read_csv(DATADIRSENT+inFileName,header=None)
            dfText = pd.read_csv(DATADIRTEXT+inFileName)
        except Exception:
            # Best effort: skip unreadable or missing files. Exception rather
            # than a bare except, so that interrupting the kernel still works
            continue
        # Build the lookup column-wise: selecting whole rows with iloc can
        # convert large integer ids to floats when the score column is float
        dictSent = dict(zip(dfSent[0],dfSent[1]))
        sentScores = {}
        for idStr, text in zip(dfText[IDSTR],dfText[TEXT]):
            # Tweets without text are read as NaN and cannot be searched
            if not isinstance(text,str):
                continue
            # Tweets without a sentiment score are ignored
            if idStr in dictSent and queryRegex.search(text):
                sentScores[idStr] = dictSent[idStr]
        if len(sentScores) > 0:
            sentiment = sum(sentScores.values())/len(sentScores)
            hour = inFileName[0:11]
            sentimentPerHour[hour] = { SENTIMENT:sentiment, COUNT:len(sentScores) }
    # Weight every per-file mean by its tweet count to get the overall mean
    totalSentiment = sum([sentimentPerHour[hour][SENTIMENT]*sentimentPerHour[hour][COUNT] for hour in sentimentPerHour])
    totalCount = sum([sentimentPerHour[hour][COUNT] for hour in sentimentPerHour])
    if totalCount == 0:
        return(None)
    return(totalSentiment/totalCount)

In [8]:
def dateStringToDate(dateString):
    """Parse a date string in DATEFORMAT into a datetime.datetime object."""
    parsed = datetime.datetime.strptime(dateString,DATEFORMAT)
    return parsed

def dateToDateString(date):
    """Format a datetime.datetime object as a date string in DATEFORMAT."""
    formatDate = datetime.datetime.strftime
    return formatDate(date,DATEFORMAT)

In [74]:
def get_tweets(date, topic, query=""):
    """Return the most frequent tweet texts of one day that match the filters.

    The 24 hourly files "<date>-<hour>.out.gz" in DATADIRTEXT are read.
    Tweets are kept when their text matches both query and topic
    (case-insensitive regular expressions; an empty pattern is not applied).
    Tweets without text never match.

    Args:
        date: date string used as file name prefix.
        topic: regular expression for the topic filter.
        query: regular expression for the general filter.

    Returns:
        List of (count, text) tuples sorted by descending count, or None
        when no hourly file contains a matching tweet.
    """
    frames = []
    for hour in range(0,24):
        file = f"{date}-{str(hour).zfill(2)}.out.gz"
        squeal(file)
        tweets = pd.read_csv(DATADIRTEXT+file, index_col="id_str")
        for pattern in (query, topic):
            if pattern != "":
                # na=False: a missing text counts as a non-match instead of
                # producing a mask that cannot be used for selection
                tweets = tweets[tweets["text"].str.contains(pattern, case=False, na=False)]
        if len(tweets) > 0:
            frames.append(tweets)
    if not frames:
        return None
    # Concatenate once after the loop instead of growing a frame per hour
    counts = pd.concat(frames).groupby("text").size()
    # counts is ordered by text; the stable sort keeps that order among
    # texts with the same count
    return sorted(((int(count), text) for text, count in counts.items()),
                  key=lambda pair: pair[0], reverse=True)

20200311 & WHO declares COVID-19 to be a pandemic\\
20200312 & Schools remain open after announcing first national measures\\
20200313 & Hoarding in Dutch supermarkets\\
20200314 & People stay at home\\
20200315 & New measures: schools, bars and restaurant close\\
20200316 & Prime-minister addresses the nation\\
20200317 & Applauding event for health care workers\\
20200318 & RIVM boss Van Dissel speaks with parliament\\
20200319 & Calls for more intensive care beds\\
20200320 & King addresses the nation\\
20200321 & Calls for social distancing\\
20200322 & Calls for staying at home\\
20200323 & Tougher measures to battle the pandemic\\
20200325 & Virus detected in Dutch sewer\\
20200326 & Problems in supply of test fluid from Swiss company\\
20200327 & KLM resumes flights to COVID-19-hit countries\\
20200330 & EU provides financial support to Morocco to fight COVID-19\\
20200331 & Prolongation of COVID-19 measures\\
&\\
20200401 & Debate in the parliament about COVID-19\\
20200403 & RIVM rejects testing on Schiphol\\
20200405 & Germany declares Netherlands as risk area\\
20200407 & National press conference announces work on COVID-19 app\\
20200410 & Promising Israeli treatment\\
20200411 & Netherlands sent face masks to China in February\\
20200414 & Call for extra protection for health care workers\\
20200416 & Disturbances of youths in Monnickendam\\
20200419 & COVID-19 app proves to be unsafe\\
20200420 & Denmark put restrictions on companies asking for support\\
20200421 & National press conference\\
20200423 & Netherlands supplied face masks to Montenegro\\
20200424 & Trump recommends disinfectant as medicine\\
20200425 & China removes critical notes from WHO reports\\
20200426 & Nurse complains about lack of public compliance to measures\\
20200427 & Kings Day: calls to stay at home\\
20200428 & Complaints about number of people allowed in IKEA shops\\
20200429 & Public offers to order face masks for government\\
20200430 & Schiedam storage contains face masks to be sent abroad\\
&\\
20200501 & Marseille professor cures people with hydroxychloroquine\\
20200502 & Dutch company DSM is involved in producing face masks\\
20200504 & Different celebration of WWII death remembrance\\
20200505 & Different celebration of Liberation Day\\
20200506 & National press conference: face masks in public transport\\
20200508 & Incident at Rotterdam supermarket about COVID-19 measures\\
20200509 & RIVM disapproved testing in care homes\\
20200510 & Warning for illness effects on young people\\
20200511 & Relaxation of COVID-19 measures\\
20200512 & RIVM refuses to publish reproduction number\\
20200514 & Belgian economist proposes COVID-19 tax for elderly\\
20200515 & Government party VVD profits from COVID-19 strategy in polls\\
20200516 & Government bought bad face masks\\
20200517 & RIVM has movie with medicine claims removed from YouTube\\
20200518 & Dutch doctor claims success of anti-malaria medicine\\
20200519 & National press conference\\
20200520 & COVID-19 debate in parliament\\
20200521 & Calls to stay at home, despite the weather\\
20200522 & Government thinks about postponing 2021 elections\\
20200523 & Church service contamination in Frankfurt\\
20200524 & Different celebration of Ramadan ending\\
20200525 & Unknown children's disease, possible COVID-19 link\\
20200526 & Discussion about aerosol contamination, influence of ventilation\\
20200528 & Netherlands has more per capita COVID-19 deaths than the USA\\
20200529 & Government works on emergency law on phone location data research\\
20200530 & COVID-19 outbreak in The Hague mosque\\
20200531 & Preparing for Black Lives Matter demonstration in Amsterdam\\
&\\
20200601 & Black Lives Matter demonstration in Amsterdam\\
20200602 & Call to cancel all COVID-19 fines\\
20200603 & Black Lives Matter demonstration in Rotterdam\\
20200604 & COVID-19 debate in parliament\\
20200605 & Stand-up comedian Hans Teeuwen includes COVID-19 in show\\
20200606 & Government discusses COVID-19 law\\
20200607 & Patient reports second contamination\\
20200609 & Critique on emergency COVID-19 law\\
20200610 & Dutch vaccine enters testing phase in July\\
20200611 & Protest of fancy fair workers\\
20200612 & COVID-19 causes long-term lung problems\\
20200614 & Volkskrant editor says that science, government and media need to tell same story\\
20200616 & Mayors publish manifest on effects COVID-19 on society\\
20200617 & RIVM announces that finding contamination sources is hard\\
20200619 & The Hague forbids anti-COVID-19 measures demonstration\\
20200620 & NRC newspaper publishes timeline of Dutch pandemic experiences\\
20200621 & The Hague demonstration against COVID-19 measures still held\\
20200622 & RIVM forbids ventilator usage in health care homes\\
20200623 & Parliament forbids breeding animals sensitive to COVID-19\\
20200624 & National press conference\\
20200625 & COVID-19 debate in parliament\\
20200627 & Call to stop social distancing rule\\
20200628 & Poll reveals disapproval (87%) of government's COVID-19 policies\\
20200629 & Dutch Railways cuts 2,300 jobs\\
20200630 & Government parties vote against increasing health care worker salaries\\
&\\
20200702 & Doctors demand motivations for national COVID-19 measures\\
20200705 & Nurse Boy Eddema dies from COVID-19\\
20200706 & People demonstrate against incorrect COVID-19 news\\
20200708 & National COVID-19 app is named: CoronaMelder\\
20200709 & Medical staff protests against national pandemic measures\\
20200710 & Dordrecht forbids demonstration against COVID-19 law\\
20200711 & Health minister forced doctor to give satisfying intensive care bed estimate\\
20200712 & French bus driver killed in incident about face masks\\
20200713 & 23rd Dutch mink farm with COVID-19 found\\
20200714 & Mob profited from COVID-19 relief fund for companies\\
20200715 & Calls for testing even with mild symptoms\\
20200717 & 38 Schiphol-bound flights contained COVID-19 patients\\
20200719 & No passenger checks in Schiphol flights from contaminated areas\\
20200722 & Calls for more face mask wearing\\
20200723 & Discussion about achievability of face mask obligation\\
20200725 & Despite reports, three-year old did not die of COVID-19\\
20200726 & Doctor protests against national COVID-19 measures\\
20200727 & Scar tissue found on hearts of COVID-19 patients\\
20200728 & Calls for keeping following the COVID-19 measures\\
20200729 & CNS reports that excess deaths is double of COVID-19 deaths\\
20200730 & Amsterdam and Rotterdam ask people in busy areas to wear face masks\\ 

In [113]:
MONTH = "202011"

# For each day, print the tokens that stand out in that day's topic tweets
# when compared with the topic tweets of the day before.
for day in range(21,31):
    TARGETDATE = f"{MONTH}{str(day).zfill(2)}"
    PREVIOUSDATE = dateToDateString(dateStringToDate(TARGETDATE)-datetime.timedelta(days=1))

    tokenCounts1 = readTokenCounts(PREVIOUSDATE,dataDir=DATADIRTOKENSTOPIC)
    tscores1 = makeTscoreData(tokenCounts1)
    tokenCounts2 = readTokenCounts(TARGETDATE,dataDir=DATADIRTOKENSTOPIC)
    tscores2 = makeTscoreData(tokenCounts2)
    print(f"\n{TARGETDATE}")
    topTokens = dictTopN(sortTscores(tscore.computeTscore(tscores2,tscores1)))
    for item in topTokens:
        print(item)


20201121
(15.902292753272638, 'vertrouw')
(15.76260005988869, 'afgepakt')
(15.756971947098165, 'lichaamstaal')
(15.679972106854247, 'verraadt')
(15.557911617218917, 'testen')
(15.44568002212619, 'vaccin')
(15.188240303060786, 'ongeacht')
(15.180082567299596, 'geloof')
(14.990857894301922, '#rutte')
(14.99073004672204, 'omlaag')
(14.060196772440339, 'feestdagen')
(13.692286497509627, 'intochten')
(13.173566396768472, 'proces')
(13.156659955993591, 'zo')
(12.975520458239512, 'snel')
(12.82523144900849, 'demonstreren')
(12.670309768397468, 'veiligheid')
(12.600127432612426, 'gaan')
(12.535583122335126, 'kozp')
(12.471881348480695, 'duurt')

20201122
(24.0770072185415, '@rivm')
(22.474543287748, 'technische')
(22.419171579825914, 'richtlijn')
(22.374154508438924, 'verhoogd')
(22.218822333010575, 'pcr-test')
(21.594247368574674, 'weigert')
(21.38255352432723, 'cycli')
(21.169250551537164, 'targets')
(21.098799108501343, 'verlaagd')
(20.81980284166435, '#pcrgate')
(20.324240739167646, 'antw

In [116]:
# Most frequent tweets of 20201123 that match both QUERYTOPIC and "astra"
get_tweets("20201123", "astra", QUERYTOPIC)

20201123-23.out.gz


[(62,
  'Mensen opgelet, zojuist vernomen op #UKcolumn, de Britse EMA die zich moet uitspreken over de toelating van het nieuwe Astrazeneca vaccin tegen Covid19, bevat een aantal mensen in de board die aandeelhouder zijn bij Astrazeneca. Hier gaan we weer: WIJ VAN WC-EEND... Wie zitter'),
 (24,
  '#AstraZeneca #coronavaccin , bezint eer ge begint, er lopen nogal wat rechtszaken en zij zijn niet aansprakelijk voor Uw gezondheidsschade! https://t.co/UtEri8Cr2H'),
 (17,
  'Het #coronavaccin dat door de Brits-Zweedse farmaceut #AstraZeneca wordt ontwikkeld, is voor 70 tot 90 procent effectief.\\n\\nHmm... de overlevingskans van #corona is 99 procent! 🤔\U0001f928\\n\\nGeen vaccin nodig dus...\\n https://t.co/B0Ke4KAPz9'),
 (9,
  '123-NEWS/TEAM:\\n@LidwienNews @dekoran1 @rinsjan \\n\\n#SpecialReport:\\n@MuzZ11_ @ynaleling @Wiep13396680 \\n\\nHÉLAAS PINDAKAAS 😡\\nDoor Nederland besteld #coronavirus vaccin AstraZeneca voor 30% t/m 10% onbetrouwbaar op korte termijn https://t.co/YEhikopsU1 http

In [None]:
import time

# Average sentiment of the tweets mentioning each of three tokens; the
# cumulative results are printed after every token.
scores = {}
for token in "stand speech zaken".split():
    # NOTE(review): TARGETDATE is the value left behind by the day comparison
    # loop; the "-23" suffix presumably selects the file of hour 23 - confirm
    scores[token] = getSentimentQuery(token,filePattern=TARGETDATE+"-23")
    print(scores)
    # Pause five seconds before processing the next token
    time.sleep(5)

## Old code

In [None]:
# NOTE(review): DATEPATTERN is not referenced in the visible part of this notebook
DATEPATTERN = "2020031[1245]"
# Default topic query of getExamples
QUERY = "corona|covid|flattenthecurve|blijfthuis|rivm|mondkapje|huisarts|houvol|zorg"
# Rebinds the constant defined earlier (20) for the old-code dictTopN below
NBROFEXAMPLES = 10

def getExamples(datePattern,query1=QUERY,query2=""):
    """Count the distinct tweet texts that match both query1 and query2.

    Reads every file in DATADIR whose name matches datePattern and returns a
    dictionary text -> number of occurrences, ordered by descending count.
    """
    # NOTE(review): DATADIR and ID are not defined in the visible part of
    # this notebook - confirm they exist before running this old code
    tweets = {}
    for inFileName in sorted(os.listdir(DATADIR)):
        if not re.search(datePattern,inFileName):
            continue
        clear_output(wait=True)
        print(inFileName)
        df = pd.read_csv(DATADIR+inFileName,compression="gzip",index_col=ID)
        for i in range(len(df)):
            text = df.iloc[i][TEXT]
            if not re.search(query1,text) or not re.search(query2,text):
                continue
            tweets[text] = tweets.get(text,0) + 1
    ranked = sorted(tweets.keys(),key=lambda t:tweets[t],reverse=True)
    return({tweet:tweets[tweet] for tweet in ranked})

def getTokenCountsFromTweets(datePattern,query=""):
    """Count in how many tweets matching query each token with a letter a-z occurs.

    Older variant: reads the files in DATADIR whose name matches datePattern
    and counts every token at most once per tweet.
    """
    # NOTE(review): DATADIR is not defined in the visible part of this
    # notebook - confirm it exists before running this old code
    tokenCounts = {}
    for inFileName in sorted(os.listdir(DATADIR)):
        if not re.search(datePattern,inFileName):
            continue
        squeal(inFileName)
        df = pd.read_csv(DATADIR+inFileName,index_col=IDSTR)
        for i in range(len(df)):
            text = df.iloc[i][TEXT]
            if not re.search(query,text):
                continue
            for token in set(tokenize(text.lower())):
                if re.search(r"[a-z]",token):
                    tokenCounts[token] = tokenCounts.get(token,0) + 1
    return(tokenCounts)

def writeData(data,fileName):
    """Write data as a csv file to fileName, with the index column labelled TOKEN."""
    frame = pd.DataFrame(data)
    frame.to_csv(fileName,index_label=TOKEN)
    
def readData(fileName):
    """Read the csv file fileName, indexed by TOKEN, and return it as a nested dictionary."""
    frame = pd.read_csv(fileName,index_col=TOKEN)
    return(frame.to_dict())

def dictTopN(dictionary,N=NBROFEXAMPLES):
    """Return the first N items of dictionary as (value, key) tuples, in iteration order."""
    pairs = [(value, key) for key, value in dictionary.items()]
    return pairs[:N]

In [None]:
#tokens = readData(TOKENFILE)
# Build and store the token data for each month.
# NOTE(review): makeData is not defined in the visible part of this notebook
for month in "202003 202004 202005".split():
    print(month)
    tokens = makeData(month)
    writeData(tokens,"tokens"+month+".csv")

In [None]:
# Convert the token counts of every date in tokens to the tscore input format
tscoreData = {}
for date in tokens:
    tscoreData[date] = makeTscoreData(tokens[date])

In [None]:
# Top 20 tokens of 20200312 compared with 20200311, by descending tscore
dictTopN(sortTscores(tscore.computeTscore(tscoreData["20200312"],tscoreData["20200311"])),N=20)

In [None]:
# Top 20 tokens of 20200315 compared with 20200314, by descending tscore
dictTopN(sortTscores(tscore.computeTscore(tscoreData["20200315"],tscoreData["20200314"])),N=20)

## Analysis

There is a clear impact of the national press conferences on the topic tweets of the two volume peak dates. On 20200312, both the event (*persconferentie*) and the speakers (*rutte* and *kabinet*) are present in the top 20 words selected by the tscore measure. The most important topic in the tweets was the discussion about school closures (*scholen*, *kinderen*, *onderwijs*, *sluiten*, *ouders* and *onderwijspersoneel*). On 20200315 the main topic is the closure of bars and restaurants (*horeca*). 

In [None]:
# Most frequent tweets of 20200312 that match QUERY and mention "scholen"
dictTopN(getExamples("20200312",query1=QUERY,query2="scholen"))

In [None]:
# Most frequent tweets of 20200313 that match QUERY and mention "scholen"
dictTopN(getExamples("20200313",query1=QUERY,query2="scholen"))