# Topic analysis

Compare words in tweets from volume peak dates with words from tweets from other dates to find out which topics triggered the volume peaks. 

In [1]:
import csv
import math
import os
import pandas as pd
import re
import sys
from IPython.display import clear_output
from nltk.tokenize import TweetTokenizer
sys.path.append("/home/erikt/projects/newsgac/fasttext-runs")
import tscore

In [2]:
# Directory with the gzip-compressed csv files that hold the tweets
# NOTE(review): absolute, user-specific path; adjust it when running elsewhere
DATADIR = "/home/erikt/projects/puregome/data/text/"
# Name of the csv column that is used as index when reading the tweet files
ID = "id_str"
# Name of the csv column that holds the tweet text
TEXT = "text"
# Index label of the csv files with token counts (see writeData/readData)
TOKEN = "token"
# Only referenced in a commented-out readData call in this notebook
TOKENFILE = "tokens.csv"
# Not used in the cells of this notebook
USER = "user"

In [3]:
# Regular expression for the dates 20200311, 20200312, 20200314 and 20200315
# (not referenced by the functions and cells in this notebook)
DATEPATTERN = "2020031[1245]"
# Default regular expression a tweet text must match to be included
QUERY = "corona|covid|flattenthecurve|blijfthuis|rivm|mondkapje|huisarts|houvol|zorg"
# Default number of items returned by dictTopN
NBROFEXAMPLES = 10

def getExamples(datePattern,query1=QUERY,query2=""):
    """Count identical tweet texts that match two queries in the selected data files.

    Args:
        datePattern: regular expression searched in each file name in DATADIR;
            only files with a match are read.
        query1: regular expression that must match the tweet text (case-sensitive).
        query2: second regular expression that must match the tweet text as well;
            the default "" matches every text.

    Returns:
        dict mapping each matching tweet text to the number of times that exact
        text was seen, ordered from most frequent to least frequent.
    """
    fileList = sorted(os.listdir(DATADIR))
    # compile the queries once instead of looking them up for every tweet
    pattern1 = re.compile(query1)
    pattern2 = re.compile(query2)
    tweets = {}
    for inFileName in fileList:
        if re.search(datePattern,inFileName):
            clear_output(wait=True)
            print(inFileName)
            df = pd.read_csv(DATADIR+inFileName,compression="gzip",index_col=ID)
            # iterate over the text column directly: df.iloc[i][TEXT] builds
            # a complete row Series for every single tweet
            for text in df[TEXT]:
                # missing texts are read as NaN (a float) and would make the
                # regular expression search raise a TypeError
                if not isinstance(text,str): continue
                if pattern1.search(text) and pattern2.search(text):
                    tweets[text] = tweets.get(text,0)+1
    # the sort is stable: texts with equal counts keep their first-seen order
    return(dict(sorted(tweets.items(),key=lambda item:item[1],reverse=True)))

def makeData(datePattern,query=QUERY):
    """Count tokens per date in the tweets that match a query.

    Args:
        datePattern: regular expression searched in each file name in DATADIR;
            only files with a match are read.
        query: regular expression that must match the tweet text.
            NOTE(review): the query is applied to the text in its original
            case while the tokens are lower-cased, so capitalized mentions
            such as "Corona" do not match the default QUERY; confirm this
            is intended.

    Returns:
        dict mapping a date (the first eight characters of the file name) to a
        dict mapping each lower-cased token to its frequency on that date.
        Only tokens containing at least one character a-z are counted.
    """
    fileList = sorted(os.listdir(DATADIR))
    # build the query pattern and the tokenizer once instead of once per tweet
    queryPattern = re.compile(query)
    tokenizer = TweetTokenizer()
    tokens = {}
    for inFileName in fileList:
        if re.search(datePattern,inFileName):
            clear_output(wait=True)
            print(inFileName)
            date = inFileName[0:8]
            dateTokens = tokens.setdefault(date,{})
            df = pd.read_csv(DATADIR+inFileName,compression="gzip",index_col=ID)
            # iterate over the text column directly: df.iloc[i][TEXT] builds
            # a complete row Series for every single tweet
            for text in df[TEXT]:
                # missing texts are read as NaN (a float) and would make the
                # regular expression search raise a TypeError
                if not isinstance(text,str): continue
                if queryPattern.search(text):
                    for token in tokenizer.tokenize(text.lower()):
                        # skip tokens without letters (punctuation, numbers, emoji)
                        if re.search(r"[a-z]",token):
                            dateTokens[token] = dateTokens.get(token,0)+1
    return(tokens)

def writeData(data,fileName):
    """Store data as a csv file.

    data is converted to a pandas DataFrame and written to fileName; the
    index column of the file gets the label TOKEN.
    """
    frame = pd.DataFrame(data)
    frame.to_csv(fileName,index_label=TOKEN)
    
def readData(fileName):
    """Read a csv file with the column TOKEN as index and return it as a dict.

    The result maps each column name to a dict from index value to cell value
    (the default orientation of DataFrame.to_dict).
    """
    frame = pd.read_csv(fileName,index_col=TOKEN)
    return(frame.to_dict())

def dictTopN(dictionary,N=NBROFEXAMPLES):
    """Return the first N items of dictionary as a list of (value, key) pairs.

    The items are taken in the iteration order of the dictionary; no sorting
    is done here, so pass an already sorted dictionary to get a real top N.
    """
    firstItems = list(dictionary.items())[0:N]
    return([(value,key) for key,value in firstItems])

In [6]:
#tokens = readData(TOKENFILE)
# Keep the counts of all months in tokens: makeData returns a dict keyed by
# date, and the cells below look up dates in March (e.g. "20200312"). When
# tokens was overwritten in each iteration it only held the last month.
tokens = {}
for month in "202003 202004 202005".split():
    print(month)
    monthTokens = makeData(month)
    # one csv file per month, e.g. tokens202003.csv
    writeData(monthTokens,"tokens"+month+".csv")
    tokens.update(monthTokens)

20200525-18.out.gz


In [64]:
# Keys of the dictionary that makeTscoreData builds
# NOTE(review): presumably these are the keys tscore.computeTscore expects; confirm in the tscore module
# sum of the frequencies of all tokens
NBROFTOKENS = "totalFreq"
# number of different tokens
NBROFTYPES = "nbrOfWords"
# dict from token to frequency
WORDFREQS = "wordFreqs"

def makeTscoreData(tokenList):
    """Summarize token frequencies for use in the t-score computation.

    tokenList maps tokens to frequencies; entries with a NaN frequency are
    left out. The result is a dict with the summed frequency (NBROFTOKENS),
    the number of retained tokens (NBROFTYPES) and the retained
    token frequencies themselves (WORDFREQS).
    """
    wordFreqs = {}
    totalFreq = 0
    nbrOfTypes = 0
    for token in tokenList:
        freq = tokenList[token]
        # NaN marks a token that has no count in this column
        if math.isnan(freq): continue
        wordFreqs[token] = freq
        nbrOfTypes += 1
        totalFreq += freq
    return({ NBROFTOKENS:totalFreq, NBROFTYPES:nbrOfTypes, WORDFREQS:wordFreqs })

def sortTscores(tscores):
    """Return a copy of tscores ordered from the highest to the lowest score.

    Tokens with equal scores keep their original relative order.
    """
    byScore = sorted(tscores.items(),key=lambda item:item[1],reverse=True)
    return(dict(byScore))

In [59]:
# Summarize the token counts of every date for the t-score computation
tscoreData = {}
for date,dateTokens in tokens.items():
    tscoreData[date] = makeTscoreData(dateTokens)

In [89]:
# Show the 20 tokens with the highest t-scores as (score, token) pairs.
# Presumably the first argument is the date of interest (20200312) and the
# second the reference date (20200311); confirm in tscore.computeTscore
dictTopN(sortTscores(tscore.computeTscore(tscoreData["20200312"],tscoreData["20200311"])),N=20)

[(91.60442025764918, '#coronadebat'),
 (78.97741163967773, '#covid_19'),
 (65.5129728502599, 'scholen'),
 (58.05318808343848, 'rutte'),
 (45.94871916226014, 'kinderen'),
 (42.2056255747426, 'kabinet'),
 (41.77929963688309, 'maatregelen'),
 (40.13852933881494, 'trump'),
 (38.934780261956206, 'onderwijs'),
 (38.143346719667484, 'sluiten'),
 (36.15941845414344, 'ouders'),
 (34.67443174873152, '#persconferentie'),
 (32.30460846375055, 'schouders'),
 (31.94561264372927, 'vanavond'),
 (31.443044262387314, 'persconferentie'),
 (30.64989800427607, '#coronahulp'),
 (30.451036528043275, 'vvd'),
 (30.242371033416283, 'blijven'),
 (30.154322917933154, 'onderwijspersoneel'),
 (30.12977279038019, '#coronavirusnederland')]

In [90]:
# Show the 20 tokens with the highest t-scores as (score, token) pairs.
# Presumably the first argument is the date of interest (20200315) and the
# second the reference date (20200314); confirm in tscore.computeTscore
dictTopN(sortTscores(tscore.computeTscore(tscoreData["20200315"],tscoreData["20200314"])),N=20)

[(56.10373202523538, '#coronavirusnl'),
 (51.27541386261536, '16u'),
 (43.506825050815195, 'horeca'),
 (42.68413740392834, '#covid19nl'),
 (42.666377863812585, 'genomen'),
 (42.130617286003975, '#lockdownnl'),
 (41.27300848805268, 'april'),
 (38.706021588851996, 'morgenavond'),
 (37.41640267762352, 'nbelgen'),
 (36.594544941735116, 'aanvullende'),
 (36.34116105725093, 'fuiven'),
 (36.13170079804637, 'pletter'),
 (35.97562853910354, 'beginnen'),
 (35.90013496104832, 'uitlachen'),
 (35.520520050898924, 'morgenvroeg'),
 (35.489863712482006, '9u'),
 (35.448115484855634, 'maatregelen'),
 (34.7973055808141, 'negatief'),
 (34.72120740646297, 'ndat'),
 (34.30839614997719, '@mohamedouaamari')]

## Analysis

There is a clear impact of the national press conferences on the topics of the tweets of the two volume peak dates. On 20200312, both the event (*persconferentie*) and the speakers (*rutte* and *kabinet*) are present in the top 20 words selected by the t-score measure. The most important topic in the tweets was the discussion about school closures (*scholen*, *kinderen*, *onderwijs*, *sluiten*, *ouders* and *onderwijspersoneel*). On 20200315 the main topic is the closure of bars and restaurants (*horeca*). 

In [None]:
# Show the most frequent tweet texts from files matching 20200312 that match
# both QUERY and "scholen", as (count, text) pairs
dictTopN(getExamples("20200312",query1=QUERY,query2="scholen"))

In [None]:
# Show the most frequent tweet texts from files matching 20200313 that match
# both QUERY and "scholen", as (count, text) pairs
dictTopN(getExamples("20200313",query1=QUERY,query2="scholen"))