# Topic analysis

Compare the word frequencies in one group of tweets with the word frequencies in another group of tweets to find out the topics discussed in the tweets.

In [1]:
import csv
import datetime
import math
import os
import pandas as pd
import re
import sys
from IPython.display import clear_output
from nltk.tokenize import TweetTokenizer
sys.path.append("/home/erikt/projects/newsgac/fasttext-runs")
import tscore

In [2]:
# Directory with the tweet text files; read by getTokenCountsFromTweets().
DATADIRTEXT = "/home/erikt/projects/puregome/data/text/"
# Directory that receives the per-date token counts of all tweets.
DATADIRTOKENS = "../data/tokens/"
# Directory that receives the per-date token counts of tweets matching the topic query.
DATADIRTOKENSTOPIC = "../data/tokens-topic/"
# strptime/strftime format used by dateStringToDate() and dateToDateString().
DATEFORMAT = "%Y%m%d"
# Name of the counts column in the token count files.
FREQ = "freq"
# Name of the tweet id column in the tweet text files.
IDSTR = "id_str"
# Default number of items returned by dictTopN().
NBROFEXAMPLES = 20
# Name of the tweet text column in the tweet text files.
TEXT = "text"
# Name of the index column in the token count files.
TOKEN = "token"
# Only referenced from a commented-out line in this notebook.
TOKENFILE = "tokens.csv"
# Not referenced by the code visible in this notebook.
USER = "user"

In [3]:
def squeal(text=None):
    """Clear the output of the current cell and optionally print a message.

    The output is cleared with wait=True, so the old output is only
    replaced once new output arrives. Nothing is printed when text is None.
    """
    clear_output(wait=True)
    if text is None:
        return
    print(text)
        
def tokenize(text):
    """Return the list of tokens that NLTK's TweetTokenizer finds in text."""
    tweetTokenizer = TweetTokenizer()
    return tweetTokenizer.tokenize(text)

def isInterestingToken(token):
    """Tell whether a token should be counted.

    A token is interesting when it contains at least one lower case ASCII
    letter and does not contain the string "http". The result is only meant
    to be used as a truth value: it is None when the token has no letter,
    otherwise True or False.
    """
    letterMatch = re.search(r"[a-z]",token)
    if not letterMatch:
        # No letter at all: hand back the failed match (None), which is falsy.
        return letterMatch
    return re.search("http",token) is None

def getTokenCountsFromTweets(datePattern,query="",baseDict=None):
    """Count for every token the number of matching tweets that contain it.

    All files in DATADIRTEXT whose name matches the regular expression
    datePattern are read. A tweet is used when its text matches the regular
    expression query. Each token is counted at most once per tweet, and only
    tokens accepted by isInterestingToken() are counted.

    Args:
        datePattern: regular expression selecting the file names to read.
        query: regular expression a tweet text must match; the default ""
            matches every tweet.
        baseDict: optional dict with start counts; it is copied, never changed.

    Returns:
        dict mapping each token to its tweet count.
    """
    # NOTE(review): query is matched case-sensitively against the raw text,
    # while the tokens are taken from the lower-cased text - confirm intended.
    tokenCounts = {} if baseDict is None else dict(baseDict)
    for inFileName in sorted(os.listdir(DATADIRTEXT)):
        if not re.search(datePattern,inFileName):
            continue
        squeal(inFileName)
        df = pd.read_csv(DATADIRTEXT+inFileName,index_col=IDSTR)
        # Walk the text column directly instead of building a row object per tweet.
        for text in df[TEXT]:
            # Empty text cells are read as NaN (a float), which re.search cannot handle.
            if not isinstance(text,str) or not re.search(query,text):
                continue
            for token in set(tokenize(text.lower())):
                if isInterestingToken(token):
                    tokenCounts[token] = tokenCounts.get(token,0)+1
    return(tokenCounts)

def dictTopN(dictionary,N=NBROFEXAMPLES):
    """Return the first N items of dictionary as (value, key) pairs.

    The items are taken in the iteration order of the dictionary, so the
    dictionary has to be sorted beforehand to obtain a real top N.
    """
    firstItems = list(dictionary.items())[:N]
    return [(value,key) for key,value in firstItems]

In [4]:
# Keys of the dict built by makeTscoreData().
# Sum of all token frequencies.
NBROFTOKENS = "totalFreq"
# Number of different tokens.
NBROFTYPES = "nbrOfWords"
# Dict with the frequency of each token.
WORDFREQS = "wordFreqs"

def makeTscoreData(tokenList):
    """Build the data structure that is handed to tscore.computeTscore().

    tokenList maps tokens to frequencies. Tokens with a NaN frequency are
    left out. The result holds the kept frequencies (WORDFREQS), their
    number (NBROFTYPES) and their sum (NBROFTOKENS).
    """
    wordFreqs = {}
    totalFreq = 0
    for token in tokenList:
        freq = tokenList[token]
        if math.isnan(freq):
            continue
        wordFreqs[token] = freq
        totalFreq += freq
    return {NBROFTOKENS: totalFreq, NBROFTYPES: len(wordFreqs), WORDFREQS: wordFreqs}

def sortTscores(tscores):
    """Return a copy of tscores with the items ordered by descending score.

    Tokens with equal scores keep their original relative order.
    """
    itemsByScore = sorted(tscores.items(),key=lambda item: item[1],reverse=True)
    return dict(itemsByScore)

In [5]:
def makeDateFromFileName(fileName):
    """Return the first eight characters of fileName, used as its date string."""
    dateLength = 8
    return fileName[:dateLength]

def sortDict(myDict):
    """Return a copy of myDict with the keys ordered by descending value.

    Keys with equal values keep their original relative order.
    """
    keysByValue = sorted(myDict,key=lambda key: myDict[key],reverse=True)
    return {key: myDict[key] for key in keysByValue}

def writeTokenCounts(tokenCounts,outFileName):
    """Write a token count dict to outFileName as CSV.

    The file gets one row per token: the index column is labelled TOKEN
    and the count column is labelled FREQ.
    """
    # One row holding all counts, flipped so that every token gets its own row.
    countsPerToken = pd.DataFrame.from_dict([tokenCounts]).T
    countsPerToken.to_csv(outFileName,index_label=TOKEN,header=[FREQ])
    
def readTokenCounts(date,dataDir=DATADIRTOKENS):
    """Read the token counts of one date and return them as a dict.

    The file name is dataDir + date + FILESUFFIX; FILESUFFIX is a global
    that has to be defined before this function is called. The result maps
    each value of the TOKEN column to its value in the FREQ column.
    """
    inFileName = dataDir+date+FILESUFFIX
    countsTable = pd.read_csv(inFileName,index_col=TOKEN)
    columns = countsTable.to_dict(orient="dict")
    return columns[FREQ]

def combineTokenCounts(tokenCountsList):
    """Add up a list of token count dicts.

    Args:
        tokenCountsList: list of dicts mapping tokens to counts.

    Returns:
        A new dict with, for every token, the sum of its counts over all
        dicts in the list; an empty dict for an empty list. The input dicts
        are not modified.
    """
    tokenCountsOut = {}
    for tokenCounts in tokenCountsList:
        for token,count in tokenCounts.items():
            # Add the count of the dict being merged (the loop variable), not of
            # some unrelated global.
            tokenCountsOut[token] = tokenCountsOut.get(token,0)+count
    return(tokenCountsOut)

## Store token counts for all tweets

In [9]:
# Regular expression selecting the tweet files to process.
FILEPATTERN = "20200"
# Suffix of the token count files; also read by readTokenCounts().
FILESUFFIX = ".csv.gz"

# Dates handled in this run. Several files can share a date, and
# getTokenCountsFromTweets(date) already reads all of them, so every date
# needs to be checked only once.
seen = {}
inFileNames = sorted(os.listdir(DATADIRTEXT))
for inFileName in inFileNames:
    if not re.search(FILEPATTERN,inFileName):
        continue
    date = makeDateFromFileName(inFileName)
    if date in seen:
        continue
    seen[date] = True
    outFileName = DATADIRTOKENS+date+FILESUFFIX
    # Existing count files are kept, so an interrupted run can be resumed.
    if not os.path.exists(outFileName):
        tokenCounts = sortDict(getTokenCountsFromTweets(date))
        writeTokenCounts(tokenCounts,outFileName)

## Store token counts for tweets with topic words

In [None]:
# Regular expression selecting the tweet files to process.
FILEPATTERN = "20200"
# Suffix of the token count files; also read by readTokenCounts().
FILESUFFIX = ".csv.gz"
# Regular expression a tweet text must match to count as a topic tweet.
QUERYTOPIC = r"corona|covid|mondkapje|rivm|blijfthuis|houvol|huisarts|flattenthecurve"

# Dates handled in this run. Several files can share a date, and
# getTokenCountsFromTweets(date, ...) already reads all of them, so every
# date needs to be checked only once.
seen = {}
inFileNames = sorted(os.listdir(DATADIRTEXT))
for inFileName in inFileNames:
    if not re.search(FILEPATTERN,inFileName):
        continue
    date = makeDateFromFileName(inFileName)
    if date in seen:
        continue
    seen[date] = True
    outFileName = DATADIRTOKENSTOPIC+date+FILESUFFIX
    if not os.path.exists(outFileName):
        # An empty placeholder file is written before the slow counting starts -
        # presumably to make a second, parallel run skip this date; TODO confirm.
        # Note that an interrupted run leaves the empty file behind, and later
        # runs then skip the date.
        writeTokenCounts({},outFileName)
        tokenCounts = sortDict(getTokenCountsFromTweets(date,query=QUERYTOPIC))
        writeTokenCounts(tokenCounts,outFileName)

In [101]:
# Directory with the sentiment score files; read by getSentimentQuery().
DATADIRSENT = "../data/sentiment/pattern/"
# Default file name pattern of getSentimentQuery(): the empty pattern matches every file.
DEFAULTFILEPATTERN = ""
# Keys of the per-file summary dicts built in getSentimentQuery().
SENTIMENT = "sentiment"
COUNT = "count"

def getSentimentQuery(query,filePattern=DEFAULTFILEPATTERN):
    """Compute the average sentiment score of the tweets matching a query.

    For every file in DATADIRSENT whose name matches filePattern, the
    sentiment scores (column 0: tweet id, column 1: score, no header row)
    are combined with the file of the same name in DATADIRTEXT. Tweets
    whose text matches query (case-insensitive) and whose id has a score
    are averaged per file; the result is the count-weighted average of
    these file averages.

    Args:
        query: regular expression searched for in the tweet texts.
        filePattern: regular expression selecting the file names to use.

    Returns:
        The average sentiment score, or None when no tweet matched.
    """
    queryRegex = re.compile(query,flags=re.IGNORECASE)
    sentimentPerHour = {}
    for inFileName in sorted(os.listdir(DATADIRSENT)):
        if not re.search(filePattern,inFileName):
            continue
        squeal(inFileName)
        try:
            dfSent = pd.read_csv(DATADIRSENT+inFileName,header=None)
            dfText = pd.read_csv(DATADIRTEXT+inFileName)
        except Exception:
            # Best effort: skip files that are missing or unreadable. Unlike a
            # bare except, this does not swallow KeyboardInterrupt, so the
            # long-running loop can still be stopped from the notebook.
            continue
        if IDSTR not in dfText.columns:
            # Without ids no tweet of this file can be linked to a score.
            continue
        # Build the lookup from the columns: fetching whole rows with .iloc
        # converts a mixed int/float row to floats, which rounds large
        # integer ids and makes the lookups below fail.
        dictSent = dict(zip(dfSent[0],dfSent[1]))
        sentScores = {}
        for idStr,text in zip(dfText[IDSTR],dfText[TEXT]):
            # Empty text cells are read as NaN (a float) and cannot be searched.
            if not isinstance(text,str):
                continue
            if idStr in dictSent and queryRegex.search(text):
                sentScores[idStr] = dictSent[idStr]
        if len(sentScores) > 0:
            sentiment = sum(sentScores.values())/len(sentScores)
            # The first 11 characters of the file name identify the hour, e.g. "20200316-23".
            hour = inFileName[0:11]
            sentimentPerHour[hour] = { SENTIMENT:sentiment, COUNT:len(sentScores) }
    totalSentiment = sum([hourData[SENTIMENT]*hourData[COUNT] for hourData in sentimentPerHour.values()])
    totalCount = sum([hourData[COUNT] for hourData in sentimentPerHour.values()])
    if totalCount == 0: return(None)
    else: return(totalSentiment/totalCount)

In [7]:
def dateStringToDate(dateString):
    """Parse a date string in DATEFORMAT into a datetime.datetime object."""
    parsedDate = datetime.datetime.strptime(dateString,DATEFORMAT)
    return parsedDate

def dateToDateString(date):
    """Format a datetime.datetime object as a date string in DATEFORMAT."""
    dateString = datetime.datetime.strftime(date,DATEFORMAT)
    return dateString

In [136]:
# Date under study; it is compared with the day before it.
TARGETDATE = "20200316"
PREVIOUSDATE = dateToDateString(dateStringToDate(TARGETDATE)-datetime.timedelta(days=1))

# Token counts of the topic tweets: 1 = previous date, 2 = target date.
tokenCounts1,tokenCounts2 = (readTokenCounts(day,dataDir=DATADIRTOKENSTOPIC)
                             for day in (PREVIOUSDATE,TARGETDATE))

In [137]:
# Convert the token counts of the previous date (1) and the target date (2)
# to the input format of tscore.computeTscore().
tscores1 = makeTscoreData(tokenCounts1)
tscores2 = makeTscoreData(tokenCounts2)
# Show the tokens with the highest scores as (score, token) pairs. The target
# date is passed first and the previous date second - presumably the scores
# measure how characteristic a token is for the first argument; TODO confirm
# against the tscore module.
dictTopN(sortTscores(tscore.computeTscore(tscores2,tscores1)))

[(88.80423155811405, '#coronanederland'),
 (61.75866525996601, '#coronavirusnl'),
 (57.03928319351592, 'toespraak'),
 (45.81650686351358, 'mondmaskers'),
 (42.89354718484161, 'lockdown'),
 (40.26474919586335, '#coronacrisis'),
 (39.534911354114726, '#lockdownnl'),
 (39.04831174086576, 'stand'),
 (38.88487436314697, 'tekst'),
 (38.39747500072537, '#coronapocolypse'),
 (36.65463396509332, '#rutte'),
 (36.63270044050758, 'immunity'),
 (36.49595768230355, 'herd'),
 (34.202150710629, 'groepsimmuniteit'),
 (34.112145489260165, 'coronacrisis'),
 (32.74197005731711, 'speech'),
 (32.32950958711355, 'zaken'),
 (31.007227730050985, '#covidー19'),
 (30.76991036460662, '#toespraak'),
 (29.435913937965974, 'zorgverleners')]

In [138]:
import time

# scores collects the results of successive runs of this cell. Create it when
# it does not exist yet (e.g. on a fresh kernel) instead of failing with a
# NameError; results of earlier runs are kept.
if "scores" not in globals():
    scores = {}

for token in "stand speech zaken".split():
    # Average sentiment of the tweets containing the token in the file(s) of
    # hour 23 of the target date.
    scores[token] = getSentimentQuery(token,filePattern=TARGETDATE+"-23")
    print(scores)
    # Pause - presumably to keep the printed scores readable before the next
    # call clears the cell output; TODO confirm.
    time.sleep(5)

20200316-23.out.gz
{'#coronapocalypse': 0.09766978228924934, 'intensive': 0.030272982893127423, 'familie': 0.03667318441136217, 'malieveld': 0.01722157962261319, 'corona-app': 0.10772500307276305, 'staat': 0.06519107142857139, 'wetenschappelijke': 0.04429563492063492, 'plessen': 0.0767361111111111, 'hazes': -0.025, 'noodfonds': 0.019901315789473687, 'klachten': -0.0741732804232804, 'gegevens': 0.30252083333333324, '@rivm': 0.05990722078327415, 'zorgveleners': None, 'kwijt': -0.007661616161616162, 'zorgverleners': -0.3113715277777778, 'stand': 0.05283312040805441, 'speech': -0.05129367559523808, 'zaken': 0.019641927083333333}


## Old code

In [None]:
# Not referenced by the code visible in this notebook.
DATEPATTERN = "2020031[1245]"
# Default value of the query1 parameter of getExamples().
QUERY = "corona|covid|flattenthecurve|blijfthuis|rivm|mondkapje|huisarts|houvol|zorg"
# Rebinds the constant set near the top of the notebook. Functions defined
# before this line keep the default they captured at definition time; the
# dictTopN() defined later in this cell uses 10.
NBROFEXAMPLES = 10

def getExamples(datePattern,query1=QUERY,query2=""):
    """Collect the tweet texts matching two queries, most frequent first.

    All files in DATADIRTEXT whose name matches the regular expression
    datePattern are read. A tweet text is kept when it matches both
    regular expressions query1 and query2.

    Args:
        datePattern: regular expression selecting the file names to read.
        query1: first regular expression the text must match.
        query2: second regular expression the text must match; the default
            "" matches every text.

    Returns:
        dict mapping each kept tweet text to the number of times it was
        found, ordered by descending count.
    """
    # The names DATADIR and ID used by an older version of this function are
    # not defined in this notebook; DATADIRTEXT and IDSTR are what the other
    # tweet reading code uses.
    tweets = {}
    for inFileName in sorted(os.listdir(DATADIRTEXT)):
        if not re.search(datePattern,inFileName):
            continue
        squeal(inFileName)
        df = pd.read_csv(DATADIRTEXT+inFileName,compression="gzip",index_col=IDSTR)
        for text in df[TEXT]:
            # Empty text cells are read as NaN (a float) and cannot be searched.
            if not isinstance(text,str):
                continue
            if re.search(query1,text) and re.search(query2,text):
                tweets[text] = tweets.get(text,0)+1
    return(dict(sorted(tweets.items(),key=lambda item:item[1],reverse=True)))

def getTokenCountsFromTweets(datePattern,query="",baseDict=None):
    """Count for every token the number of matching tweets that contain it.

    NOTE: running this cell replaces the function of the same name defined
    earlier in the notebook. Unlike that version, this one keeps every token
    containing a lower case ASCII letter, including tokens containing "http".

    All files in DATADIRTEXT whose name matches the regular expression
    datePattern are read. A tweet is used when its text matches the regular
    expression query. Each token is counted at most once per tweet.

    Args:
        datePattern: regular expression selecting the file names to read.
        query: regular expression a tweet text must match; the default ""
            matches every tweet.
        baseDict: optional dict with start counts; it is copied, never
            changed. Accepted so that calls written for the earlier
            definition keep working.

    Returns:
        dict mapping each token to its tweet count.
    """
    # The name DATADIR used by an older version of this function is not
    # defined in this notebook; DATADIRTEXT is the tweet text directory.
    tokenCounts = {} if baseDict is None else dict(baseDict)
    for inFileName in sorted(os.listdir(DATADIRTEXT)):
        if not re.search(datePattern,inFileName):
            continue
        squeal(inFileName)
        df = pd.read_csv(DATADIRTEXT+inFileName,index_col=IDSTR)
        for text in df[TEXT]:
            # Empty text cells are read as NaN (a float), which re.search cannot handle.
            if not isinstance(text,str) or not re.search(query,text):
                continue
            for token in set(tokenize(text.lower())):
                if re.search(r"[a-z]",token):
                    tokenCounts[token] = tokenCounts.get(token,0)+1
    return(tokenCounts)

def writeData(data,fileName):
    """Write data to fileName as CSV, with the index column labelled TOKEN.

    data can be anything the pandas DataFrame constructor accepts.
    """
    table = pd.DataFrame(data)
    table.to_csv(fileName,index_label=TOKEN)
    
def readData(fileName):
    """Read a CSV file indexed by its TOKEN column and return it as a dict.

    The result maps every column name to a dict from index value to cell value.
    """
    table = pd.read_csv(fileName,index_col=TOKEN)
    return table.to_dict()

def dictTopN(dictionary,N=NBROFEXAMPLES):
    """Return the first N items of dictionary as (value, key) pairs.

    The items are taken in the iteration order of the dictionary, so the
    dictionary has to be sorted beforehand to obtain a real top N.
    """
    pairs = []
    for key,value in list(dictionary.items())[:N]:
        pairs.append((value,key))
    return pairs

In [None]:
#tokens = readData(TOKENFILE)
# Build the data of each month and store it in tokens<month>.csv. After the
# loop, tokens holds the data of the last month only.
# NOTE(review): makeData() is not defined in the visible part of this
# notebook; this cell raises a NameError unless it is defined elsewhere.
for month in "202003 202004 202005".split():
    print(month)
    tokens = makeData(month)
    writeData(tokens,"tokens"+month+".csv")

In [None]:
# Convert the token counts of every date in tokens to the input format of
# tscore.computeTscore().
tscoreData = {date: makeTscoreData(tokens[date]) for date in tokens}

In [None]:
# The 20 highest scoring tokens, as (score, token) pairs, for 20200312 with 20200311 as second argument.
dictTopN(sortTscores(tscore.computeTscore(tscoreData["20200312"],tscoreData["20200311"])),N=20)

In [None]:
# The 20 highest scoring tokens, as (score, token) pairs, for 20200315 with 20200314 as second argument.
dictTopN(sortTscores(tscore.computeTscore(tscoreData["20200315"],tscoreData["20200314"])),N=20)

## Analysis

There is a clear impact of the national press conferences on the topic tweets of the two volume peak dates. On 20200312, both the event (*persconferentie*) and the speakers (*rutte* and *kabinet*) are present in the top 20 words selected by the tscore measure. The most important topic in the tweets was the discussion about school closures (*scholen*, *kinderen*, *onderwijs*, *sluiten*, *ouders* and *onderwijspersoneel*). On 20200315 the main topic is the closure of bars and restaurants (*horeca*).

In [None]:
# First items, as (count, text) pairs, of the tweets from files matching 20200312 whose text matches both QUERY and "scholen".
dictTopN(getExamples("20200312",query1=QUERY,query2="scholen"))

In [None]:
# First items, as (count, text) pairs, of the tweets from files matching 20200313 whose text matches both QUERY and "scholen".
dictTopN(getExamples("20200313",query1=QUERY,query2="scholen"))