# Topic keywords

Determine which words are characteristic of topic tweets in different time frames, by comparing the tweets that match a topic query with the tweets that do not.

In [1]:
import csv
import math
import os
import pandas as pd
import re
import sys
from IPython.display import clear_output
from nltk.tokenize import TweetTokenizer
sys.path.append("/home/erikt/projects/newsgac/fasttext-runs")
import tscore

In [2]:
# Directory that is scanned for gzip-compressed CSV files with tweets.
DATADIR = "/home/erikt/projects/puregome/data/text/"
# Column names of the tweet files.
ID = "id_str"  # id of the tweet
REPLYID = "in_reply_to_status_id_str"  # id of the tweet that is replied to (missing if not a reply)
TEXT = "text"  # text of the tweet
# Name of the index column of the token files handled by readData/writeData.
TOKEN = "token"
USER = "user"

In [3]:
def squeal(text=None):
    """Clear the output of the current cell and then show `text`, unless it is None."""
    clear_output(wait=True)
    if text is None:
        return
    print(text)

## Count tweets with topic words

In [4]:
def countTweets(datePattern,query):
    """Count the tweets whose text matches `query` in the selected data files.

    datePattern: regular expression; only files in DATADIR whose name matches it are read.
    query: regular expression that is searched for in the text of every tweet.
    Returns the number of matching tweets, summed over all selected files.
    """
    count = 0
    for inFileName in sorted(os.listdir(DATADIR)):
        if not re.search(datePattern,inFileName): continue
        squeal(inFileName)
        df = pd.read_csv(DATADIR+inFileName,compression="gzip",index_col=ID)
        # Walk over the text column directly instead of building a row object per tweet.
        for text in df[TEXT]:
            # A tweet without text is read as NaN (a float), which re.search cannot handle.
            if isinstance(text,str) and re.search(query,text): count += 1
    return(count)

In [5]:
def countTweetsReplies(datePattern,query):
    """Count the tweets that match `query` plus the replies to already counted tweets.

    datePattern: regular expression; only files in DATADIR whose name matches it are read.
    query: regular expression that is searched for in the text of every tweet.
    Returns the number of selected tweets. A reply is selected when its parent was
    selected earlier, so files and rows are processed in sorted/stored order.
    """
    count = 0
    selectedIds = set()
    for inFileName in sorted(os.listdir(DATADIR)):
        if not re.search(datePattern,inFileName): continue
        squeal(inFileName)
        # Read every column as a string: when ids are parsed as numbers, the reply column
        # becomes float because of its missing values, and large ids then no longer equal
        # the integer ids of the tweets themselves, so no reply would ever be recognised.
        df = pd.read_csv(DATADIR+inFileName,compression="gzip",dtype=str)
        for idstr,text,replyParent in zip(df[ID],df[TEXT],df[REPLYID]):
            # Missing text is NaN (a float), which re.search cannot handle.
            textMatches = isinstance(text,str) and re.search(query,text)
            if textMatches or replyParent in selectedIds:
                count += 1
                selectedIds.add(idstr)
    return(count)

In [None]:
FILEPATTERN = "20200522"

# Whitespace-separated list of queries; each one is counted on its own for the selected files.
queries = "corona corona|covid corona|covid|flattenthecurve corona|covid|blijfthuis corona|covid|rivm\
              corona|covid|mondkapje corona|covid|huisarts corona|covid|houvol corona|covid|zorg".split()
for query in queries:
    count = countTweets(FILEPATTERN,query)
    print(count,query)

In [None]:
FILEPATTERN = "20200522"

# Same queries as for countTweets, but replies to selected tweets are counted as well.
queriesWithReplies = "corona corona|covid corona|covid|flattenthecurve corona|covid|blijfthuis corona|covid|rivm\
              corona|covid|mondkapje corona|covid|huisarts corona|covid|houvol corona|covid|zorg".split()
for query in queriesWithReplies:
    count = countTweetsReplies(FILEPATTERN,query)
    print(count,query)

**Note**: the query "zorg" (care) produces many false positives. It matches irrelevant word types ("bezorgd" and "bezorgen"), irrelevant syntactic uses ("ik zorg dat" and "Zorg dat je"), and even the intended sense is not always related to the pandemic topic ("zorg voor ouderen/gehandicapten").

## Find other relevant words in topic tweets

In [12]:
# Prefix characters of mentions and hashtags; combineInitialHashAt merges the counts of
# tokens starting with one of them into the counts of the token without the prefix.
AT = r"@"
HASH = r"#"

def getTokensOfMatchedTweets(filePattern,query):
    """Count lower-cased tokens separately for tweets that match `query` and for the others.

    filePattern: regular expression; only files in DATADIR whose name matches it are read.
    query: regular expression searched for in the tweet text (before lower-casing).
    Returns the tuple (matchTokens,nonMatchTokens): two dictionaries mapping a token to
    its frequency in the matching and in the non-matching tweets.
    """
    # The tokenizer is created with default options only, so one instance can be reused.
    tokenizer = TweetTokenizer()
    matchTokens = {}
    nonMatchTokens = {}
    for inFileName in sorted(os.listdir(DATADIR)):
        if not re.search(filePattern,inFileName): continue
        squeal(inFileName)
        df = pd.read_csv(DATADIR+inFileName,compression="gzip",index_col=ID)
        # Walk over the text column directly instead of building a row object per tweet.
        for rawText in df[TEXT]:
            # Replace a literal backslash followed by "n" by a space
            # (presumably an escaped line break in the stored text; verify against the crawler).
            text = re.sub("\\\\n"," ",str(rawText))
            # Choose the frequency table once; the counting itself is the same for both.
            tokenCounts = matchTokens if re.search(query,text) else nonMatchTokens
            for token in tokenizer.tokenize(text.lower()):
                tokenCounts[token] = tokenCounts.get(token,0)+1
    return(matchTokens,nonMatchTokens)


def readData(fileName):
    """Read a token table from a CSV file and return it as a dictionary of columns.

    fileName: CSV file with a TOKEN column, which becomes the key of the inner dictionaries.
    Returns {columnName: {token: value}}; empty fields are returned as NaN.
    """
    # Only the empty field counts as missing. With the pandas defaults, tokens such as
    # "nan", "null" and "n/a" would be converted to NaN and collapse into a single key.
    return(pd.read_csv(fileName,index_col=TOKEN,keep_default_na=False,na_values=[""]).to_dict())

def writeData(data,fileName):
    """Store `data` as a CSV file; the index is written to a column named TOKEN."""
    table = pd.DataFrame(data)
    table.to_csv(fileName,index_label=TOKEN)
    
def findKeysStartingWithChar(data,char):
    """Return the keys of `data` that also occur in `data` with `char` in front of them.

    data: dictionary whose keys are examined.
    char: prefix; it is used as part of a regular expression and is expected to match
          one character, because exactly one character is removed from a matching key.
    Returns the list of keys without prefix (e.g. "corona" when both "#corona" and
    "corona" are keys and char is "#"), in the order in which the prefixed keys occur.
    """
    prefixPattern = re.compile(r"^"+char+r"\w")
    keysStartingWithChar = []
    for key in data:
        # Keys that are not strings (e.g. NaN after reading a CSV file) cannot carry the
        # prefix; skip them explicitly instead of hiding every error with a bare except.
        if not isinstance(key,str): continue
        if prefixPattern.search(key):
            shortKey = key[1:]
            if shortKey in data: keysStartingWithChar.append(shortKey)
    return(keysStartingWithChar)

def _isMissingCount(value):
    """Tell whether a count is a NaN placeholder for "no count available"."""
    return(isinstance(value,float) and math.isnan(value))

def combineKeysStartingWithChar(data,char,keysStartingWithChar):
    """Add the count of every key char+key to the count of key and remove char+key.

    data: dictionary with counts; it is changed in place and also returned.
    char: prefix of the keys that are merged away.
    keysStartingWithChar: keys (without prefix) for which both key and char+key are in data.
    Raises KeyError when key or char+key is missing from data.
    """
    for key in keysStartingWithChar:
        prefixedCount = data.pop(char+key)
        # A missing count (NaN) must not wipe out a real one: NaN + x is NaN, and
        # such a token would be dropped later although it does have a frequency.
        if _isMissingCount(prefixedCount):
            data[key] = data[key]  # nothing to add; still raises KeyError if key is absent
        elif _isMissingCount(data[key]):
            data[key] = prefixedCount
        else:
            data[key] += prefixedCount
    return(data)

def combineInitialHashAt(data):
    """Merge the counts of "#token" and then of "@token" into the counts of "token".

    Only prefixed tokens whose unprefixed form is also a key of `data` are merged.
    `data` is passed through combineKeysStartingWithChar and the result is returned.
    """
    for prefix in (HASH,AT):
        mergeableKeys = findKeysStartingWithChar(data,prefix)
        data = combineKeysStartingWithChar(data,prefix,mergeableKeys)
    return(data)

In [5]:
# Default number of items returned by dictTopN and dictBottomN.
NBROFEXAMPLES = 20

def dictTopN(dictionary,n=NBROFEXAMPLES):
    """Return the first n items of `dictionary` as (value,key) pairs, in dictionary order.

    The dictionary is not sorted here; pass a sorted dictionary to get the top items.
    """
    return([(value,key) for key,value in dictionary.items()][0:n])

def dictBottomN(dictionary,n=NBROFEXAMPLES):
    """Return the last n items of `dictionary` as (value,key) pairs, in dictionary order.

    The dictionary is not sorted here; pass a sorted dictionary to get the bottom items.
    """
    # The slice [-0:] is the complete list, so zero items need a separate case.
    if n == 0: return([])
    return([(value,key) for key,value in dictionary.items()][-n:])

In [6]:
# Keys of the dictionary built by makeTscoreData.
NBROFTOKENS = "totalFreq"
NBROFTYPES = "nbrOfWords"
WORDFREQS = "wordFreqs"

def makeTscoreData(tokenList):
    """Summarise token frequencies: total frequency, number of different tokens, frequencies.

    tokenList: dictionary mapping a token to its frequency; NaN frequencies are left out.
    Returns a dictionary with the keys NBROFTOKENS, NBROFTYPES and WORDFREQS.
    """
    wordFreqs = {token:freq for token,freq in tokenList.items() if not math.isnan(freq)}
    return({ NBROFTOKENS:sum(wordFreqs.values()), NBROFTYPES:len(wordFreqs), WORDFREQS:wordFreqs })

def sortTscores(tscores):
    """Return a copy of `tscores` with the items ordered from highest to lowest score.

    Items with equal scores keep their original relative order.
    """
    rankedItems = sorted(tscores.items(),key=lambda item:item[1],reverse=True)
    return(dict(rankedItems))

In [18]:
# File name patterns of the periods that have been analysed. Only the LAST assignment
# to FILEPATTERN is in effect; reorder the lines to select another period.
FILEPATTERN = "2020102[56789]|2020103"
FILEPATTERN = "2020110[1-7]"
FILEPATTERN = "2020110[89]|2020111[0-4]"
FILEPATTERN = "2020101[89]|2020102[0-4]"
# Available queries (regular expressions searched for in the tweet text).
QUERYDISTANCE= r"1[.,]5[ -]*m|afstand.*hou|hou.*afstand|anderhalve[ -]*meter"
QUERYTOPIC = "corona|covid|mondkapje|rivm|blijfthuis|houvol|huisarts|flattenthecurve"
QUERYTOPICSMALL = "corona|covid"
QUERYTRAVEL = "reis|reizen"
QUERYWASHHANDS = "hand.*was|was.*hand"
# The query that is actually used in this cell.
QUERY = QUERYTOPIC

# All results are stored per file pattern. The dictionaries are created anew here,
# so results of earlier runs of this cell are discarded.
queryTokens = {}
nonQueryTokens = {}
tscoresDataQuery = {}
tscoresDataNonQuery = {}
# Token frequencies of the tweets matching QUERY and of the tweets not matching it.
queryTokens[FILEPATTERN],nonQueryTokens[FILEPATTERN] = getTokensOfMatchedTweets(FILEPATTERN,QUERY)
# Frequency summaries that are passed to tscore.computeTscore in the cells below.
tscoresDataQuery[FILEPATTERN] = makeTscoreData(queryTokens[FILEPATTERN])
tscoresDataNonQuery[FILEPATTERN] = makeTscoreData(nonQueryTokens[FILEPATTERN])

20201024-23.out.gz


In [20]:
# 20201018
weekTscores = tscore.computeTscore(tscoresDataQuery[FILEPATTERN],tscoresDataNonQuery[FILEPATTERN])
# Print only the tokens of the 200 highest scores, all on one line.
for _,token in dictTopN(sortTscores(weekTscores),n=200):
    print(token,end=" ")

corona mondkapje mondkapjes #corona #coronamaatregelen covid @rivm coronavirus #coronavirus coronacrisis dragen 19 coronamaatregelen de huisarts maatregelen #covid19 in ziekenhuizen #mondkapjes het virus rivm door @hugodejonge besmettingen ziekenhuis coronabesmettingen aantal positief #covid19nl griep mensen test @minpres getest coronapatiënten coronatijd #lockdown #mondkapje alle zorg tweede coronabeleid #spoedwet patiënten golf " coronaregels nieuwe huisartsen lockdown coronagolf cijfers #coronacrisis mondkapjesplicht positieve besmet testen wéér verplicht maanden vaccin #blijfthuis worden we vanwege overleden coronaproof met ic verspreiding dagelijkse afstand #rivm . 😷 % #mondkapjesplicht extra werken stageplek zonder tegen coronatest tijdens scholen #horeca geen #veerkracht https://t.co/nkmurp7uph geheugensteuntje duitsland coronawet #samentegencorona draagt doden zenders @vanbinnenblauw #alleensamen besmetting regio uitgezonden draag wegens 1,5 horeca ondanks #coronawet ggd vrijwe

In [19]:
# 20201018
weekTscores = tscore.computeTscore(tscoresDataQuery[FILEPATTERN],tscoresDataNonQuery[FILEPATTERN])
# Print the 200 highest scores, each rounded to one decimal, followed by the token.
for score,token in dictTopN(sortTscores(weekTscores),n=200):
    print(round(score,1),token)

208.0 corona
132.7 mondkapje
123.3 mondkapjes
120.1 #corona
112.8 #coronamaatregelen
107.3 covid
105.4 @rivm
93.2 coronavirus
87.0 #coronavirus
77.3 coronacrisis
72.7 dragen
71.9 19
71.1 coronamaatregelen
67.6 de
67.0 huisarts
63.2 maatregelen
61.9 #covid19
60.7 in
60.2 ziekenhuizen
59.4 #mondkapjes
59.2 het
58.5 virus
56.7 rivm
56.6 door
55.9 @hugodejonge
55.3 besmettingen
55.1 ziekenhuis
54.3 coronabesmettingen
52.3 aantal
51.1 positief
50.9 #covid19nl
50.7 griep
50.3 mensen
49.7 test
48.6 @minpres
47.6 getest
47.1 coronapatiënten
46.8 coronatijd
46.8 #lockdown
46.7 #mondkapje
46.6 alle
45.7 zorg
44.6 tweede
43.7 coronabeleid
43.2 #spoedwet
43.2 patiënten
43.2 golf
43.0 "
42.2 coronaregels
42.0 nieuwe
41.3 huisartsen
41.0 lockdown
41.0 coronagolf
41.0 cijfers
40.7 #coronacrisis
40.4 mondkapjesplicht
40.3 positieve
39.9 besmet
39.9 testen
39.1 wéér
38.5 verplicht
38.2 maanden
38.2 vaccin
38.1 #blijfthuis
37.6 worden
37.5 we
37.3 vanwege
37.2 overleden
37.1 coronaproof
37.0 met
36.7 ic

In [17]:
# 20201108
weekTscores = tscore.computeTscore(tscoresDataQuery[FILEPATTERN],tscoresDataNonQuery[FILEPATTERN])
# Print the 200 highest scores, each rounded to one decimal, followed by the token.
for score,token in dictTopN(sortTscores(weekTscores),n=200):
    print(round(score,1),token)

164.8 corona
107.5 mondkapje
88.6 mondkapjes
87.5 #coronamaatregelen
82.5 #corona
82.5 covid
73.2 dragen
70.3 coronavirus
69.3 coronacrisis
66.5 @rivm
63.2 #coronavirus
57.0 coronavaccin
55.5 huisarts
55.0 vaccin
50.6 coronamaatregelen
50.2 19
49.5 #covid19
48.9 @hugodejonge
48.8 mensen
48.6 de
47.7 "
47.5 het
47.2 coronatijd
46.9 maatregelen
46.6 aantal
46.5 #covid19nl
44.3 #mondkapjesplicht
44.1 kabinet
44.0 in
43.9 geen
43.8 virus
43.5 huisartsen
42.8 rivm
41.6 verplicht
41.5 we
41.1 door
40.7 tegen
40.3 besmettingen
37.9 na
37.6 mondkapjesplicht
37.5 testen
36.9 coronabesmettingen
36.3 vanwege
36.3 #mondkapjes
36.2 #mondkapje
36.1 coronaregels
36.0 tijdens
35.2 @minpres
34.8 patiënten
34.6 coronapatiënten
34.4 boete
34.3 #coronavaccin
34.3 jonge
34.2 griep
33.6 reizen
33.6 test
33.3 overheid
32.8 pfizer
32.8 ziekenhuis
32.7 positieve
32.6 medische
32.5 #rivm
31.1 #coronacrisis
30.9 vaccineren
30.7 straks
30.7 #vaccin
30.5 ziekenhuizen
30.4 draag
30.4 aanpak
30.3 positief
30.2 artse

In [14]:
# 20201101
weekTscores = tscore.computeTscore(tscoresDataQuery[FILEPATTERN],tscoresDataNonQuery[FILEPATTERN])
# Print the 200 highest scores, each rounded to one decimal, followed by the token.
for score,token in dictTopN(sortTscores(weekTscores),n=200):
    print(round(score,1),token)

181.1 corona
108.0 #coronamaatregelen
97.8 #corona
90.9 mondkapje
90.5 covid
87.6 mondkapjes
79.1 coronavirus
69.5 coronamaatregelen
68.8 #coronavirus
68.3 coronacrisis
67.6 maatregelen
66.7 de
65.8 @rivm
62.8 aantal
60.0 mensen
59.0 19
57.8 #coronadebat
55.2 #covid19
54.9 nieuwe
53.3 coronabesmettingen
52.2 dragen
52.2 zorg
52.0 kabinet
50.2 #covid19nl
49.8 huisarts
49.3 virus
47.4 @hugodejonge
47.4 extra
46.8 #blijfthuis
46.3 in
46.1 vanwege
45.8 #ditismijnzorg
45.5 #persconferentie
44.8 nertsen
44.7 coronatijd
44.6 door
44.2 denemarken
43.9 besmet
43.8 ziekenhuizen
43.1 besmettingen
42.1 #mondkapjes
41.4 testen
41.1 het
41.1 weken
40.9 positief
38.6 positieve
38.6 coronapatiënten
38.5 artsen
38.2 griep
38.0 coronaregels
37.9 rivm
37.7 feesten
37.5 daling
37.2 @minpres
36.6 met
36.6 patiënten
36.4 gelden
36.3 geen
36.0 #lockdown
35.9 ⤵
35.9 #coronacrisis
35.7 voor
35.6 #alleensamen
35.3 doden
35.3 coronabonus
35.3 meer
35.1 stijging
35.1 lockdown
34.9 @djmichaelamani
34.7 feestdag
34

In [10]:
# 20201025
weekTscores = tscore.computeTscore(tscoresDataQuery[FILEPATTERN],tscoresDataNonQuery[FILEPATTERN])
# Print the 200 highest scores, each rounded to one decimal, followed by the token.
for score,token in dictTopN(sortTscores(weekTscores),n=200):
    print(round(score,1),token)

204.9 corona
122.0 #corona
108.0 #coronamaatregelen
100.7 covid
100.3 mondkapje
95.1 mondkapjes
89.2 #coronadebat
88.7 de
88.5 #coronavirus
86.2 @rivm
82.6 coronavirus
82.3 coronamaatregelen
74.2 coronacrisis
71.7 huisarts
67.2 maatregelen
64.8 #covid19
62.2 door
61.5 in
56.6 #covid19nl
55.6 het
55.4 ziekenhuis
55.4 19
53.9 zorg
53.7 aantal
53.4 #spoedwet
52.7 griep
52.6 rivm
51.6 we
50.7 coronapatiënten
50.1 tegen
50.0 dragen
49.9 ziekenhuizen
49.0 virus
48.8 besmettingen
48.1 @hugodejonge
47.7 mensen
47.1 beu
46.9 #lockdown
46.5 coronabesmettingen
46.2 coronatijd
46.1 kabinet
45.0 patiënten
42.2 #samentegencorona
41.6 zorgmedewerkers
41.5 horeca
41.3 positief
40.9 coronaregels
39.9 #mondkapjes
39.8 ⁩
39.7 #coronacrisis
39.6 ⁦
39.6 besmet
39.4 #mondkapje
39.1 huisartsen
39.0 #zorgpersoneel
38.7 kamer
38.6 #coronawet
38.6 testen
38.2 vanwege
38.1 #pvv
38.0 #covid
37.7 over
37.4 @minpres
37.3 extra
37.1 coronabeleid
37.0 getest
36.5 geen
35.8 cijfers
35.6 positieve
35.4 verspreiding
35.

In [15]:
# The 20 lowest scores: tokens that are most typical for the tweets NOT matching the query.
weekTscores = tscore.computeTscore(tscoresDataQuery[FILEPATTERN],tscoresDataNonQuery[FILEPATTERN])
for score,token in dictBottomN(sortTscores(weekTscores),n=20):
    print(round(score,1),token)

-79.0 rel
-80.9 moslims
-81.1 michigan
-81.8 uv
-84.9 profeet
-87.3 c
-88.9 rt
-90.6 temp
-90.7 wind
-91.9 trump
-93.7 vochtigheid
-103.8 biden
-104.9 😘
-111.2 luchtdruk
-114.9 ik
-116.2 😂
-123.7 hpa
-132.0 0.0
-137.0 mm
-140.6 °


In [None]:
# Files with token frequencies per week, for the tweets matching the query.
FILENAME202002 = "query-tokens-202002-202003.csv"
FILENAME202004 = "query-tokens-202004-202005.csv"
# Prefix of the file names with the token frequencies of the non-matching tweets.
NON = "non-"
# Maps the file name pattern of a week to the date (YYYYMMDD) that represents the week
# in the plot. The patterns are also used as lookup keys (WEEKS[date] in the plotting
# cell, with the dates taken from the tables read from file), so they have to stay
# identical to the column names of the saved files.
# NOTE(review): the pattern "20200329|20200330|2020040[1-4]" does not match 20200331;
# confirm whether that day was left out on purpose before changing the key.
WEEKS = {"2020020[2-8]":"20200202","20200209|2020021[0-5]":"20200209","2020021[6-9]|2020022[0-2]":"20200216","2020022[3-9]":"20200223",\
         "2020030[1-7]":"20200301","2020030[89]|2020031[0-4]":"20200308","2020031[5-9]|2020032[01]":"20200315","2020032[2-8]":"20200322",\
                                   "20200329|20200330|2020040[1-4]":"20200329",\
         "2020040[5-9]|2020041[01]":"20200405","2020041[2-8]":"20200412","20200419|2020042[0-5]":"20200419","2020042[6-9]|2020043|2020050[1-2]":"20200426",\
         "2020050[3-9]":"20200503","2020051[0-6]":"20200510","2020051[7-9]|2020052[0-3]":"20200517"}

In [None]:
# Compute and print the t-scores for every column (week pattern) of the saved token tables.
# NOTE(review): the variable names end in 202002 but the files that are read are the
# FILENAME202004 ones; confirm which period is intended before rerunning this cell.
queryTokens202002 = readData(FILENAME202004)
nonQueryTokens202002 = readData(NON+FILENAME202004)
# Start with empty result tables; results of earlier cells are discarded.
tscoresDataQuery = {}
tscoresDataNonQuery = {}
tscores = {}
for query in queryTokens202002:
    # Merge "#token" and "@token" into "token" before summarising the frequencies.
    tscoresDataQuery[query] = makeTscoreData(combineInitialHashAt(queryTokens202002[query]))
    tscoresDataNonQuery[query] = makeTscoreData(combineInitialHashAt(nonQueryTokens202002[query]))
    print(query)
    # Sorted from highest to lowest score, so dictTopN returns the highest scores.
    tscores[query] = sortTscores(tscore.computeTscore(tscoresDataQuery[query],tscoresDataNonQuery[query]))
    for x in dictTopN(tscores[query]): print(round(x[0],1),x[1])

In [None]:
# writeData(tscores,"tscores-202004-202005.csv")

In [None]:
# Load the saved t-score tables of both periods in one statement.
tscores202002,tscores202004 = readData("tscores-202002-202003.csv"),readData("tscores-202004-202005.csv")

In [None]:
# Words for which the t-scores are followed over time.
TOPICS = "maatregelen mondkapje anderhalve besmet rivm ".split()

# data[topic][date] is the t-score of the word `topic` in the column `date`,
# with the columns of tscores202002 first and those of tscores202004 after them.
data = {}
for topic in TOPICS:
    # Create the entry before both loops: when it is only created inside the first
    # loop, an empty first table makes the second loop fail with a KeyError.
    data[topic] = {}
    for weekScores in (tscores202002,tscores202004):
        for date in weekScores:
            data[topic][date] = weekScores[date][topic]

In [None]:
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter

# Format of the dates stored as values in WEEKS.
DATEFORMAT = "%Y%m%d"

# One line per topic word: t-score (y) against the date that represents the week (x).
fig,ax = plt.subplots(figsize=(8,4))
ax.xaxis.set_major_formatter(DateFormatter("%d %b"))
for topic in TOPICS:
    # WEEKS translates the week pattern used as key into the date shown on the x axis.
    weekDates = [datetime.strptime(WEEKS[date],DATEFORMAT) for date in data[topic]]
    # plot_date is deprecated in recent Matplotlib versions; plot handles datetime values itself.
    ax.plot(weekDates,list(data[topic].values()),"-",label=topic)
ax.set(xlabel="date (weeks)",ylabel="t-scores")
ax.set_title("t-scores for relevant words, comparing selected topic tweets with unselected tweets")
ax.legend()
plt.show()

**Note**: We expected only positive scores for topic words, but the graph shows that *anderhalve* had negative scores in two weeks. This means that in those weeks most of the tweets containing this word were not among our selected topic tweets. We checked a sample of these missed tweets: most of them turned out to be on-topic, but they lacked the words *corona* and *covid*.

In [None]:
# Show the t-scores per week pattern that were collected for the word "anderhalve".
data["anderhalve"]

In [None]:
# Print the 50 highest t-scores of every week, first for the 202002 table, then for the 202004 one.
for savedTscores in (tscores202002,tscores202004):
    for date,weekTscores in savedTscores.items():
        print("#####",date)
        for score,token in dictTopN(sortTscores(weekTscores),n=50):
            print(round(score,1),token)

In [None]:
queryTokens202004 = readData(FILENAME202004)
nonQueryTokens202004 = readData(NON+FILENAME202004)
# The result tables tscoresDataQuery and tscoresDataNonQuery of earlier cells are extended here.
for query,weekTokens in queryTokens202004.items():
    # Merge "#token" and "@token" into "token" before summarising the frequencies.
    tscoresDataQuery[query] = makeTscoreData(combineInitialHashAt(weekTokens))
    tscoresDataNonQuery[query] = makeTscoreData(combineInitialHashAt(nonQueryTokens202004[query]))
    print(query)
    weekTscores = tscore.computeTscore(tscoresDataQuery[query],tscoresDataNonQuery[query])
    for score,token in dictTopN(sortTscores(weekTscores)):
        print(round(score,1),token)

In [None]:
#writeData(queryTokens,"query-tokens-202004-202005.csv")
#writeData(nonQueryTokens,"non-query-tokens-202004-202005.csv")

In [None]:
query = "2020051[7-9]|2020052[0-3]"
# The 40 highest t-scores of the selected week, rounded to one decimal.
weekTscores = tscore.computeTscore(tscoresDataQuery[query],tscoresDataNonQuery[query])
for score,token in dictTopN(sortTscores(weekTscores),n=40):
    print(round(score,1),token)

In [None]:
FILEPATTERN = "20200601" 
# Query for social-distancing tweets. The separator in "1.5m"/"1,5m" is matched as a
# literal dot or comma: an unescaped dot matches any character, e.g. also "125m".
QUERY= r"1[.,]5m|afstand.*hou|hou.*afstand|anderhalve"

# All results are stored per file pattern; results of earlier cells are discarded.
queryTokens = {}
nonQueryTokens = {}
tscoresDataQuery = {}
tscoresDataNonQuery = {}
queryTokens[FILEPATTERN],nonQueryTokens[FILEPATTERN] = getTokensOfMatchedTweets(FILEPATTERN,QUERY)
tscoresDataQuery[FILEPATTERN] = makeTscoreData(queryTokens[FILEPATTERN])
tscoresDataNonQuery[FILEPATTERN] = makeTscoreData(nonQueryTokens[FILEPATTERN])
# Print the 40 highest t-scores, rounded to one decimal, followed by the token.
for x in dictTopN(sortTscores(tscore.computeTscore(tscoresDataQuery[FILEPATTERN],tscoresDataNonQuery[FILEPATTERN])),n=40): 
    print(round(x[0],1),x[1])

**Note**: the query "zorg" (care) produces many false positives. It matches irrelevant word types ("bezorgd" and "bezorgen"), irrelevant syntactic uses ("ik zorg dat" and "Zorg dat je"), and even the intended sense is not always related to the pandemic topic ("zorg voor ouderen/gehandicapten").

## Estimate coverage of crawler by looking back for messages with replies

In [None]:
DATEPATTERN = "20200522"

# For every selected file, print the fraction of replies whose parent tweet has been
# seen before, in this file or in an earlier selected file.
fileList = sorted(os.listdir(DATADIR))
seenIds = {}
for inFileName in fileList:
    if not re.search(DATEPATTERN,inFileName): continue
    # Ids are read as strings so that they can be compared exactly.
    df = pd.read_csv(DATADIR+inFileName,compression="gzip",dtype=str)
    known = 0
    unknown = 0
    for idstr,replyParent in zip(df[ID],df[REPLYID]):
        seenIds[idstr] = True
        # A tweet that is not a reply has a missing value here, which is not a string.
        if isinstance(replyParent,str):
            if replyParent in seenIds: known += 1
            else: unknown += 1
    nbrOfReplies = known+unknown
    # A file without replies has no defined coverage; print nan instead of dividing by zero.
    coverage = round(known/nbrOfReplies,3) if nbrOfReplies > 0 else float("nan")
    print("{0} {1:.3f}".format(inFileName,coverage))