# Coverage tests for collected tweets

Estimate what percentage of Dutch tweets are collected by twiqs.nl

In [1]:
import math
import os
import pandas as pd
import re
import sys
import multiprocessing as mp
from IPython.display import clear_output

In [2]:
DATADIRTEXT = "/home/erikt/projects/puregome/data/text/"
DATADIRCLOUD = "/home/erikt/projects/puregome/data/text-cloud/"

In [3]:
def squeal(text=None):
    clear_output(wait=True)
    if not text is None: print(text)

## Extimate coverage with reply ids

In [32]:
IDSTR = "id_str"
INREPLYTOSTATUSIDSTR = "in_reply_to_status_id_str"

def getReplyPercentage(filePattern):
    files = sorted(os.listdir(DATADIRTEXT))
    found = 0
    missing = 0
    seenIds = {}
    for inFileName in files:
        if re.search(filePattern,inFileName):
            squeal(inFileName)
            df = pd.read_csv(DATADIRTEXT+inFileName,dtype={IDSTR:object})
            seenIds = {**seenIds,**{idStr:True for idStr in df[IDSTR]}}
    files = sorted(os.listdir(DATADIRCLOUD))
    for inFileName in files:
        if re.search(filePattern,inFileName):
            squeal(inFileName)
            df = pd.read_csv(DATADIRCLOUD+inFileName,dtype={INREPLYTOSTATUSIDSTR:object})
            for idStr in df[INREPLYTOSTATUSIDSTR]:
                if idStr == idStr:
                    if idStr in seenIds: found += 1
                    else: missing += 1
    return(found/(found+missing))

In [None]:
filePattern = "202007"
percentage = getReplyPercentage(filePattern)
print(f"target date: {filePattern}; percentage: {percentage}")

20200726-08.out.gz


# Query token language coverage

How many of the tweets containing a query token are written in Dutch?

In [4]:
DATADIRCOVERAGE = "./"
FILEPATTERNCOVERAGE = "20200701-0[0-57-9]|20200701-1[0-7]|20200701-2[12]"
TEXT = "text"
LANG = "lang"
DUTCH = "dutch"
OTHER = "other"
UNKNOWN = "unknown"

def coverage(token):
    files = sorted(os.listdir(DATADIRCOVERAGE))    
    langs = {}
    for inFileName in files:
        if re.search(FILEPATTERNCOVERAGE,inFileName):
            df = pd.read_csv(inFileName,compression="gzip")
            for i in range(0,len(df)):
                try:
                    if re.search(r'\b'+token+r'\b',df.iloc[i][TEXT]):
                        lang = df.iloc[i][LANG]
                        if lang in langs: langs[lang] += 1
                        else: langs[lang] = 1
                except: pass
    return(langs)

def coverageDutchOld(token):
    langs = coverage(token)
    summary = {DUTCH:0,OTHER:0}
    for lang in langs:
        if lang == DUTCH: summary[DUTCH] += langs[lang]
        elif lang != UNKNOWN: summary[OTHER] += langs[lang]
    if summary[DUTCH] == 0: return(0)
    else: return(summary[DUTCH]/(summary[DUTCH]+summary[OTHER]))
    
def coverageDutch(token,tweetsLang):
    summary = {DUTCH:0,OTHER:0}
    for text in tweetsLang.keys():
        if token in text.split():
            if tweetsLang[text] == DUTCH: summary[DUTCH] += 1
            elif tweetsLang[text] != UNKNOWN: summary[OTHER] += 1
    if summary[DUTCH] == 0: return(0)
    else: return(summary[DUTCH]/(summary[DUTCH]+summary[OTHER]))

## Find best keywords

In [5]:
from nltk.tokenize import TweetTokenizer

In [6]:
SKIPTOKENS = "rt amsterdam the woof rosé de en in is van me ben we via nos juno br staysafe spnmarais arcinho kamafotos youtube \
update amp bts_twt blackpink baby and omg best ygofficialblink on ig goal wonderland ten instagram top you wonwoo daniel \
1advancehbdmaheshbabu with open of for vlive ver to netherlands jungkook up joonie jennie from super my kpop klopp gt vlog queen \
nct love by boy bergwijn zoe twitter teen te stop so more master lee hahahaha gogh god ever bruyne an ameen tik this stream slap \
school one oh ni man level ko just im hi here help he duit deluxe day da better beef be art aameen zico yes wtf wtaf winner will \
vn video vc total that team superior street snoop re pubg pink oopsie no new ne namkook moots moon miss maam look link la ka juliet \
jinnie jinkook his hey hahahahaha goals go gemes fan een dus do black back am all aaron zombie yep yeah woop won win weekend war un \
tingz thank tekken taekook sterling star stan sorry snap sex sb19official robin poor pls out onvres onde omfg ok ns now not need \
moonbin mood min mein maap maaf live let legend left laptop ke kangen jimin jan it iq how hot hoodie holy her hee happy hahahahah \
haha ha groningen got gg gente funny ft ff fc engineer eh ee down donde dm damn cute chief can bro big bestie banget at as \
xiaojun gee advancehbdmaheshbabu weareoneexo kang tier straykids kun dream tiktok than kevin jin haechan golden boys actorvijay \
unnie tweets pop only mochi missing like genie don bts alice it's he's i'm let's zone tok she mo made kids howyoulikethat era el \
duo drop angel allah urstrulymahesh winwin exo official dance hours heels has el edit proud".split()
SKIPTOKENS = r", . ! ? : ; - ( ) @ # $ / [ ] < >  ' + = | .. ... ` & \" ".split()
SKIPTOKENS = "😉 ° 🥇 🥈 🥉 – ¿ ¥ ☀ wonderland".split()

In [7]:
FILEPATTERN = "202006"
TEXT = "text"
BASELEXICON = "".split()
ALLCOUNT = 596893 # 5623571 # 22798953 # 22798953
LEFT200 = 61599
LEFT100 = 80258 # 349598 # 1724488 # 1231745
LEFT200100 = 43622 # 179089 # 801171 # 732526
LEFT300200100 = 34583 # 130887 # 557205 # 531315
LEFT400300200100 = 29528 # 109099 # 451231 # 439565
LEFT500400300200100 = 0 # # 394609 # 387447
LEFT600500400300200100 = 0 # # 357003 # 352026
LEFT700600500400300200100 = 0 # # 329054 # 324628
LEFT800700600500400300200100 = 0 # # 306745 # 303422

results = []

def cleanupText(text):
    text = re.sub(r"\\n"," ",text.lower())
    text = re.sub(r"[#@]"," ",text)
    return(text)

def tokenize(tokenizer,text):
    tokens = tokenizer.tokenize(text)
    return(" ".join(sorted(list(set(tokens)))))

def cleanupTokens(text):
    tokens = set(text.split())
    newTokens = []
    for token in tokens:
        if not re.search(r"http",token) and not token in SKIPTOKENS and len(token) > 1: newTokens.append(token)
    return(" ".join(newTokens))

def getTweetsFile(inFileName,dataDir):
    tweets = {}
    squeal(inFileName)
    tokenizer = TweetTokenizer()
    df = pd.read_csv(dataDir+inFileName)
    for i in range(0,len(df)):
        text = cleanupText(df.iloc[i][TEXT])
        text = tokenize(tokenizer,text)
        text = cleanupTokens(text)
        if LANG in df: tweets[text] = df.iloc[i][LANG]
        else: tweets[text] = True
    return(tweets)

def collectResult(result):
    global results
    results.append(result)

def getTweetsParallel(filePattern=FILEPATTERN,dataDir=DATADIR):
    global results
    files = sorted(os.listdir(dataDir))
    inFileNames = []
    for inFileName in files:
        if re.search(filePattern,inFileName): inFileNames.append(inFileName)
    pool = mp.Pool(mp.cpu_count())
    results = [pool.apply(getTweetsFile,args=[inFileName,dataDir]) for inFileName in inFileNames]
    pool.close()
    while len(results) > 1:
        toBeDeleted = []
        for i in range(0,len(results),2):
            if i < len(results)-1: 
                results[i] = {**results[i],**results[i+1]}
                toBeDeleted.append(i+1)
        toBeDeleted.reverse()
        for i in toBeDeleted: del(results[i])
    return(results[0])

def getTweets(filePattern=FILEPATTERN,dataDir=DATADIR):
    files = sorted(os.listdir(dataDir))
    tokenizer = TweetTokenizer()
    tweets = {}
    for inFileName in files:
        if re.search(filePattern,inFileName):
            squeal(inFileName)
            df = pd.read_csv(dataDir+inFileName)
            for i in range(0,len(df)):
                text = cleanupText(df.iloc[i][TEXT])
                text = tokenize(tokenizer,text)
                text = cleanupTokens(text)
                if LANG in df: tweets[text] = df.iloc[i][LANG]
                else: tweets[text] = True
    return(tweets)

In [8]:
tweetsLangOrg = getTweets(filePattern=FILEPATTERNCOVERAGE,dataDir=DATADIRCOVERAGE)
tweetsOrg = getTweetsParallel()

20200630-23.out.gz


In [12]:
tweetsLang = dict(tweetsLangOrg)
tweets = dict(tweetsOrg)
lexicon = {token:0 for token in list(BASELEXICON)}
skipTokens = {s:0 for s in SKIPTOKENS}
processedTweetCounts = []

In [13]:
THRESHOLD = 0.9
MAXLEXICONSIZE = 800

coverageDutchScore = ""
while len(lexicon) < MAXLEXICONSIZE:
    frequencies = {}
    processedTweetCount = 0
    counter = 0
    toBeDeleted = []
    if len(lexicon) > 0: lastToken = list(lexicon.keys())[-1]
    else: lastToken = ""
    for text in tweets.keys():
        counter += 1
        if counter%10000 == 0: squeal(str(counter)+"/"+str(len(tweets))+" "+str(len(lexicon))+" "+lastToken+" "+str(coverageDutchScore))
        tokens = text.split()
        if len(set(lexicon.keys()).intersection(tokens)) >= 1:
            toBeDeleted.append(text)
        else:
            processedTweetCount += 1
            for token in tokens:
                if token in frequencies: frequencies[token] += 1
                else: frequencies[token] = 1
    for text in toBeDeleted:
        del(tweets[text])
    #toBeDeletedLang = []
    #for text in tweetsLang.keys():
    #    tokens = text.split()
    #    if len(set(lexicon).intersection(tokens)) >= 1:
    #        toBeDeletedLang.append(text)
    #for text in toBeDeletedLang:
    #    del(tweetsLang[text])    
    processedTweetCounts.append(processedTweetCount)
    frequencies = {k:frequencies[k] for k in sorted(frequencies.keys(),key=lambda k:frequencies[k],reverse=True)}
    for token in frequencies:
        if not token in skipTokens and len(token) > 1:
            coverageDutchScore = coverageDutch(token,tweetsLang)
            squeal(str(len(lexicon))+" "+str(processedTweetCount)+" "+str(len(skipTokens))+" "+str(frequencies[token])+" "+str(coverageDutchScore)+" "+token)
            if coverageDutchScore >= THRESHOLD:
                lexicon[token] = coverageDutchScore
                break
            else: skipTokens[token] = coverageDutchScore
    squeal(str(len(lexicon))+" "+str(processedTweetCount)+" "+str(len(skipTokens))+" "+str(frequencies[list(lexicon.keys())[-1]])+" "+list(lexicon.keys())[-1]+" "+str(coverageDutchScore))
    
" ".join(list(lexicon.keys()))

800 1651848 1570 252 winter 0.9675324675324676


'de een ik is je en rt het in ... niet van op dat voor met ja die ook me te we maar weer zijn wat zo of lekker wel bij tweet .. goed was open geen mooi echt dan dit 2020 vacature na stop via nog 20 ze fijne mijn ben best over man ga aan amsterdam jij mm nu veel eens mooie he door letsel naar gefeliciteerd dankjewel kan al 01 goedemorgen uit 10 deze toch morgen om :) nee wie zeker af heel even da als den heeft gewoon nemen nieuwe leuk br sterkte er hoor 11 twitter 12 graag hoe heerlijk meer prachtig ah inderdaad 13 eu mij mn p2000 waar dag 18 altijd ver vs blonde hier klopt men ff nos 19 doen goeie geweldig onze wolf dus weg bedankt telegraaf mee moet foto komen waarom ten tot up 15 welkom 14 welke 02 net lang snap 71 jullie team alles fijn goede 17 nooit 16 daar allen 21 :/ been school tz hij vandaag utrecht nou uw 03 allemaal 100 ter 24 black zonder want zeg gaat gaan keren 23 gelukkig dank doe niemand word hè alle nieuw klaar tijd gedaan jaar water geniet had tegen vvd gij auto 99 ha

In [14]:
" ".join(list(lexicon.keys()))

'de een ik is je en rt het in ... niet van op dat voor met ja die ook me te we maar weer zijn wat zo of lekker wel bij tweet .. goed was open geen mooi echt dan dit 2020 vacature na stop via nog 20 ze fijne mijn ben best over man ga aan amsterdam jij mm nu veel eens mooie he door letsel naar gefeliciteerd dankjewel kan al 01 goedemorgen uit 10 deze toch morgen om :) nee wie zeker af heel even da als den heeft gewoon nemen nieuwe leuk br sterkte er hoor 11 twitter 12 graag hoe heerlijk meer prachtig ah inderdaad 13 eu mij mn p2000 waar dag 18 altijd ver vs blonde hier klopt men ff nos 19 doen goeie geweldig onze wolf dus weg bedankt telegraaf mee moet foto komen waarom ten tot up 15 welkom 14 welke 02 net lang snap 71 jullie team alles fijn goede 17 nooit 16 daar allen 21 :/ been school tz hij vandaag utrecht nou uw 03 allemaal 100 ter 24 black zonder want zeg gaat gaan keren 23 gelukkig dank doe niemand word hè alle nieuw klaar tijd gedaan jaar water geniet had tegen vvd gij auto 99 ha

In [137]:
for token in lexicon:
    if len(token) < 2: print(token)

☀
่
¥
¿
⚽
💤
🌞
▶
🏆
»
⁦
👩
🌺
🤫
🖕


In [41]:
for token in list(skipTokens.keys())[:20]:
    print(skipTokens[token],token)

0.26370010787486514 rt
0.28060740272065804 1
0.1268478107841757 omg
0.19753086419753085 😂
0.1472161680206063 _
0.1305528134254689 ️
0.22074074074074074 2
0.3265228906545592 ja
0.2545779365788298 ameen
0.10779111979893885 😭
0.2482614742698192 me
0.10048201705598814 hahaha
0.1424706943192065 u
0.11897779639715124 ❤
0.10287664553876158 i
0.19045120671563484 3
0.0712979890310786 c
0.3823088455772114 weer
0.11793611793611794 haha
0.12236503856041131 a


In [None]:
outFile = open("tmp.txt","w")
print("track=",end="",file=outFile)
print(",".join([lexicon[i] for i in range(0,len(lexicon)) if i % 4 == 0 or i % 4 == 3]),file=outFile)
print("track=",end="",file=outFile)
print(",".join([lexicon[i] for i in range(0,len(lexicon)) if i % 4 == 1 or i % 4 == 2]),file=outFile)
outFile.close()
print(" ".join([lexicon[i] for i in range(0,len(lexicon)) if i % 4 == 0 or i % 4 == 3]))
print(" ".join([lexicon[i] for i in range(0,len(lexicon)) if i % 4 == 1 or i % 4 == 2]))

In [None]:
tweetsCopy = tweets
for i in range(0,len(lexicon)):
    if i % 100 == 0: print(i,len(tweetsCopy))
    toBeDeleted = []
    for text in tweetsCopy.keys():
        tokens = text.split()
        if len(set([lexicon[i]]).intersection(tokens)) > 0:
            toBeDeleted.append(text)
    for text in toBeDeleted:
        del(tweetsCopy[text])

In [None]:
print(i,len(tweetsCopy))

In [None]:
print(" ".join([str(processedTweetCounts[i]) for i in range(0,800,100)]))

In [None]:
sortedFrequencies = {k:frequencies[k] for k in sorted(frequencies.keys(),key=lambda k:frequencies[k],reverse=True)}
topTokens800 = [k for k in sortedFrequencies.keys() if re.search(r"[a-z0-9]",k) or \
                                                       (len(k) > 1 and k != ".." and k != "..." or
                                                       (len(k) == 1 and ord(k) >= 127000 and ord(k) < 130000))][0:100]

In [None]:
print(",".join(topTokens100))
print(",".join(topTokens200))
print(",".join(topTokens300))
print(",".join(topTokens400))
print(",".join(topTokens500))
print(",".join(topTokens600))
print(",".join(topTokens700))
print(",".join(topTokens800))

In [None]:
for token in topTokens100+topTokens200+topTokens300+topTokens400: print(token,end=",")
print()

## Token frequencies

In [90]:
THRESHOLDTEXTCAT = 0.99
MAXLEXICONTEXTCAT = 800

tweetsTextCat = dict(tweetsLangOrg)

tokenFreqs = {}
for text in tweetsTextCat.keys():
    if tweetsTextCat[text] == DUTCH:
        for token in text.split():
            if token in tokenFreqs: tokenFreqs[token] += 1
            else: tokenFreqs[token] = 1

#discarded = []
#selected = []
for token in sorted(tokenFreqs.keys(),key=lambda k:tokenFreqs[k],reverse=True)[807+len(discarded):]:
    coverageDutchScore = coverageDutch(token,tweetsTextCat)
    if coverageDutchScore < THRESHOLDTEXTCAT: 
        print("          ",end="")
        discarded.append(token)
    else:
        selected.append(token)
    print(len(selected),tokenFreqs[token],coverageDutch(token,tweetsTextCat),token)
    if len(selected) >= MAXLEXICONTEXTCAT: break

794 479 0.9979166666666667 jarige
795 479 0.9979166666666667 stukje
796 479 0.9958419958419958 beschikbaar
797 478 1.0 geschreven
798 477 1.0 lijst
799 477 1.0 sluiten
800 476 0.9937369519832986 waarvan


In [91]:
TOKENSIGNORE = "‘ … € 🍀 – ° blm corona covid én één etc euro z'n 1,5".split()

toBeDeleted = []
for i in range(len(selected)-1,-1,-1):
    if selected[i] in TOKENSIGNORE: toBeDeleted.append(i)
for i in toBeDeleted: del(selected[i])

In [92]:
for token in selected:
    if len(token) <= 2: print(token)

de
en
op
te
er
ze
zo
al
nl
2e


In [93]:
outFile = open("tmp.txt","w")
print("track=",end="",file=outFile)
print(",".join([selected[i] for i in range(0,len(selected)) if i % 4 == 0 or i % 4 == 3]),file=outFile)
print("track=",end="",file=outFile)
print(",".join([selected[i] for i in range(0,len(selected)) if i % 4 == 1 or i % 4 == 2]),file=outFile)
outFile.close()
print(" ".join([selected[i] for i in range(0,len(selected)) if i % 4 == 0 or i % 4 == 3]))
print(" ".join([selected[i] for i in range(0,len(selected)) if i % 4 == 1 or i % 4 == 2]))

de en van op voor zijn er wel ze geen over meer heb door mijn mensen dus wordt veel hier mij jaar heel alleen hun iets maken onze jullie alle zelf altijd via zien denk omdat staat zit zeker steeds iedereen zeggen allemaal helemaal hem hele hebt geld doet eerste zoals zegt lees misschien kijk verder graag kijken grote goede zeg geven gedaan zwarte snel eerst slavernij zelfs minder werd eigenlijk binnen genoeg waren jouw wereld jaren nl denken samen bijna welke soort vaak fijne juist paar achter geweest excuses idee blij dingen ziet mauricedehond groenlinks tijdens werkt dacht op1npo lezen fijn duidelijk soms links vrouw buiten nieuw zouden racismedebat ging thuis zetten witte geval sterkte aantal artikel ligt volgende maanden sinds bedankt aandacht vanuit prima vrouwen slecht zorg mening rutte thierrybaudet partij anderen benieuwd eten boek echte zei ieder geschiedenis minpres zet rest kwam geef belangrijk groep denkt discriminatie zodat trots totaal blijkbaar gisteren heerlijk zoeken e

In [129]:
tweetsTextCatNew = getTweets(filePattern="20200711",dataDir=DATADIRCOVERAGE)

20200711-01.out.gz


In [135]:
for token in selected:
    coverageDutchScore = coverageDutch(token,tweetsTextCatNew)
    if coverageDutchScore < 0.95:
        print(round(coverageDutchScore,3),token)

0.912 vast
0.85 nieuw
0.947 vrouwen
0.917 woord
0.926 zat
0.935 ter
0.895 idd
0.941 zag
0.941 reactie
0.931 gesprek
0.929 wilde
0.929 gevoel
0 congo
0.917 sowieso
0.929 cultuur
0.944 bestaan
0.889 trein
0.938 mond
0.923 waarheid
0.933 macht
0 zjosdekker
0 asscher
0.923 licht
0.909 vandaan
0 ananninga
0 afschaffing
0.857 tip
0 2ekamertweets
0.909 waarvan
