# Coverage tests for collected tweets

Estimate what percentage of Dutch tweets are collected by twiqs.nl

In [1]:
import math
import os
import pandas as pd
import re
import sys
import multiprocessing as mp
from IPython.display import clear_output

In [2]:
# Directory holding the tweet CSV files. File names are appended with plain
# string concatenation (DATADIR+inFileName), so the trailing slash is required.
DATADIR = "/home/erikt/projects/puregome/data/text-202006/"

In [3]:
def squeal(text=None):
    """Clear the current cell output and print `text` unless it is None.

    The output is cleared with wait=True, i.e. only when new output arrives,
    which lets repeated calls act as an in-place progress line.
    """
    clear_output(wait=True)
    if text is not None:
        print(text)

## Estimate coverage with reply ids

In [None]:
# CSV column names read by getReplyPercentage(): the id column of a tweet and
# the column with the id of the tweet it replies to.
IDSTR = "id_str"
INREPLYTOSTATUSIDSTR = "in_reply_to_status_id_str"

def getReplyPercentage(filePattern,filePatternReference):
    """Count how many reply targets of the target files occur in the reference files.

    Args:
        filePattern: regular expression selecting the target files in DATADIR;
            their in_reply_to_status_id_str values are the ids to look for.
        filePatternReference: regular expression selecting the files in
            DATADIR whose id_str values are searched.

    Returns:
        A tuple (found, targets, counter): the number of distinct reply
        target ids seen in the reference files, the number of distinct reply
        target ids, and a running total over the target files.
    """
    fileNames = sorted(os.listdir(DATADIR))
    replyTargetSeen = {}
    total = 0
    for fileName in fileNames:
        if not re.search(filePattern,fileName):
            continue
        frame = pd.read_csv(DATADIR+fileName,dtype={INREPLYTOSTATUSIDSTR:object})
        # NOTE(review): the total grows by one per file on top of the row
        # count, so it is rows + files — confirm this is intended.
        total += 1
        total += len(frame)
        for replyId in frame[INREPLYTOSTATUSIDSTR]:
            # Missing reply ids are read as non-string values; skip them.
            if type(replyId) is str:
                replyTargetSeen[replyId] = False
    for fileName in fileNames:
        if not re.search(filePatternReference,fileName):
            continue
        frame = pd.read_csv(DATADIR+fileName,dtype={IDSTR:object})
        for tweetId in frame[IDSTR]:
            if tweetId in replyTargetSeen:
                replyTargetSeen[tweetId] = True
        squeal(fileName)
    squeal("")
    nbrOfSeen = sum(1 for wasSeen in replyTargetSeen.values() if wasSeen)
    return(nbrOfSeen,len(replyTargetSeen),total)

In [None]:
# Report which share of the tweets replied to on the target date is present
# in the reference files.
filePattern = "20200611"
# NOTE(review): filePatternReference is not assigned in the visible cells;
# it must be defined before this cell runs.
lenPart,lenAll,counter = getReplyPercentage(filePattern,filePatternReference)
percentage = round(100*lenPart/lenAll,1)
summaryTemplate = "target date: {0}; reference: {1}; percentage: {2}%; total count: {3}"
print(summaryTemplate.format(filePattern,filePatternReference,percentage,counter))

# Query token language coverage

How many of the tweets containing a query token are written in Dutch?

In [4]:
# Directory scanned by coverage() for language-labelled tweet files.
DATADIRCOVERAGE = "./"
# Regular expression selecting the coverage files by name: prefix 20200701-
# followed by 00-05, 07-09, 10-17, 21 or 22.
# NOTE(review): the two digits look like the hour of the day — confirm.
FILEPATTERNCOVERAGE = "20200701-0[0-57-9]|20200701-1[0-7]|20200701-2[12]"
# CSV column names for the tweet text and its language label.
TEXT = "text"
LANG = "lang"
# Language label values compared against the LANG column, and summary keys.
DUTCH = "dutch"
OTHER = "other"
UNKNOWN = "unknown"

def coverage(token):
    """Count, per language label, the tweets whose text contains `token`.

    Scans the gzip-compressed CSV files in DATADIRCOVERAGE whose names match
    FILEPATTERNCOVERAGE. `token` is inserted between word boundaries in a
    regular expression, so regex metacharacters in it keep their meaning.

    Args:
        token: string to search for as a whole word in the TEXT column.

    Returns:
        A dict mapping each value of the LANG column to the number of
        matching rows. Empty when nothing matches or when `token` does not
        form a valid regular expression.
    """
    try:
        # Compile once instead of rebuilding the pattern for every row.
        tokenPattern = re.compile(r'\b'+token+r'\b')
    except re.error:
        # Best effort, as before: an unusable token simply has no coverage.
        return({})
    langs = {}
    for inFileName in sorted(os.listdir(DATADIRCOVERAGE)):
        if not re.search(FILEPATTERNCOVERAGE,inFileName):
            continue
        # Read from the directory that was listed, not from the working directory.
        df = pd.read_csv(os.path.join(DATADIRCOVERAGE,inFileName),compression="gzip")
        if TEXT not in df.columns or LANG not in df.columns:
            # Files without the needed columns contribute nothing.
            continue
        for text,lang in zip(df[TEXT],df[LANG]):
            # Empty text cells are read as NaN (a float); skip non-strings.
            if isinstance(text,str) and tokenPattern.search(text):
                langs[lang] = langs.get(lang,0)+1
    return(langs)

def coverageDutchOld(token):
    """Return the Dutch share of the tweets containing `token`, via coverage().

    Tweets labelled UNKNOWN are left out of the ratio. Returns 0 when no
    Dutch tweet contains the token, otherwise dutch/(dutch+other).
    """
    nbrOfDutch = 0
    nbrOfOther = 0
    for lang,count in coverage(token).items():
        if lang == DUTCH:
            nbrOfDutch += count
        elif lang != UNKNOWN:
            nbrOfOther += count
    if nbrOfDutch == 0:
        return(0)
    return(nbrOfDutch/(nbrOfDutch+nbrOfOther))
    
def coverageDutch(token,tweetsLang):
    """Return the Dutch share of the tweets in `tweetsLang` containing `token`.

    Args:
        token: token to look for among the whitespace-separated tokens of
            each key of `tweetsLang`.
        tweetsLang: dict mapping a tweet's token string to its language label.

    Returns:
        0 when no Dutch tweet contains the token, otherwise the number of
        Dutch tweets divided by the number of all tweets containing it.
    """
    nbrOfDutch = 0
    nbrOfMatches = 0
    for text,lang in tweetsLang.items():
        if token not in text.split():
            continue
        nbrOfMatches += 1
        if lang == DUTCH:
            nbrOfDutch += 1
    if nbrOfDutch == 0:
        return(0)
    return(nbrOfDutch/nbrOfMatches)

## Find best keywords

In [5]:
from nltk.tokenize import TweetTokenizer

In [6]:
# Tokens that cleanupTokens() removes from tweets.
# First assignment: a list of words, split on whitespace.
SKIPTOKENS = "rt amsterdam the woof rosé de en in is van me ben we via nos juno br staysafe spnmarais arcinho kamafotos youtube \
update amp bts_twt blackpink baby and omg best ygofficialblink on ig goal wonderland ten instagram top you wonwoo daniel \
1advancehbdmaheshbabu with open of for vlive ver to netherlands jungkook up joonie jennie from super my kpop klopp gt vlog queen \
nct love by boy bergwijn zoe twitter teen te stop so more master lee hahahaha gogh god ever bruyne an ameen tik this stream slap \
school one oh ni man level ko just im hi here help he duit deluxe day da better beef be art aameen zico yes wtf wtaf winner will \
vn video vc total that team superior street snoop re pubg pink oopsie no new ne namkook moots moon miss maam look link la ka juliet \
jinnie jinkook his hey hahahahaha goals go gemes fan een dus do black back am all aaron zombie yep yeah woop won win weekend war un \
tingz thank tekken taekook sterling star stan sorry snap sex sb19official robin poor pls out onvres onde omfg ok ns now not need \
moonbin mood min mein maap maaf live let legend left laptop ke kangen jimin jan it iq how hot hoodie holy her hee happy hahahahah \
haha ha groningen got gg gente funny ft ff fc engineer eh ee down donde dm damn cute chief can bro big bestie banget at as \
xiaojun gee advancehbdmaheshbabu weareoneexo kang tier straykids kun dream tiktok than kevin jin haechan golden boys actorvijay \
unnie tweets pop only mochi missing like genie don bts alice it's he's i'm let's zone tok she mo made kids howyoulikethat era el \
duo drop angel allah urstrulymahesh winwin exo official dance hours heels has el edit proud".split()
# NOTE(review): this second assignment replaces the word list above, so only
# the punctuation tokens below are skipped — confirm that this is intended.
# In a raw string \" stays a two-character token (backslash plus quote).
SKIPTOKENS = r", . ! ? : ; - ( ) @ # $ / [ ] < >  ' + = | .. ... ` & \" ".split()

In [7]:
# Default file name pattern (regular expression) of getTweets() and getTweetsParallel().
FILEPATTERN = "202006"
# Name of the tweet text column; same value as the TEXT constant defined earlier.
TEXT = "text"
# Initial lexicon tokens; "".split() evaluates to an empty list.
BASELEXICON = "".split()
# NOTE(review): the numbers below are not referenced in the visible cells;
# they look like tweet counts noted down from earlier runs, with older values
# kept in the trailing comments — confirm before relying on them.
ALLCOUNT = 596893 # 5623571 # 22798953 # 22798953
LEFT200 = 61599
LEFT100 = 80258 # 349598 # 1724488 # 1231745
LEFT200100 = 43622 # 179089 # 801171 # 732526
LEFT300200100 = 34583 # 130887 # 557205 # 531315
LEFT400300200100 = 29528 # 109099 # 451231 # 439565
LEFT500400300200100 = 0 # # 394609 # 387447
LEFT600500400300200100 = 0 # # 357003 # 352026
LEFT700600500400300200100 = 0 # # 329054 # 324628
LEFT800700600500400300200100 = 0 # # 306745 # 303422

# Module-level list shared by collectResult() and getTweetsParallel().
results = []

def cleanupText(text):
    """Lowercase `text` and blank out escaped newlines and the # and @ signs.

    The two-character sequence backslash+n (not a real newline) and every
    '#' or '@' are each replaced by a single space.
    """
    lowered = text.lower()
    withoutEscapedNewlines = re.sub(r"\\n"," ",lowered)
    withoutMarkers = re.sub(r"[#@]"," ",withoutEscapedNewlines)
    return(withoutMarkers)

def tokenize(tokenizer,text):
    """Return the distinct tokens of `text`, sorted and joined by single spaces.

    Args:
        tokenizer: object with a tokenize(text) method returning the tokens.
        text: string to tokenize.
    """
    uniqueTokens = set(tokenizer.tokenize(text))
    return(" ".join(sorted(uniqueTokens)))

def cleanupTokens(text):
    """Drop link tokens and SKIPTOKENS from a space-separated token string.

    The distinct tokens of `text` are kept unless they contain "http" or are
    listed in SKIPTOKENS. The result is joined by single spaces; the tokens
    pass through a set, so their order is not defined.
    """
    keptTokens = [token for token in set(text.split())
                  if re.search(r"http",token) is None and token not in SKIPTOKENS]
    return(" ".join(keptTokens))

def getTweetsFile(inFileName,dataDir):
    """Read one tweet CSV file and return its cleaned-up tweets.

    Every text is passed through cleanupText(), tokenize() and
    cleanupTokens(); the resulting token string is the dict key, so tweets
    with the same token string collapse into one entry (the last row wins).

    Args:
        inFileName: name of the CSV file inside `dataDir`.
        dataDir: directory containing the file, with or without a trailing
            path separator.

    Returns:
        A dict mapping each token string to the row's LANG value, or to True
        when the file has no LANG column.
    """
    tweets = {}
    squeal(inFileName)
    tokenizer = TweetTokenizer()
    # os.path.join accepts dataDir with or without a trailing separator.
    df = pd.read_csv(os.path.join(dataDir,inFileName))
    # Decide once per file, not once per row, whether labels are available.
    labels = df[LANG] if LANG in df else [True]*len(df)
    # Iterate over the columns directly: building a row Series with
    # df.iloc[i] for every lookup is far slower on large files.
    for rawText,label in zip(df[TEXT],labels):
        text = cleanupText(rawText)
        text = tokenize(tokenizer,text)
        text = cleanupTokens(text)
        tweets[text] = label
    return(tweets)

def collectResult(result):
    """Append `result` to the module-level `results` list."""
    # Appending mutates the existing list, so no global declaration is needed.
    results.append(result)

def getTweetsParallel(filePattern=FILEPATTERN,dataDir=DATADIR):
    """Read all matching tweet files with a process pool and merge them.

    Args:
        filePattern: regular expression selecting the file names in `dataDir`.
        dataDir: directory containing the tweet CSV files.

    Returns:
        One dict combining the getTweetsFile() results of all matching
        files, taken in sorted file name order; for duplicate keys the value
        of the later file wins. Empty when no file matches.

    Side effects:
        Rebinds the module-level `results` to a one-element list holding the
        merged dict, which is what the earlier implementation left behind.
    """
    global results
    inFileNames = [inFileName for inFileName in sorted(os.listdir(dataDir))
                   if re.search(filePattern,inFileName)]
    tweets = {}
    if inFileNames:
        # pool.apply() blocks until each call has finished, so the files were
        # processed one after another; starmap() distributes them over the
        # workers and returns the results in input order.
        with mp.Pool(mp.cpu_count()) as pool:
            perFileTweets = pool.starmap(getTweetsFile,
                                         [(inFileName,dataDir) for inFileName in inFileNames])
        for fileTweets in perFileTweets:
            tweets.update(fileTweets)
    results = [tweets]
    return(tweets)

def getTweets(filePattern=FILEPATTERN,dataDir=DATADIR):
    """Read all matching tweet files one after another and merge them.

    Args:
        filePattern: regular expression selecting the file names in `dataDir`.
        dataDir: directory containing the tweet CSV files.

    Returns:
        A dict mapping each cleaned-up token string to the tweet's LANG value
        (or True for files without a LANG column). Files are handled in
        sorted name order; for duplicate keys the later file wins.
    """
    tweets = {}
    for inFileName in sorted(os.listdir(dataDir)):
        if re.search(filePattern,inFileName):
            # Per-file reading and cleaning lives in getTweetsFile(), which
            # also reports the file name through squeal().
            tweets.update(getTweetsFile(inFileName,dataDir))
    return(tweets)

In [8]:
# Tweets from the coverage files in DATADIRCOVERAGE; values are the LANG
# column when the files have one (presumably they do — verify), else True.
tweetsLangOrg = getTweets(filePattern=FILEPATTERNCOVERAGE,dataDir=DATADIRCOVERAGE)
# Tweets from the files in DATADIR matching the default FILEPATTERN.
tweetsOrg = getTweetsParallel()

20200630-23.out.gz


In [38]:
# Work on shallow copies: the lexicon loop deletes entries from tweets and
# tweetsLang, and the originals stay available for a rerun.
tweetsLang = dict(tweetsLangOrg)
tweets = dict(tweetsOrg)

In [None]:
# Minimum coverageDutch() score a token needs to enter the lexicon.
THRESHOLD = 0.5
# The loop stops once the lexicon holds this many tokens.
MAXLEXICONSIZE = 100

# Greedy lexicon construction. Each round:
#   1. removes the tweets that contain a lexicon token from tweets/tweetsLang,
#   2. counts token frequencies in the remaining tweets,
#   3. adds the most frequent token whose coverageDutch() score reaches
#      THRESHOLD; tokens that fail are remembered in skipTokens.
lexicon = {token:0 for token in list(BASELEXICON)}
processedTweetCounts = []
tokenCoverage = {}
skipTokens = {}
coverageDutchScore = ""
while len(lexicon) < MAXLEXICONSIZE:
    frequencies = {}
    processedTweetCount = 0
    counter = 0
    toBeDeleted = []
    # Build the lookup set once per round instead of once per tweet.
    lexiconTokens = set(lexicon)
    lastToken = list(lexicon)[-1] if lexicon else ""
    for text in tweets:
        counter += 1
        if counter%10000 == 0: squeal(str(counter)+"/"+str(len(tweets))+" "+str(len(lexicon))+" "+lastToken+" "+str(coverageDutchScore))
        tokens = text.split()
        if not lexiconTokens.isdisjoint(tokens):
            toBeDeleted.append(text)
        else:
            processedTweetCount += 1
            for token in tokens:
                frequencies[token] = frequencies.get(token,0)+1
    # Delete after the scan: a dict must not change size while it is iterated.
    for text in toBeDeleted:
        del(tweets[text])
    toBeDeletedLang = [text for text in tweetsLang if not lexiconTokens.isdisjoint(text.split())]
    for text in toBeDeletedLang:
        del(tweetsLang[text])
    processedTweetCounts.append(processedTweetCount)
    # Most frequent tokens first; the sort is stable, so ties keep their order.
    frequencies = dict(sorted(frequencies.items(),key=lambda item:item[1],reverse=True))
    for token in frequencies:
        if token in skipTokens: continue
        coverageDutchScore = coverageDutch(token,tweetsLang)
        squeal(str(len(lexicon))+" "+str(processedTweetCount)+" "+str(len(skipTokens))+" "+str(frequencies[token])+" "+str(coverageDutchScore)+" "+token)
        if coverageDutchScore >= THRESHOLD:
            lexicon[token] = coverageDutchScore
            break
        skipTokens[token] = coverageDutchScore
    else:
        # No remaining token reaches the threshold, so the lexicon cannot
        # grow any further. Stop here: the summary line below would otherwise
        # fail on a lexicon token that is no longer in frequencies (or on an
        # empty lexicon).
        squeal("no remaining token reaches threshold "+str(THRESHOLD)+"; stopping at "+str(len(lexicon))+" lexicon tokens")
        break
    # token is the entry just added, i.e. the last key of lexicon.
    squeal(str(len(lexicon))+" "+str(processedTweetCount)+" "+str(len(skipTokens))+" "+str(frequencies[token])+" "+token+" "+str(coverageDutchScore))

" ".join(list(lexicon.keys()))

750000/5241310 12 dat 0.8404066073697586


In [35]:
" ".join(list(lexicon.keys()))

'de een ik het je niet van dat voor die zijn maar geen wel ze eens waar worden hebben heb heerlijk waarom zien zonder moeten niets gelijk iedereen ziet anders geniet lijkt eindelijk ingeborgvraagt weten vester zij mogelijk elkaar volgende zeggen 000zit zitten vooral gewenst airbnbird eigenlijk beterschap geleden houden geweest gecondoleerd vreselijk alweer weinig omdat dingen ziek hoeveel wachten'

In [None]:
# Show the first 20 rejected tokens with their coverage scores.
for skippedToken,score in list(skipTokens.items())[:20]:
    print(score,skippedToken)

In [None]:
# Split the lexicon tokens over two "track=" lists and write them to tmp.txt:
# positions 0 and 3 of every group of four go to the first list, positions
# 1 and 2 to the second. Both lists are also printed space-separated.
# lexicon is a dict keyed by token, so lexicon[i] with an integer index
# raises KeyError; index the ordered list of its keys instead.
orderedTokens = list(lexicon)
firstTrack = [token for i,token in enumerate(orderedTokens) if i % 4 == 0 or i % 4 == 3]
secondTrack = [token for i,token in enumerate(orderedTokens) if i % 4 == 1 or i % 4 == 2]
# The with statement closes the file even when writing fails.
with open("tmp.txt","w") as outFile:
    print("track=",end="",file=outFile)
    print(",".join(firstTrack),file=outFile)
    print("track=",end="",file=outFile)
    print(",".join(secondTrack),file=outFile)
print(" ".join(firstTrack))
print(" ".join(secondTrack))

In [None]:
# Remove, token by token, the tweets that contain a lexicon token and report
# every 100 tokens how many tweets are left.
# dict(tweets) makes a real (shallow) copy; a plain assignment would only
# alias tweets, and the deletions below would then empty tweets as well.
tweetsCopy = dict(tweets)
# lexicon is a dict keyed by token, so iterate over its keys; i stays
# available after the loop for the progress print in the next cell.
for i,lexiconToken in enumerate(lexicon):
    if i % 100 == 0: print(i,len(tweetsCopy))
    toBeDeleted = [text for text in tweetsCopy if lexiconToken in text.split()]
    for text in toBeDeleted:
        del(tweetsCopy[text])

In [None]:
# Last lexicon index handled by the removal loop and the number of tweets
# left in tweetsCopy.
print(i,len(tweetsCopy))

In [None]:
# Print the processed-tweet count of every 100th lexicon round (rounds 0,
# 100, ..., 700). Slicing simply stops at the end of the list, whereas fixed
# indices raise IndexError when fewer than 701 rounds were run.
print(" ".join([str(count) for count in processedTweetCounts[0:800:100]]))

In [None]:
def _isTopTokenCandidate(token):
    """Tell whether a token qualifies for the top-token list."""
    # Anything containing a lowercase letter or a digit qualifies.
    if re.search(r"[a-z0-9]",token):
        return(True)
    # Longer tokens qualify unless they are the ellipsis tokens.
    if len(token) > 1 and token != ".." and token != "...":
        return(True)
    # Single characters qualify only in this code point range.
    # NOTE(review): presumably the emoji/pictograph range — confirm.
    return(len(token) == 1 and 127000 <= ord(token) < 130000)

# Tokens of the last lexicon round ordered by descending frequency.
sortedFrequencies = dict(sorted(frequencies.items(),key=lambda item:item[1],reverse=True))
# First 100 qualifying tokens.
topTokens800 = [token for token in sortedFrequencies.keys() if _isTopTokenCandidate(token)][0:100]

In [None]:
# Print each top-token list as one comma-separated line.
# NOTE(review): only topTokens800 is assigned in the visible cells; the other
# lists presumably come from earlier runs with different settings — confirm
# they exist before running this cell.
print(",".join(topTokens100))
print(",".join(topTokens200))
print(",".join(topTokens300))
print(",".join(topTokens400))
print(",".join(topTokens500))
print(",".join(topTokens600))
print(",".join(topTokens700))
print(",".join(topTokens800))

In [None]:
# Print the first four top-token lists on one line, each token followed by a comma.
combinedTopTokens = topTokens100+topTokens200+topTokens300+topTokens400
print("".join(f"{token}," for token in combinedTopTokens))