# Fasttext tweet classification

## Data file creation

In [1]:
import datetime
import os
import pipes
import random
import re
import sys
import tempfile

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from nltk.tokenize import TweetTokenizer

In [2]:
# Directory with the input files
DATADIR = "../data/"

# Topic selection: swap the topic by (un)commenting the second assignment
DISTANCE = "distance"
FACEMASK = "mondkapje"
TOPIC = DISTANCE
#TOPIC = FACEMASK

# File names are derived from the topic; the facemask tweet file has a different suffix
FILETWEETS = TOPIC+"-tweets+nunl.csv" if TOPIC == FACEMASK else TOPIC+"-tweets.csv"
FILEANNOTATIONS = FILETWEETS+".human-labels.txt"
FILEFASTTEXT = "fasttext-"+TOPIC+".csv"

# Prefix put in front of every label in the fasttext data file
LABELPREFIX = "__label__"
# Upper bound for the random numbers in temporary file names
LARGEINT = 9999999999

# Keys of the per-tweet records and name of the tweet id column
LABEL = "label"
TEXT = "text"
USER = "user"
IDSTR = "id_str"

# Labels found in the annotation file and the labels they are mapped to
IRRELEVANT = "IRRELEVANT"
NEUTRAL = "NEUTRAL"
ANDERS = "ANDERS"
EENS = "EENS"
ONEENS = "ONEENS"
SUPPORTS = "SUPPORTS"
REJECTS = "REJECTS"

In [3]:
# Tweets are indexed by their id; the annotation file is space-separated and has no header line
tweets = pd.read_csv(DATADIR+FILETWEETS,index_col=IDSTR)
annotations = pd.read_csv(DATADIR+FILEANNOTATIONS,sep=" ",header=None)
# The annotator of the first annotation line is used as the main annotator
mainAnnotator = annotations.iloc[0,0]

In [4]:
def cleanup(text):
    """Normalize a text: replace literal backslash-n sequences by a space,
    remove https:// links, collapse whitespace runs to one space and strip
    leading and trailing whitespace."""
    substitutions = [(r"\\n"," "),(r"https://\S+",""),(r"\s+"," ")]
    for pattern,replacement in substitutions:
        text = re.sub(pattern,replacement,text)
    return(text.strip())

def tokenize(text):
    """Split text with the nltk TweetTokenizer and join the tokens with single spaces."""
    tokens = TweetTokenizer().tokenize(text)
    return(" ".join(tokens))

def preprocess(text):
    """Clean up, tokenize and lowercase text (in that order)."""
    cleanedText = cleanup(text)
    tokenizedText = tokenize(cleanedText)
    return(tokenizedText.lower())

In [5]:
# Collect label, user and preprocessed text of every tweet labeled by the
# main annotator; a later annotation of the same tweet replaces an earlier one
fasttextData = {}
# Build the id lookup once: a set makes the membership test in the loop O(1)
knownTweetIds = set(tweets.index)
# Annotation labels that are merged into one of the three target labels
labelMapping = {NEUTRAL:IRRELEVANT,ANDERS:IRRELEVANT,EENS:SUPPORTS,ONEENS:REJECTS}
for i in range(0,len(annotations)):
    annotation = annotations.iloc[i]
    annotator = annotation[0]
    tweetId = annotation[2]
    if annotator == mainAnnotator and tweetId in knownTweetIds:
        tweet = tweets.loc[tweetId]
        # the user is taken from the second column of the tweet file;
        # iloc makes the positional access explicit
        tweetUser = tweet.iloc[1]
        tweetLabel = annotation[4]
        tweetLabel = labelMapping.get(tweetLabel,tweetLabel)
        fasttextData[tweetId] = {LABEL:LABELPREFIX+tweetLabel,
                                 USER:tweetUser,
                                 TEXT:preprocess(tweet[TEXT])}

In [6]:
# Number of collected tweets, of tweets in the tweet file and of annotation lines
len(fasttextData),len(tweets),len(annotations)

(5977, 6835, 6726)

In [7]:
# Write one "label text" line per tweet; a text that was seen before is
# skipped so every text occurs only once in the file
seenTexts = set()
# the with statement closes the file, also when writing fails
with open(FILEFASTTEXT,"w") as outFile:
    for tweetId in fasttextData:
        text = cleanup(fasttextData[tweetId][TEXT])
        if text not in seenTexts:
            print(fasttextData[tweetId][LABEL],text,file=outFile)
            seenTexts.add(text)

In [8]:
# Annotation speed of the main annotator
annotations1 = annotations[annotations[0]==mainAnnotator]
# The first 12 characters of column 1 are used as the annotation minute
# (presumably a YYYYMMDDhhmm time stamp prefix -- TODO confirm)
nbrOfAnnotationMinutes = len(set([str(x)[:12] for x in annotations1[1]]))
# Tweet ids are compared in full: truncating them could merge different tweets
nbrOfAnnotatedTweets = len(set(annotations1[2]))
tweetsPerMinute = nbrOfAnnotatedTweets/nbrOfAnnotationMinutes
print("tweets annotated per minute:",round(tweetsPerMinute,1),"; 1000 tweets take:",\
      round(1000/tweetsPerMinute),"minutes")

tweets annotated per minute: 5.3 ; 1000 tweets take: 189 minutes


In [9]:
# Annotation speed of all annotators other than the main annotator
annotations2 = annotations[annotations[0]!=mainAnnotator]
# The first 12 characters of column 1 are used as the annotation minute
# (presumably a YYYYMMDDhhmm time stamp prefix -- TODO confirm)
nbrOfAnnotationMinutes = len(set([str(x)[:12] for x in annotations2[1]]))
# Tweet ids are compared in full: truncating them could merge different tweets
nbrOfAnnotatedTweets = len(set(annotations2[2]))
if nbrOfAnnotatedTweets == 0:
    # without a second annotator the speed is undefined (division by zero)
    print("no annotations by other annotators found")
else:
    tweetsPerMinute = nbrOfAnnotatedTweets/nbrOfAnnotationMinutes
    print("tweets annotated per minute:",round(tweetsPerMinute,1),"; 1000 tweets take:",\
          round(1000/tweetsPerMinute),"minutes")

tweets annotated per minute: 3.0 ; 1000 tweets take: 329 minutes


## Fasttext run and evaluation

In [10]:
import fasttext
import random
from IPython.display import clear_output

In [11]:
# Directory and file name of pretrained Dutch Wikipedia word vectors
# NOTE(review): neither name is referenced in the visible part of this file
PRETRAINEDDIR = "/home/erikt/projects/newsgac/fasttext-runs/"
WIKIFILENAME = "wiki.nl.vec"

In [12]:
def print_results(N, p, r):
    """Print the number of cases N, the precision p and the recall r, one
    tab-separated name/value pair per line; p and r are shown with three decimals."""
    lines = ["N\t" + str(N),
             "P@{}\t{:.3f}".format(1, p),
             "R@{}\t{:.3f}".format(1, r)]
    print("\n".join(lines))

In [13]:
# Read the fasttext data file: from here on fasttextData is a list of
# stripped "label text" lines (it was a dictionary in the previous section)
# the with statement closes the file, also when reading fails
with open(FILEFASTTEXT,"r") as inFile:
    fasttextData = [line.strip() for line in inFile]
len(fasttextData)

5731

In [14]:
# Default values of the runFasttext arguments dim, epoch, lr and n
DIM,EPOCH,LR,N = 100,100,0.05,10
# Names of the temporary data files: a fixed prefix followed by a random number
TRAIN,TEST,VALIDATION = ["{0}{1}".format(prefix,int(random.random()*LARGEINT))
                         for prefix in ("TRAIN","TEST","VALIDATION")]

In [15]:
def ranSelect(inList,size):
    """Return a list of at most size items drawn at random from inList
    without replacement; inList itself is not changed."""
    pool = list(inList)
    picked = []
    while pool and len(picked) < size:
        position = int(random.random()*len(pool))
        picked.append(pool.pop(position))
    return(picked)

In [16]:
def runFasttext(fasttextData,dim=DIM,epoch=EPOCH,lr=LR,n=N,squealFlag=True,maxTrain=None):
    """Run an n-fold cross-validation experiment with fasttext.

    The data are divided in n consecutive slices. In fold i, slice i is the
    validation set, the next slice (slice 0 for the last fold) is the test
    set and all other items are training data.

    Arguments:
    fasttextData: list of data lines, each written to the data files as is
    dim, epoch, lr: passed on to fasttext.train_supervised
    n: number of folds
    squealFlag: if True, show progress messages
    maxTrain: if not None, the maximum number of training items per fold;
              a random selection is made when more items are available

    Note: the default values are taken from DIM, EPOCH, LR and N at the
    moment the function is defined.

    Returns the tuple (predictionCountsValidation, predictionLabelsValidation,
    predictionCountsTest, predictionLabelsTest). Each element is a list with
    one entry per data slice, in the order of fasttextData: the counts
    entries are the results of model.test as a list, the labels entries are
    the results of model.predict.
    """
    nbrOfItems = len(fasttextData)
    predictionCountsValidation = []
    predictionLabelsValidation = []
    # The test set of fold i is slice i+1: test results are stored by slice
    # number so that concatenating them follows the order of fasttextData,
    # as required by makeConfusionMatrix which pairs them with gold labels
    predictionCountsTest = [None]*n
    predictionLabelsTest = [None]*n
    for fold in range(0,n):
        if squealFlag:
            clear_output(wait=True)
            print("starting fold",fold)
        nextFold = (fold+1)%n
        validationStart = round(fold*nbrOfItems/n)
        validationEnd = round((fold+1)*nbrOfItems/n)
        testStart = round(nextFold*nbrOfItems/n)
        testEnd = round((nextFold+1)*nbrOfItems/n)
        trainData = []
        validationData = []
        testData = []
        for i,data in enumerate(fasttextData):
            if testStart <= i < testEnd: testData.append(data)
            elif validationStart <= i < validationEnd: validationData.append(data)
            else: trainData.append(data)
        if maxTrain is not None and maxTrain < len(trainData):
            trainData = ranSelect(trainData,maxTrain)
        try:
            for fileName,fileData in [(TRAIN,trainData),(TEST,testData),(VALIDATION,validationData)]:
                with open(fileName,"w") as outFile:
                    for data in fileData: print(data,file=outFile)
            model = fasttext.train_supervised(TRAIN,dim=dim,epoch=epoch,lr=lr)
            predictionCountsValidation.append(list(model.test(VALIDATION)))
            predictionLabelsValidation.append(model.predict(validationData))
            predictionCountsTest[nextFold] = list(model.test(TEST))
            predictionLabelsTest[nextFold] = model.predict(testData)
        finally:
            # remove the temporary data files, also when training or testing failed
            for fileName in [TRAIN,TEST,VALIDATION]:
                if os.path.exists(fileName): os.unlink(fileName)
    if squealFlag:
        clear_output(wait=True)
        print("finished")
    return(predictionCountsValidation,predictionLabelsValidation,predictionCountsTest,predictionLabelsTest)

In [17]:
def makeConfusionMatrix(fasttextData,predictionLabels):
    """Cross-tabulate gold labels (rows) against predicted labels (columns).

    The gold label is the first token of each line in fasttextData. The
    predicted label is the first label of each item in the first element of
    every entry of predictionLabels; gold and predicted labels are paired
    by position.
    """
    goldLabels = pd.Series([line.split()[0] for line in fasttextData])
    firstLabels = []
    for foldPrediction in predictionLabels:
        for itemLabels in foldPrediction[0]:
            firstLabels.append(itemLabels[0])
    predictedLabels = pd.Series(firstLabels)
    return(pd.crosstab(goldLabels,predictedLabels))

In [18]:
def evaluate(predictionCounts,predictionLabels,fasttextData,printResults=True):
    """Summarize the results of a cross-validation experiment.

    predictionCounts holds one (cases, precision, recall) triple per fold;
    precision and recall are averaged over the folds, weighted by the number
    of cases. The factor is computed from the confusion matrix as
    (gold SUPPORTS * predicted REJECTS) / (gold REJECTS * predicted SUPPORTS).

    Prints the scores when printResults is True and returns the tuple
    (precision, factor).
    """
    caseTotal = sum(counts[0] for counts in predictionCounts)
    pTotal = sum(counts[0]*counts[1] for counts in predictionCounts)
    rTotal = sum(counts[0]*counts[2] for counts in predictionCounts)
    precision = round(pTotal/caseTotal,3)
    recall = round(rTotal/caseTotal,3)
    cf = makeConfusionMatrix(fasttextData,predictionLabels)
    nbrOfRows = len(cf)
    # add a column of zeros for every label that was never predicted
    for label in ['__label__REJECTS','__label__SUPPORTS','__label__IRRELEVANT']:
        if label not in cf: cf[label] = [0]*nbrOfRows
    # row sums are gold label counts, column sums are predicted label counts
    goldSupports = sum(cf.loc['__label__SUPPORTS'])
    predictedRejects = sum(cf['__label__REJECTS'])
    goldRejects = sum(cf.loc['__label__REJECTS'])
    predictedSupports = sum(cf['__label__SUPPORTS'])
    factor = goldSupports*predictedRejects/(goldRejects*predictedSupports)
    if printResults:
        print("cases: {0}; precision: {1}; recall: {2}; factor: {3:0.3f}".format(caseTotal,precision,recall,factor))
    return((precision,factor))

## Grid search

In [19]:
# Grid search: run the cross-validation experiment for every combination of
# dimension, number of epochs and learning rate, and store the validation
# and test scores under the key "dim,epoch,lr"
scoresValidation = {}
scoresTest = {}
for dim in [10,20,50,100,200,300]:
    for epoch in [10,20,50,100,200,300]:
        for lr in [0.05,0.1,0.2]:
            foldResults = runFasttext(fasttextData,dim=dim,epoch=epoch,lr=lr,squealFlag=False)
            predictionCountsValidation,predictionLabelsValidation,predictionCountsTest,predictionLabelsTest = foldResults
            print(dim,epoch,lr,end=" ")
            key = ",".join(str(value) for value in (dim,epoch,lr))
            scoresValidation[key] = evaluate(predictionCountsValidation,predictionLabelsValidation,fasttextData)
            scoresTest[key] = evaluate(predictionCountsTest,predictionLabelsTest,fasttextData)

10 10 0.05 cases: 5731; precision: 0.62; recall: 0.62; factor: 0.454
cases: 5731; precision: 0.618; recall: 0.618; factor: 0.436
10 10 0.1 cases: 5731; precision: 0.641; recall: 0.641; factor: 0.623
cases: 5731; precision: 0.648; recall: 0.648; factor: 0.597
10 10 0.2 cases: 5731; precision: 0.65; recall: 0.65; factor: 0.716
cases: 5731; precision: 0.651; recall: 0.651; factor: 0.690
10 20 0.05 cases: 5731; precision: 0.642; recall: 0.642; factor: 0.632
cases: 5731; precision: 0.648; recall: 0.648; factor: 0.607
10 20 0.1 cases: 5731; precision: 0.644; recall: 0.644; factor: 0.785
cases: 5731; precision: 0.648; recall: 0.648; factor: 0.762
10 20 0.2 cases: 5731; precision: 0.638; recall: 0.638; factor: 0.832
cases: 5731; precision: 0.639; recall: 0.639; factor: 0.805
10 50 0.05 cases: 5731; precision: 0.638; recall: 0.638; factor: 0.795
cases: 5731; precision: 0.64; recall: 0.64; factor: 0.776
10 50 0.1 cases: 5731; precision: 0.631; recall: 0.631; factor: 0.832
cases: 5731; precision:

In [20]:
# The ten parameter combinations with the highest validation precision
[(score,k) for k,score in sorted(scoresValidation.items(),
                                 key=lambda item:item[1][0],reverse=True)][0:10]

[((0.652, 0.7659559153777942), '300,10,0.2'),
 ((0.651, 0.7515804619841587), '50,10,0.2'),
 ((0.65, 0.7162866025512842), '10,10,0.2'),
 ((0.65, 0.7884518650363721), '200,10,0.2'),
 ((0.649, 0.7352314707919649), '100,10,0.2'),
 ((0.648, 0.7303549676437), '20,10,0.2'),
 ((0.648, 0.7835739239468164), '200,20,0.1'),
 ((0.648, 0.7647916424164823), '300,20,0.1'),
 ((0.646, 0.7433294529863305), '20,20,0.1'),
 ((0.646, 0.7582149948329715), '50,20,0.1')]

In [21]:
# All parameter combinations ordered by test precision, highest first
[(score,k) for k,score in sorted(scoresTest.items(),
                                 key=lambda item:item[1][0],reverse=True)]

[((0.655, 0.7216019305818581), '50,10,0.2'),
 ((0.654, 0.7142148418528412), '300,10,0.2'),
 ((0.652, 0.7115461939664086), '100,10,0.2'),
 ((0.652, 0.743293548758491), '200,10,0.2'),
 ((0.651, 0.6899390747469893), '10,10,0.2'),
 ((0.65, 0.613217564954619), '200,10,0.1'),
 ((0.649, 0.6276167644162849), '20,10,0.1'),
 ((0.649, 0.602163551652118), '20,20,0.05'),
 ((0.649, 0.7180754984452168), '20,20,0.1'),
 ((0.649, 0.5979183141336639), '50,20,0.05'),
 ((0.649, 0.6061398649080565), '100,20,0.05'),
 ((0.649, 0.7340539688156245), '300,20,0.1'),
 ((0.648, 0.5973590225895589), '10,10,0.1'),
 ((0.648, 0.6071934543179424), '10,20,0.05'),
 ((0.648, 0.7616283768182868), '10,20,0.1'),
 ((0.648, 0.600145536352102), '50,10,0.1'),
 ((0.648, 0.7363818355243421), '50,20,0.1'),
 ((0.647, 0.6526908915421463), '200,20,0.05'),
 ((0.646, 0.7003014605177273), '20,10,0.2'),
 ((0.646, 0.6972606016533986), '100,20,0.1'),
 ((0.646, 0.7272546099345938), '200,20,0.1'),
 ((0.646, 0.6334301782675035), '300,10,0.1'),


Facemasks: best validation accuracy 0.56 for '300,20,0.2', associated test accuracy 0.55 (15th score).

Social distancing: best validation accuracy 0.65 for '300,10,0.2', associated test accuracy 0.65 (2nd score).

In [22]:
# The ten parameter combinations with the lowest validation factor
[(score,k) for k,score in sorted(scoresValidation.items(),
                                 key=lambda item:item[1][1])][0:10]

[((0.62, 0.4537883439426595), '10,10,0.05'),
 ((0.616, 0.46079177769318613), '20,10,0.05'),
 ((0.616, 0.4874906538931474), '50,10,0.05'),
 ((0.607, 0.5205476475876536), '100,10,0.05'),
 ((0.603, 0.5882633630944304), '200,10,0.05'),
 ((0.601, 0.6044297025465439), '300,10,0.05'),
 ((0.638, 0.6216561290528942), '200,10,0.1'),
 ((0.641, 0.6230211409586551), '10,10,0.1'),
 ((0.642, 0.6312127182773815), '50,20,0.05'),
 ((0.642, 0.6316999596338683), '10,20,0.05')]

In [23]:
# The ten parameter combinations with the lowest test factor
[(score,k) for k,score in sorted(scoresTest.items(),
                                 key=lambda item:item[1][1])][0:10]

[((0.614, 0.42053698136816564), '20,10,0.05'),
 ((0.618, 0.4358626450605039), '10,10,0.05'),
 ((0.612, 0.44943180964119833), '50,10,0.05'),
 ((0.605, 0.4810708851974767), '100,10,0.05'),
 ((0.603, 0.5446905350679139), '200,10,0.05'),
 ((0.603, 0.5596535634368939), '300,10,0.05'),
 ((0.648, 0.5973590225895589), '10,10,0.1'),
 ((0.649, 0.5979183141336639), '50,20,0.05'),
 ((0.648, 0.600145536352102), '50,10,0.1'),
 ((0.649, 0.602163551652118), '20,20,0.05')]

In [None]:
# Repeat the experiment N times for a few parameter combinations
N = 5
scoresN = {}
triplets = [(20,300,0.2),(50,200,0.2),(300,200,0.2),(20,200,0.05),(10,300,0.2)]

for triplet in triplets:
    dim,epoch,lr = triplet
    key = " ".join([str(dim),str(epoch),str(lr)])
    scoresN[key] = []
    for i in range(0,N):
        # runFasttext returns validation results followed by test results;
        # parameter combinations are compared on the validation results
        predictionCounts,predictionLabels,_,_ = runFasttext(fasttextData,dim=dim,epoch=epoch,lr=lr,squealFlag=False)
        print(dim,epoch,lr,end=" ")
        scoresN[key].append(evaluate(predictionCounts,predictionLabels,fasttextData))

In [None]:
# Average factor (second score) per parameter combination, highest first
scoresNaverage = {key:round(np.average([score[1] for score in scoresN[key]]),3) for key in scoresN}
dict(sorted(scoresNaverage.items(),key=lambda item:item[1],reverse=True))

In [None]:
# Extra run for one parameter combination outside the grid; the scores are
# added to the grid search results (requires the grid search cell to be run)
for dim in [10]:
    for epoch in [900]:
        for lr in [0.2]:
            # runFasttext returns validation results followed by test results
            predictionCountsValidation,predictionLabelsValidation,predictionCountsTest,predictionLabelsTest = \
                runFasttext(fasttextData,dim=dim,epoch=epoch,lr=lr,squealFlag=False)
            print(dim,epoch,lr,end=" ")
            key = ",".join([str(dim),str(epoch),str(lr)])
            scoresValidation[key] = evaluate(predictionCountsValidation,predictionLabelsValidation,fasttextData)
            scoresTest[key] = evaluate(predictionCountsTest,predictionLabelsTest,fasttextData)

## Run with best parameters

In [None]:
# Select the hyperparameters for the chosen topic; stop when the topic is unknown
if TOPIC == DISTANCE:
    # social distancing
    BESTDIM = 20
    BESTEPOCH = 300
    # NOTE(review): 0.9 is outside the learning rates tried in the grid
    # search (0.05, 0.1, 0.2) -- confirm that this is not a typo for 0.2
    BESTLR = 0.9
elif TOPIC == FACEMASK:
    # facemasks
    BESTDIM = 10
    BESTEPOCH = 900
    BESTLR = 0.2
else:
    print("unknown topic!",file=sys.stderr)
    sys.exit(1)

In [None]:
# runFasttext returns validation results followed by test results: the
# parameters were chosen on the validation data so the test results are kept
_,_,predictionCounts,predictionLabels = runFasttext(fasttextData,dim=BESTDIM,epoch=BESTEPOCH,lr=BESTLR)

In [None]:
# Print and show the scores (precision, factor) of the run with the best parameters
evaluate(predictionCounts,predictionLabels,fasttextData)

In [None]:
# Count the gold labels: the label is the first token of each data line
labelCountsGold = {}
for line in fasttextData:
    label = line.split()[0]
    labelCountsGold[label] = labelCountsGold.get(label,0)+1
labelCountsGold

In [None]:
# Baseline: the fraction of the data lines that carry the most frequent gold label
largestLabelCount = max(labelCountsGold.values())
nbrOfGoldLabels = sum(labelCountsGold.values())
print("baseline accuracy:",round(largestLabelCount/nbrOfGoldLabels,3))

In [None]:
# Count the predicted labels: the first label of every predicted item
labelCountsPredicted = {}
for foldPrediction in predictionLabels:
    for itemLabels in foldPrediction[0]:
        firstLabel = itemLabels[0]
        labelCountsPredicted[firstLabel] = labelCountsPredicted.get(firstLabel,0)+1
labelCountsPredicted

In [None]:
# Sum of the differences between predicted and gold label counts;
# a gold label that was never predicted has a predicted count of zero
print("total absolute deviation",sum([abs(labelCountsPredicted.get(l,0)-labelCountsGold[l]) for l in labelCountsGold]))

In [None]:
# Difference between predicted and gold label count relative to the gold count;
# a gold label that was never predicted has a predicted count of zero
print("relative deviation per label:",\
      {l:round(abs(labelCountsPredicted.get(l,0)-labelCountsGold[l])/labelCountsGold[l],3) for l in labelCountsGold})

In [None]:
# Confusion matrix of the best run: rows are gold labels, columns are predicted labels
makeConfusionMatrix(fasttextData,predictionLabels)

For the mondkapje data, fasttext predicts 63% of the labels correctly without an external dictionary and 66% with a Wikipedia dictionary (baseline: 46%). It overestimates the presence of negative labels and underestimates the level of positive and neutral labels. The amount of irrelevant labels is about right.

In [None]:
# Factor per label that turns the predicted label count into the gold label count
multiplicationFactors = {label:labelCountsGold[label]/labelCountsPredicted[label]
                         for label in labelCountsGold}
multiplicationFactors

For mondkapje tweets, the multiplication factor for the positive label is unrealistically high so we will not use these factors.

In [None]:
# Sanity check: the corrected predicted counts should add up to the number of gold labels
predictedNumberOfLabels = sum(multiplicationFactors[label]*labelCountsPredicted[label]
                              for label in labelCountsGold)
print(predictedNumberOfLabels,sum(labelCountsGold.values()))

## Relation data size - accuracy

In [None]:
# Average scores over N runs for increasing maximum training data sizes
N = 5
expResultsAll = {}
for trainingSize in [100,200,500,1000,2000,5000,10000]:
    expResultsTrainingSize = []
    # the size stored with the results is never larger than the data size
    selectionSize = min(trainingSize,len(fasttextData))
    for i in range(0,N):
        # runFasttext returns validation results followed by test results:
        # the test results are evaluated here
        _,_,predictionCounts,predictionLabels = runFasttext(fasttextData,dim=BESTDIM,epoch=BESTEPOCH,lr=BESTLR,squealFlag=False,maxTrain=trainingSize)
        expResultsTrainingSize.append(evaluate(predictionCounts,predictionLabels,fasttextData,printResults=False))
    average = (np.average([x[0] for x in expResultsTrainingSize]),np.average([x[1] for x in expResultsTrainingSize]))
    expResultsAll[selectionSize] = average
    print("{0:4} {1:0.3f} {2:0.3f}".format(selectionSize,average[0],average[1]))
    # stop after the first size that covers all data
    if selectionSize >= len(fasttextData): break

In [None]:
# Remove the result that was stored last
lastTrainingSize = list(expResultsAll.keys())[-1]
del(expResultsAll[lastTrainingSize])

In [None]:
# Two panels next to each other: first score (accuracy) and second score
# (fraction) against the training data size on a logarithmic axis
plt.figure(figsize=(16,6))

panels = [(121,0,"accuracy","Accuracy related to training size"),
          (122,1,"fraction","Rejects/Supports fraction related to training size")]
for position,scoreIndex,yLabel,panelTitle in panels:
    ax1 = plt.subplot(position)
    plt.plot(list(expResultsAll.keys()),[x[scoreIndex] for x in list(expResultsAll.values())])
    plt.xscale("log")
    plt.ylabel(yLabel)
    plt.xlabel("training data size")
    plt.title(panelTitle)

plt.show()

## Classifying unlabeled tweets

In [None]:
import fasttext
import matplotlib
import numpy as np
import os
import pandas as pd
import re
from nltk.tokenize import TweetTokenizer

In [None]:
DISTANCE = "distance"
FACEMASK = "mondkapje"

# The second assignment overrides the first: the topic used from here on is FACEMASK
# NOTE(review): the data file creation section selects DISTANCE -- confirm
# that using a different topic in this section is intended
TOPIC = DISTANCE
TOPIC = FACEMASK

In [None]:
# Data directories: one per source
BASEDIR = "/home/erikt/projects/puregome/data/"
DATADIRECTORY = BASEDIR+"text/"
DATADIRECTORYREDDIT = BASEDIR+"reddit/text/"
DATADIRECTORYNUNL = BASEDIR+"nunl/text/"
TWITTER = "twitter"
REDDIT = "reddit"
NUNL = "nunl"
SOURCES = [TWITTER,REDDIT,NUNL]
DATADIRECTORIES = dict(zip(SOURCES,[DATADIRECTORY,DATADIRECTORYREDDIT,DATADIRECTORYNUNL]))

# Training data file of the selected topic
FILEFASTTEXT = "fasttext-"+TOPIC+".csv"

# Label names: from here on these include the fasttext label prefix
LABELPREFIX = "__label__"
NEGATIVE,POSITIVE,NEUTRAL,IRRELEVANT,SUPPORTS,REJECTS = \
    [LABELPREFIX+name for name in ("NEGATIVE","POSITIVE","NEUTRAL","IRRELEVANT","SUPPORTS","REJECTS")]

# Column names and the key under which classify stores the number of relevant texts
TEXT = "text"
USER = "user"
TOTAL = "total"

# The defaults are replaced by the best parameters
DIM,EPOCH,LR = BESTDIM,BESTEPOCH,BESTLR

In [None]:
def cleanup(text):
    """Normalize a text: replace literal backslash-n sequences by a space,
    remove https:// links, collapse whitespace runs to one space and strip
    leading and trailing whitespace."""
    substitutions = [(r"\\n"," "),(r"https://\S+",""),(r"\s+"," ")]
    for pattern,replacement in substitutions:
        text = re.sub(pattern,replacement,text)
    return(text.strip())

def tokenize(text):
    """Split text with the nltk TweetTokenizer and join the tokens with single spaces."""
    tokens = TweetTokenizer().tokenize(text)
    return(" ".join(tokens))

def preprocess(text):
    """Clean up, tokenize and lowercase text (in that order)."""
    cleanedText = cleanup(text)
    tokenizedText = tokenize(cleanedText)
    return(tokenizedText.lower())

In [None]:
def makeGrepCommandFromQuery(query):
    """Build a grep command string with one quoted -e option for each
    part of query that is separated by a vertical bar."""
    options = [' -e "'+orPart+'"' for orPart in query.split("|")]
    return("grep"+"".join(options))

def readData(datePattern,query,dataDirectory=DATADIRECTORY):
    """Collect the preprocessed texts that match query.

    Arguments:
    datePattern: regular expression, only files in dataDirectory with a
                 matching name are read (in sorted order)
    query: regular expression, matched case-insensitively against each text
    dataDirectory: directory with the csv files; the file name is appended
                   as is so the directory name must end with a separator

    Duplicate rows of a file are dropped. A file that cannot be read, or that
    has no text column, is skipped with a warning on stderr. Values in the
    text column that are not strings (for example missing texts) are ignored.

    Returns the list of preprocessed matching texts.
    """
    testData = []
    for inFileName in sorted(os.listdir(dataDirectory)):
        inFilePath = dataDirectory+inFileName
        if not (re.search(datePattern,inFileName) and os.path.exists(inFilePath)):
            continue
        try:
            fileData = pd.read_csv(inFilePath).drop_duplicates()
            texts = fileData[TEXT]
        except Exception as e:
            # best effort: an unreadable file should not stop the processing
            # of the other files, but it should not go unnoticed either
            print("warning: skipping",inFilePath,":",repr(e),file=sys.stderr)
            continue
        for line in texts:
            # a missing text is read as a non-string value: skip the value
            # instead of losing all texts of the file
            if isinstance(line,str) and re.search(query,line,flags=re.IGNORECASE):
                testData.append(preprocess(line))
    return(testData)
    
def classify(datePattern,query,model,dataDirectory=DATADIRECTORY):
    """Label the texts selected by readData and summarize the labels.

    Arguments:
    datePattern, query, dataDirectory: passed on to readData
    model: model with a predict method that, for a list of texts, returns a
           tuple of which the first element holds the labels per text

    Returns a dictionary with, for each predicted label, its count as a
    percentage of the number of texts with a label other than IRRELEVANT
    (the percentage of IRRELEVANT can therefore be larger than 100), and
    under the key TOTAL the number of texts with a label other than
    IRRELEVANT. When there are no such texts the percentages are undefined
    and only {TOTAL: 0} is returned.
    """
    testData = readData(datePattern,query,dataDirectory)
    labelCountsPredicted = {}
    if testData:
        predictedLabels = model.predict(testData)
        # count the first label of every text
        for textLabels in predictedLabels[0]:
            label = textLabels[0]
            labelCountsPredicted[label] = labelCountsPredicted.get(label,0)+1
    nbrOfLabels = sum([count for label,count in labelCountsPredicted.items() if label != IRRELEVANT])
    if nbrOfLabels == 0:
        # no texts or only irrelevant texts: avoid a division by zero
        return({TOTAL:0})
    labelPercentages = {label:round(100*count/nbrOfLabels,1) for label,count in labelCountsPredicted.items()}
    labelPercentages[TOTAL] = nbrOfLabels
    return(labelPercentages)

In [None]:
# Largest day number that is tried for every month
MAXDAYSPERMONTH = 31

if TOPIC not in [DISTANCE,FACEMASK]:
    print("unknown topic!",file=sys.stderr)
    sys.exit(1)
# Texts are selected with a regular expression for social distancing and
# with the topic word itself for facemasks
QUERY = "1[.,]5[ -]*m|afstand.*hou|hou.*afstand|anderhalve[ -]*meter" if TOPIC == DISTANCE else TOPIC

# Train the model on all labeled data with the best parameters
model = fasttext.train_supervised(FILEFASTTEXT,dim=BESTDIM,epoch=BESTEPOCH,lr=BESTLR)

In [None]:
# Classify the texts of every source per day; results[source][date] holds
# the label percentages of the days with labeled texts and totals[source]
# the number of relevant texts of the source
results = {}
totals = {}
for source in SOURCES:
    dataDirectory = DATADIRECTORIES[source]
    total = 0
    results[source] = {}
    for month in "202002 202003 202004 202005 202006 202007".split():
        for day in range(1,MAXDAYSPERMONTH+1):
            date = month+str(day).zfill(2)
            try:
                labels = classify(date,QUERY,model,dataDirectory=dataDirectory)
            except Exception as e:
                # report the cause and continue with the next day; Exception
                # leaves KeyboardInterrupt alone so the loop can be stopped
                print(source,date,"error",repr(e))
                continue
            for label in [REJECTS,SUPPORTS,IRRELEVANT]: labels.setdefault(label,0)
            if labels[REJECTS]+labels[SUPPORTS]+labels[IRRELEVANT] > 0:
                results[source][date] = labels
                print(source,date,labels[REJECTS],labels[SUPPORTS],labels[IRRELEVANT],labels[TOTAL])
                total += labels[TOTAL]
    totals[source] = total
    print("total relevant found:",total)

## Visualization over time

In [None]:
import datetime
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
def movingAverage(numberList,size):
    """Return for every position of numberList the average of the number at
    that position and at most size-1 numbers before it; fewer numbers are
    used at the start of the list."""
    averages = []
    for position,number in enumerate(numberList):
        windowSum = number
        numberUsed = 1
        # walk back from the current position, never past the list start
        for offset in range(1,min(size,position+1)):
            windowSum += numberList[position-offset]
            numberUsed += 1
        averages.append(windowSum/numberUsed)
    return(averages)

In [None]:
# Plot the moving average of the support percentage per source from STARTDATE on
DATEFORMAT = "%Y%m%d"
DAYSCOMBINED = 7
STARTDATE = "20200310"
LABELS = {TWITTER:"Twitter ({0})".format(totals[TWITTER]),\
          NUNL:"Nu.nl ({0})".format(totals[NUNL]),\
          REDDIT:"Reddit ({0})".format(totals[REDDIT])}
if TOPIC == DISTANCE:
    PLOTFILENAME = "social-distancing-all.png"
elif TOPIC == FACEMASK:
    PLOTFILENAME = "mondkapje-all.png"
else:
    print("unknown topic!",file=sys.stderr)
    sys.exit(1)

# parse the start date once instead of once per data point
startDate = datetime.datetime.strptime(STARTDATE,DATEFORMAT)
x = {}
ySupports = {}
plt.subplots(figsize=(12,6))
font = {"size":12}
matplotlib.rc("font",**font)
ax = plt.subplot(111)
for source in [REDDIT,NUNL,TWITTER]:
    x[source] = [datetime.datetime.strptime(k,DATEFORMAT) for k in results[source].keys()]
    # the average is computed over all days, the plot starts at the start date
    ySupports[source] = movingAverage([results[source][k][SUPPORTS] for k in results[source].keys()],DAYSCOMBINED)
    xPart = [date for date in x[source] if date >= startDate]
    ySupportsPart = [y for date,y in zip(x[source],ySupports[source]) if date >= startDate]
    plt.plot_date(xPart,ySupportsPart,fmt="-",label=LABELS[source])
plt.legend()
plt.ylabel("percentage")
plt.xlabel("day/month")
if TOPIC == FACEMASK:
    plt.title("Support for general public not wearing face masks per medium over time (average over "+str(DAYSCOMBINED)+" days)")
elif TOPIC == DISTANCE:
    plt.title("Support for social distancing per medium over time (average over "+str(DAYSCOMBINED)+" days)")
else:
    print("unknown topic!",file=sys.stderr)
    sys.exit(1)
# ticks at the first day of each month (the April date was misspelled as 2020401)
plt.xticks([datetime.datetime.strptime(d,DATEFORMAT) for d in "20200301 20200401 20200501 20200601 20200701 20200801".split()])
ax.xaxis.set_major_formatter(mdates.DateFormatter("%-d/%-m"))
plt.savefig(PLOTFILENAME)
plt.show()

In [None]:
# Key under which the manual annotations are stored in results, and their file
ANNOTATED = "annotated"
ANNOTATEDFILE = "manual-annotation.csv"

# Columns of the manual annotation file
DATE = "date"
EENS,ONEENS,ANDERS = "EENS","ONEENS","ANDERS"

# Name of the column with the number of annotated items per date
TOTAL = "total"

In [None]:
# Convert the manual annotation counts per date to percentages of the
# number of relevant annotations (EENS+ONEENS), as done for the predicted
# labels; dates without relevant annotations get ANDERS = 100
df = pd.read_csv(ANNOTATEDFILE,index_col=DATE,dtype={EENS:"float",ONEENS:"float",ANDERS:"float"})
percentages = {EENS:[],ONEENS:[],ANDERS:[]}
# separate name: totals holds the number of relevant texts per source
annotatedTotals = []
for eens,oneens,anders in zip(df[EENS],df[ONEENS],df[ANDERS]):
    relevant = eens+oneens
    annotatedTotals.append(relevant+anders)
    if relevant > 0:
        percentages[EENS].append(100*eens/relevant)
        percentages[ONEENS].append(100*oneens/relevant)
        percentages[ANDERS].append(100*anders/relevant)
    else:
        percentages[EENS].append(eens)
        percentages[ONEENS].append(oneens)
        percentages[ANDERS].append(100.0)
# whole columns are replaced: assigning to df.iloc[i][column] would change
# a temporary row object and is not guaranteed to change df itself
for column in percentages: df[column] = percentages[column]
df[TOTAL] = annotatedTotals
annotatedDict = df.T.to_dict(orient="dict")
annotatedDict = {d:annotatedDict[d] for d in sorted(annotatedDict.keys())}
results[ANNOTATED] = annotatedDict

In [None]:
# Plot the moving average of the predicted support on Twitter together with
# the moving average of the manually annotated support, from STARTDATE on
PLOTFILEANNOTATED = "manual-annotation.png"

startDate = datetime.datetime.strptime(STARTDATE,DATEFORMAT)
x = {}
ySupports = {}
plt.subplots(figsize=(12,6))
font = {"size":12}
matplotlib.rc("font",**font)
ax = plt.subplot(111)
# per line: the source, the key of the support percentage and the legend label
for source,supportKey,plotLabel in [(TWITTER,SUPPORTS,"predicted"),(ANNOTATED,EENS,ANNOTATED)]:
    dayKeys = list(results[source].keys())
    x[source] = [datetime.datetime.strptime(str(k),DATEFORMAT) for k in dayKeys]
    ySupports[source] = movingAverage([results[source][k][supportKey] for k in dayKeys],DAYSCOMBINED)
    selected = [i for i in range(0,len(x[source])) if x[source][i] >= startDate]
    xPart = [x[source][i] for i in selected]
    ySupportsPart = [ySupports[source][i] for i in selected]
    plt.plot_date(xPart,ySupportsPart,fmt="-",label=plotLabel)
plt.title("Support for social distancing per medium over time (average over "+str(DAYSCOMBINED)+" days)")
plt.legend()
plt.savefig(PLOTFILEANNOTATED)
plt.show()

In [None]:
# Pair the predicted and the annotated support of the dates from 10 March
# 2020 on that occur in both series and compute their correlation
annotatedValues = []
twitterValues = []

firstDate = datetime.datetime.strptime("20200310",DATEFORMAT)
for i,date in enumerate(x[TWITTER]):
    if date >= firstDate:
        for j,annotatedDate in enumerate(x[ANNOTATED]):
            if annotatedDate == date:
                twitterValues.append(ySupports[TWITTER][i])
                annotatedValues.append(ySupports[ANNOTATED][j])
np.corrcoef(annotatedValues,twitterValues)[1][0]

In [None]:
# Scatter plot of the annotated (x) against the predicted (y) support percentages
PLOTFILENAME = "plot.png"

plt.figure()
plt.scatter(annotatedValues,twitterValues)
plt.title("Annotated and predicted support for social distancing")
plt.xlabel("Annotated support percentages")
plt.ylabel("Predicted support percentages")
plt.savefig(PLOTFILENAME)
plt.show()

In [None]:
# Keep all results and continue with the Twitter results only
allResults = results
results = results[TWITTER]
dayKeys = list(results.keys())
x = [datetime.datetime.strptime(k,DATEFORMAT) for k in dayKeys]
# moving averages of the three label percentages
ySupports,yRejects,yIrrelevant = [movingAverage([results[k][label] for k in dayKeys],DAYSCOMBINED)
                                  for label in (SUPPORTS,REJECTS,IRRELEVANT)]

# Key dates: days after which the irrelevant percentage crosses the 100 line
keyDates = []
for i in range(0,len(yIrrelevant)-1):
    current,following = yIrrelevant[i],yIrrelevant[i+1]
    if (current < 100 and following >= 100) or (current >= 100 and following < 100):
        keyDates.append(dayKeys[i])
        print(keyDates[-1])

In [None]:
# Left panel: moving averages of all three labels for all days; right panel:
# supports and rejects between STARTDATE and ENDDATE
STARTDATE = "20200310" # keyDates[0]
ENDDATE = "20201231"
# the output file is named after the selected topic
if TOPIC == DISTANCE: PLOTFILEOUT = "social-distancing.png"
else: PLOTFILEOUT = "mondkapje.png"

results = {k:results[k] for k in sorted(results.keys())}
font = {"size":14}
matplotlib.rc("font",**font)
plt.subplots(figsize=(14,7))

ax1 = plt.subplot(121)
plt.plot_date(x,ySupports,fmt="-",label="Supports")
plt.plot_date(x,yRejects,fmt="-",label="Rejects")
plt.plot_date(x,yIrrelevant,fmt="-",label="Other")
plt.plot_date(x,[100 for i in x],fmt="-",label="100%",color="black")
for date in [STARTDATE]:
    plt.plot_date([datetime.datetime.strptime(date,DATEFORMAT)],[100],color="black")
plt.legend()
plt.ylabel("percentage")
plt.xlabel("day/month")
# NOTE(review): both titles mention social distancing (and the second one
# Nu.nl) whatever topic and source are plotted -- confirm the intended texts
plt.title("stance on the RIVM policy on "+"social")

# dates are strings in DATEFORMAT so they can be compared as strings
dates = [d for d in results.keys()]
selected = [i for i in range(0,len(dates)) if dates[i] >= STARTDATE and dates[i] <= ENDDATE]
xPart = [x[i] for i in selected]
ySupportsPart = [ySupports[i] for i in selected]
yRejectsPart = [yRejects[i] for i in selected]
ax1.xaxis.set_major_formatter(mdates.DateFormatter("%-d/%-m"))

ax2 = plt.subplot(122)
plt.plot_date(xPart,ySupportsPart,fmt="-",label="Supports")
plt.plot_date(xPart,yRejectsPart,fmt="-",label="Rejects")
plt.legend()
plt.ylabel("percentage")
plt.xlabel("day/month")
plt.title("Nu.nl on social distancing (average over "+str(DAYSCOMBINED)+" days)")
# ticks at the first day of each month (the April date was misspelled as 2020401)
plt.xticks([datetime.datetime.strptime(d,DATEFORMAT) for d in "20200301 20200401 20200501 20200601 20200701 20200801".split()])
ax2.xaxis.set_major_formatter(mdates.DateFormatter("%-d/%-m"))

plt.savefig(PLOTFILEOUT)
plt.show()

In [None]:
# Relative frequency of each gold label in the training data
counts = {}
for text in fasttextData:
    label = text.split()[0]
    counts[label] = counts.get(label,0)+1
nbrOfTexts = len(fasttextData)
for label,count in counts.items(): print(round(count/nbrOfTexts,3),label)

## Annotator comparison

In [None]:
# Collect the labels per annotator and, for the first two annotators, the
# labels of the tweets that both annotated (kappaData); tweets among the
# first NBROFTESTDATA tweets of any annotator are left out of kappaData
NBROFTESTDATA = 100

annotators = []
annotatorData = {}
kappaData = [{},{}]
blockedTweetIds = {}
for i in range(0,len(annotations)):
    row = annotations.iloc[i]
    annotator,tweetId,tweetLabel = row[0],row[2],row[4]
    if annotator not in annotatorData:
        annotatorData[annotator] = {}
        annotators.append(annotator)
    annotatorData[annotator][tweetId] = tweetLabel
    if len(annotatorData[annotator]) <= NBROFTESTDATA: blockedTweetIds[tweetId] = True
    if len(annotators) < 2 or tweetId in blockedTweetIds: continue
    labelsFirst = annotatorData[annotators[0]]
    labelsSecond = annotatorData[annotators[1]]
    if tweetId in labelsFirst and tweetId in labelsSecond:
        kappaData[0][tweetId] = labelsFirst[tweetId]
        kappaData[1][tweetId] = labelsSecond[tweetId]

In [None]:
# Per annotator: the number of annotated tweets and the relative frequency of each label
for annotator,annotatorLabels in annotatorData.items():
    labelCount = {}
    for label in annotatorLabels.values():
        labelCount[label] = labelCount.get(label,0)+1
    nbrOfAnnotatedTweets = len(annotatorLabels)
    print(nbrOfAnnotatedTweets)
    for label,count in labelCount.items(): print(round(count/nbrOfAnnotatedTweets,3),label)

In [None]:
from sklearn.metrics import cohen_kappa_score

# Agreement between the first two annotators on the tweets both annotated
sharedTweetIds = list(kappaData[0])
labelsFirstAnnotator = [kappaData[0][sharedId] for sharedId in sharedTweetIds]
labelsSecondAnnotator = [kappaData[1][sharedId] for sharedId in sharedTweetIds]
cohen_kappa_score(labelsFirstAnnotator,labelsSecondAnnotator)

In [None]:
# Count the tweets labeled by the main annotator and by another annotator;
# for the first MAX of these, count identical labels and print the others
MAX = 100

total = 0
identical = 0
mainLabels = annotatorData[mainAnnotator]
for tweetId in mainLabels:
    for annotator in annotators:
        if annotator == mainAnnotator or tweetId not in annotatorData[annotator]: continue
        total += 1
        if total > MAX: continue
        otherLabel = annotatorData[annotator][tweetId]
        if otherLabel == mainLabels[tweetId]: identical += 1
        else:
            print(tweetId,mainLabels[tweetId],otherLabel)
            #print(tweets.loc[tweetId][TEXT])
print(total,identical)