# Experiments with the antivac data of Kunneman et al 2020

F. Kunneman, M. Lambooij, A. Wong, A. van den Bosch, L. Mollema. Monitoring stance towards vaccination in Twitter messages. In: BMC Medical Informatics and Decision Making. 20, 1, p. 1-14, 2020, doi 10.1186/s12911-020-1046-y. Data: http://cls.ru.nl/~fkunneman/data_stance_vaccination.zip

In [68]:
import fasttext
import json
import os
import pandas as pd
import random
import re
from nltk.tokenize import TweetTokenizer
from IPython.display import clear_output

In [72]:
DATADIR = "/home/erikt/projects/puregome/data/data_stance_vaccination/"
DATAFILE = "tweetids_labels.txt"
TWEETSFILE = "getTweetsId.py.out"
POLARITY = "Polarity"
SENTIMENT = "Sentiment"
IDSTR = "id_str"
TEXT = "text"
TOKENIZEDTEXT = "tokenizedtext"
LABELINGTYPE = "Labeling type"

In [70]:
def readAnnotations(inFileName):
    return(pd.read_csv(inFileName,sep="\t",index_col="Tweet ID"))
           
dfAnnotations = readAnnotations(DATADIR+DATAFILE)

In [71]:
dfAnnotations

Unnamed: 0_level_0,Labeling type,Binary,Irrelevance Filter,Polarity,Polarity + Sentiment
Tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
560765984135737346,strict,Other,Irrelevant,Irrelevant,Irrelevant
354874289536761856,strict,Other,Other,Positive,Positive + Other
353117242700988416,strict,Other,Other,Positive,Positive + Frustration
534595037599105024,strict,Other,Irrelevant,Irrelevant,Irrelevant
790283805324152832,strict,Other,Other,Positive,Positive + Other
...,...,...,...,...,...
790103347877535744,one,Negative,Negative,Negative,Negative
799739264128913408,one,Negative,Negative,Negative,Negative
746765559484776448,one,Negative,Negative,Negative,Negative
599494572859883520,one,Negative,Negative,Negative,Negative


In [73]:
TEXT = "text"
FULLTEXT = "full_text"
EXTENDEDTWEET = "extended_tweet"
RETWEETEDSTATUS = "retweeted_status"

def getTweetText(jsonData):
    text = ""
    if TEXT in jsonData: 
        text = jsonData[TEXT]
    if EXTENDEDTWEET in jsonData and \
       FULLTEXT in jsonData[EXTENDEDTWEET]:
        text = jsonData[EXTENDEDTWEET][FULLTEXT]
    if RETWEETEDSTATUS in jsonData and \
       EXTENDEDTWEET in jsonData[RETWEETEDSTATUS] and \
       FULLTEXT in jsonData[RETWEETEDSTATUS][EXTENDEDTWEET]:
        text = jsonData[RETWEETEDSTATUS][EXTENDEDTWEET][FULLTEXT]
    return(text)

In [74]:
def readTweetTexts(inFileName):
    tweetTexts = {}
    inFile = open(inFileName,"r")
    for line in inFile:
        jsonData = json.loads(line.strip())
        idStr = jsonData[IDSTR]
        text = getTweetText(jsonData)
        tweetTexts[idStr] = text
    inFile.close()
    return(tweetTexts)

dictTweetTexts = readTweetTexts(DATADIR+TWEETSFILE)

In [78]:
def makeFasttextData(dfAnnotations,dictTweetTexts,labelingType=""):
    fasttextData = []
    for idStr in dictTweetTexts:
        polarity = re.sub(" ","_",dfAnnotations.loc[int(idStr)][POLARITY])
        tokenizedText = " ".join(TweetTokenizer().tokenize(dictTweetTexts[idStr])).lower()
        if labelingType == "" or dfAnnotations.loc[int(idStr)][LABELINGTYPE] == labelingType:
            fasttextData.append({IDSTR:idStr,POLARITY:polarity,TOKENIZEDTEXT:tokenizedText})
    return(fasttextData)
        
fasttextData = makeFasttextData(dfAnnotations,dictTweetTexts,labelingType="strict")

In [79]:
len(fasttextData)

2052

In [80]:
PRETRAINEDDIR = "/home/erikt/projects/newsgac/fasttext-runs/"
WIKIFILENAME = "wiki.nl.vec"
DIM = 300
EPOCH = 100
LR = 0.1
N = 10
LARGEINT = 9999999999
LABELPREFIX = "__label__"
TRAIN = "TRAIN"+str(int(random.random()*LARGEINT))
TEST = "TEST"+str(int(random.random()*LARGEINT))

In [81]:
def runFasttext(fasttextData,dim=DIM,epoch=EPOCH,lr=LR,n=N):
    predictionCounts = []
    predictionLabels = []
    for fold in range(0,n):
        clear_output(wait=True)
        print("starting fold",fold)
        testStart = round(fold*len(fasttextData)/n)
        testEnd = round((fold+1)*len(fasttextData)/n)
        trainFile = open(TRAIN,"w")
        testFile = open(TEST,"w")
        testData = []
        for i in range(0,len(fasttextData)):
            data = LABELPREFIX+fasttextData[i][POLARITY]+" "+fasttextData[i][TOKENIZEDTEXT]
            if i < testStart or i >= testEnd: 
                print(data,file=trainFile)
            else: 
                print(data,file=testFile)
                testData.append(data)
        testFile.close()
        trainFile.close()
        model = fasttext.train_supervised(TRAIN,dim=dim,epoch=epoch,lr=lr) #,pretrainedVectors=PRETRAINEDDIR+WIKIFILENAME)
        predictionCounts.append([*model.test(TEST)])
        predictionLabels.extend(model.predict(testData)[0])
        os.unlink(TRAIN)
        os.unlink(TEST)
    clear_output(wait=True)
    print("finished")
    return(predictionCounts,predictionLabels)

In [82]:
predictionCounts,predictionLabels = runFasttext(fasttextData,dim=300,epoch=50)

finished


In [83]:
def showOverallPrecision(predictionCounts):
    caseTotal = 0
    pTotal = 0
    rTotal = 0
    for i in range(0,len(predictionCounts)):
        caseTotal += predictionCounts[i][0]
        pTotal += predictionCounts[i][0]*predictionCounts[i][1]
        rTotal += predictionCounts[i][0]*predictionCounts[i][2]
    print("cases: {0}; precision: {1}; recall: {2}".format(caseTotal,round(pTotal/caseTotal,3),round(rTotal/caseTotal,3)))
    
showOverallPrecision(predictionCounts)

cases: 2052; precision: 0.533; recall: 0.533


In [84]:
GOLD = "gold"
PREDICTED = "predicted"
CORRECT = "correct"

labelResults = {}
for i in range(0,len(fasttextData)):
    labelGold = fasttextData[i][POLARITY]
    labelPredicted = re.sub(LABELPREFIX,"",predictionLabels[i][0])
    if not labelGold in labelResults: labelResults[labelGold] = {GOLD:0,PREDICTED:0,CORRECT:0}
    if not labelPredicted in labelResults: labelResults[labelPredicted] = {GOLD:0,PREDICTED:0,CORRECT:0}
    labelResults[labelGold][GOLD] += 1
    labelResults[labelPredicted][PREDICTED] += 1
    if labelGold == labelPredicted: labelResults[labelGold][CORRECT] += 1
for label in labelResults:
    precision = labelResults[label][CORRECT]/labelResults[label][PREDICTED]
    recall = labelResults[label][CORRECT]/labelResults[label][GOLD]
    f1 = 2*precision*recall/(precision+recall)
    print("precision:",round(precision,3),"recall:",round(recall,3),"f1:",round(f1,3),label)

precision: 0.344 recall: 0.276 f1: 0.307 Negative
precision: 0.234 recall: 0.15 f1: 0.182 Not_clear
precision: 0.313 recall: 0.268 f1: 0.289 Neutral
precision: 0.496 recall: 0.504 f1: 0.5 Irrelevant
precision: 0.649 recall: 0.739 f1: 0.691 Positive


In [85]:
def countGoldLabels(fasttextData):
    labelCountsGold = {}
    for i in range(0,len(fasttextData)):
        label = fasttextData[i][POLARITY]
        if label in labelCountsGold: labelCountsGold[label] += 1
        else: labelCountsGold[label] = 1
    return(labelCountsGold)

labelCountsGold = countGoldLabels(fasttextData)
labelCountsGold

{'Negative': 228,
 'Not_clear': 167,
 'Irrelevant': 421,
 'Positive': 982,
 'Neutral': 254}

In [86]:
def getBaselineAccuracy(labelCountsGold):
    return(round(max(labelCountsGold.values())/sum(labelCountsGold.values()),3))

print("baseline accuracy:",getBaselineAccuracy(labelCountsGold))

baseline accuracy: 0.479


In [87]:
def countPredictedLabels(predictionLabels):
    labelCountsPredicted = {}
    for i in range(0,len(predictionLabels)):
        label = predictionLabels[i][0]
        if label in labelCountsPredicted: labelCountsPredicted[label] += 1
        else: labelCountsPredicted[label] = 1
    return(labelCountsPredicted)

labelCountsPredicted = countPredictedLabels(predictionLabels)
labelCountsPredicted

{'__label__Negative': 183,
 '__label__Neutral': 217,
 '__label__Irrelevant': 427,
 '__label__Positive': 1118,
 '__label__Not_clear': 107}

In [88]:
def makeConfusionMatrix(fasttextData,predictionLabels):
    goldLabels = pd.Series([fasttextData[i][POLARITY] for i in range(0,len(fasttextData))])
    predictedLabels = pd.Series([predictionLabels[i][0] for i in range(0,len(predictionLabels))])
    return(pd.crosstab(goldLabels,predictedLabels))

makeConfusionMatrix(fasttextData,predictionLabels)

col_0,__label__Irrelevant,__label__Negative,__label__Neutral,__label__Not_clear,__label__Positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Irrelevant,212,27,38,22,122
Negative,37,63,20,15,93
Neutral,41,19,68,17,109
Not_clear,33,18,23,25,68
Positive,104,56,68,28,726


In [89]:
print("total absolute deviation",sum([abs(labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l]) for l in labelCountsGold]))

total absolute deviation 284


In [90]:
print("relative deviation per label:",\
      {l:round((labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l])/labelCountsGold[l],3) for l in labelCountsGold})

relative deviation per label: {'Negative': -0.197, 'Not_clear': -0.359, 'Irrelevant': 0.014, 'Positive': 0.138, 'Neutral': -0.146}


# Combine stance data sets with different topics

In [98]:
TOPIC = "mondkapje"
FILETWEETS = TOPIC+"-tweets.csv"
FILEANNOTATIONS = "human-labels-"+TOPIC+"-tweets.txt"
LABEL = "label"
TEXT = "text"
USER = "user"
DATASIZEANTIVAC = len(fasttextData)

In [92]:
def readAnnotations(inFileName):
    return(pd.read_csv(inFileName,header=None,sep=" "))

dfAnnotations = readAnnotations(FILEANNOTATIONS)

In [93]:
def readTweets(inFileName):
    return(pd.read_csv(inFileName,header=None,index_col=0))

dfTweets = readTweets(FILETWEETS)

In [94]:
def convertLabel(label):
    label = re.sub("NEUTRAL","Neutral",label)
    label = re.sub("POSITIVE","Positive",label)
    label = re.sub("NEGATIVE","Negative",label)
    label = re.sub("IRRELEVANT","Irrelevant",label)
    return(label)

def makeFasttextData(dfAnnotations,dfTweets):
    fasttextData = []
    seen = {}
    for i in range(0,len(dfAnnotations)):
        tweetId = dfAnnotations.iloc[i][2]
        if tweetId in list(dfTweets.index):
            tweetLabel = convertLabel(dfAnnotations.iloc[i][4])
            tweetUser = dfTweets.loc[tweetId][1]
            tokenizedText = " ".join(TweetTokenizer().tokenize(dfTweets.loc[tweetId][2])).lower()
            if not tokenizedText in seen:
                seen[tokenizedText] = True
                fasttextData.append({IDSTR:tweetId,POLARITY:tweetLabel,TOKENIZEDTEXT:tokenizedText})
    return(fasttextData)

fasttextDataTopic = makeFasttextData(dfAnnotations,dfTweets)

In [95]:
fasttextDataCombined = fasttextData+fasttextDataTopic

In [96]:
len(fasttextData),len(fasttextDataTopic),len(fasttextDataCombined)

(2052, 593, 2645)

In [97]:
predictionCounts,predictionLabels = runFasttext(fasttextDataCombined,dim=300,epoch=50)

finished


In [99]:
showOverallPrecision(predictionCounts)

cases: 2645; precision: 0.544; recall: 0.544


### Results data section antivac

In [101]:
labelCountsGold = countGoldLabels(fasttextDataCombined[:DATASIZEANTIVAC])
labelCountsGold

{'Negative': 228,
 'Not_clear': 167,
 'Irrelevant': 421,
 'Positive': 982,
 'Neutral': 254}

In [102]:
print("baseline accuracy:",getBaselineAccuracy(labelCountsGold))

baseline accuracy: 0.479


In [103]:
labelCountsPredicted = countPredictedLabels(predictionLabels[:DATASIZEANTIVAC])
labelCountsPredicted

{'__label__Negative': 184,
 '__label__Neutral': 218,
 '__label__Irrelevant': 441,
 '__label__Positive': 1112,
 '__label__Not_clear': 97}

In [104]:
makeConfusionMatrix(fasttextDataCombined[:DATASIZEANTIVAC],predictionLabels[:DATASIZEANTIVAC])

col_0,__label__Irrelevant,__label__Negative,__label__Neutral,__label__Not_clear,__label__Positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Irrelevant,214,19,39,21,128
Negative,41,68,19,11,89
Neutral,47,13,65,24,105
Not_clear,30,22,28,19,68
Positive,109,62,67,22,722


In [105]:
print("total absolute deviation",sum([abs(labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l]) for l in labelCountsGold]))

total absolute deviation 300


In [106]:
print("relative deviation per label:",\
      {l:round((labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l])/labelCountsGold[l],3) for l in labelCountsGold})

relative deviation per label: {'Negative': -0.193, 'Not_clear': -0.419, 'Irrelevant': 0.048, 'Positive': 0.132, 'Neutral': -0.142}


### Results data section mondkapje

In [108]:
labelCountsGold = countGoldLabels(fasttextDataCombined[DATASIZEANTIVAC:])
labelCountsGold

{'Neutral': 53, 'Irrelevant': 236, 'Negative': 272, 'Positive': 32}

In [109]:
print("baseline accuracy:",getBaselineAccuracy(labelCountsGold))

baseline accuracy: 0.459


In [110]:
labelCountsPredicted = countPredictedLabels(predictionLabels[DATASIZEANTIVAC:])
labelCountsPredicted

{'__label__Neutral': 30,
 '__label__Irrelevant': 225,
 '__label__Negative': 315,
 '__label__Positive': 19,
 '__label__Not_clear': 4}

In [111]:
makeConfusionMatrix(fasttextDataCombined[DATASIZEANTIVAC:],predictionLabels[DATASIZEANTIVAC:])

col_0,__label__Irrelevant,__label__Negative,__label__Neutral,__label__Not_clear,__label__Positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Irrelevant,147,72,10,1,6
Negative,51,195,11,3,12
Neutral,16,29,8,0,0
Positive,11,19,1,0,1


In [112]:
print("total absolute deviation",sum([abs(labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l]) for l in labelCountsGold]))

total absolute deviation 90


In [113]:
print("relative deviation per label:",\
      {l:round((labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l])/labelCountsGold[l],3) for l in labelCountsGold})

relative deviation per label: {'Neutral': -0.434, 'Irrelevant': -0.047, 'Negative': 0.158, 'Positive': -0.406}


The results of the data section mondkapje remain bad. This can be explained by two facts: 1. the distribution whithin the two data sections is quite different (antivac: mostly positive; mondkapje: mostly negative) and 2. the antivac data is much larger than the mondkapje data.