# Experiments with the antivac data of Kunneman et al 2020

F. Kunneman, M. Lambooij, A. Wong, A. van den Bosch, L. Mollema. Monitoring stance towards vaccination in Twitter messages. In: BMC Medical Informatics and Decision Making. 20, 1, p. 1-14, 2020, doi 10.1186/s12911-020-1046-y. Data: http://cls.ru.nl/~fkunneman/data_stance_vaccination.zip

In [1]:
import fasttext
import json
import os
import pandas as pd
import random
import re
from nltk.tokenize import TweetTokenizer
from IPython.display import clear_output

In [2]:
DATADIR = "/home/erikt/projects/puregome/data/data_stance_vaccination/"
DATAFILE = "tweetids_labels.txt"
TWEETSFILE = "getTweetsId.py.out"
POLARITY = "Polarity"
SENTIMENT = "Sentiment"
IDSTR = "id_str"
TEXT = "text"
TOKENIZEDTEXT = "tokenizedtext"

In [3]:
def readAnnotations(inFileName):
    return(pd.read_csv(inFileName,sep="\t",index_col="Tweet ID"))
           
dfAnnotations = readAnnotations(DATADIR+DATAFILE)

In [4]:
TEXT = "text"
FULLTEXT = "full_text"
EXTENDEDTWEET = "extended_tweet"
RETWEETEDSTATUS = "retweeted_status"

def getTweetText(jsonData):
    text = ""
    if TEXT in jsonData: 
        text = jsonData[TEXT]
    if EXTENDEDTWEET in jsonData and \
       FULLTEXT in jsonData[EXTENDEDTWEET]:
        text = jsonData[EXTENDEDTWEET][FULLTEXT]
    if RETWEETEDSTATUS in jsonData and \
       EXTENDEDTWEET in jsonData[RETWEETEDSTATUS] and \
       FULLTEXT in jsonData[RETWEETEDSTATUS][EXTENDEDTWEET]:
        text = jsonData[RETWEETEDSTATUS][EXTENDEDTWEET][FULLTEXT]
    return(text)

In [5]:
def readTweetTexts(inFileName):
    tweetTexts = {}
    inFile = open(inFileName,"r")
    for line in inFile:
        jsonData = json.loads(line.strip())
        idStr = jsonData[IDSTR]
        text = getTweetText(jsonData)
        tweetTexts[idStr] = text
    inFile.close()
    return(tweetTexts)

dictTweetTexts = readTweetTexts(DATADIR+TWEETSFILE)

In [6]:
def makeFasttextData(dfAnnotations,dictTweetTexts):
    fasttextData = []
    for idStr in dictTweetTexts:
        polarity = re.sub(" ","_",dfAnnotations.loc[int(idStr)][POLARITY])
        tokenizedText = " ".join(TweetTokenizer().tokenize(dictTweetTexts[idStr])).lower()
        fasttextData.append({IDSTR:idStr,POLARITY:polarity,TOKENIZEDTEXT:tokenizedText})
    return(fasttextData)
        
fasttextData = makeFasttextData(dfAnnotations,dictTweetTexts)

In [7]:
PRETRAINEDDIR = "/home/erikt/projects/newsgac/fasttext-runs/"
WIKIFILENAME = "wiki.nl.vec"
DIM = 300
EPOCH = 100
LR = 0.1
N = 10
LARGEINT = 9999999999
LABELPREFIX = "__label__"
TRAIN = "TRAIN"+str(int(random.random()*LARGEINT))
TEST = "TEST"+str(int(random.random()*LARGEINT))

In [42]:
def runFasttext(fasttextData,dim=DIM,epoch=EPOCH,lr=LR,n=N):
    predictionCounts = []
    predictionLabels = []
    for fold in range(0,n):
        clear_output(wait=True)
        print("starting fold",fold)
        testStart = round(fold*len(fasttextData)/n)
        testEnd = round((fold+1)*len(fasttextData)/n)
        trainFile = open(TRAIN,"w")
        testFile = open(TEST,"w")
        testData = []
        for i in range(0,len(fasttextData)):
            data = LABELPREFIX+fasttextData[i][POLARITY]+" "+fasttextData[i][TOKENIZEDTEXT]
            if i < testStart or i >= testEnd: 
                print(data,file=trainFile)
            else: 
                print(data,file=testFile)
                testData.append(data)
        testFile.close()
        trainFile.close()
        model = fasttext.train_supervised(TRAIN,dim=dim,epoch=epoch,lr=lr) #,pretrainedVectors=PRETRAINEDDIR+WIKIFILENAME)
        predictionCounts.append([*model.test(TEST)])
        predictionLabels.extend(model.predict(testData)[0])
        os.unlink(TRAIN)
        os.unlink(TEST)
    clear_output(wait=True)
    print("finished")
    return(predictionCounts,predictionLabels)

In [84]:
predictionCounts,predictionLabels = runFasttext(fasttextData,dim=300,epoch=50)

finished


In [108]:
def showOverallPrecision(predictionCounts):
    caseTotal = 0
    pTotal = 0
    rTotal = 0
    for i in range(0,len(predictionCounts)):
        caseTotal += predictionCounts[i][0]
        pTotal += predictionCounts[i][0]*predictionCounts[i][1]
        rTotal += predictionCounts[i][0]*predictionCounts[i][2]
    print("cases: {0}; precision: {1}; recall: {2}".format(caseTotal,round(pTotal/caseTotal,3),round(rTotal/caseTotal,3)))
    
showOverallPrecision(predictionCounts)

cases: 6361; precision: 0.372; recall: 0.372


In [121]:
def countGoldLabels(fasttextData):
    labelCountsGold = {}
    for i in range(0,len(fasttextData)):
        label = fasttextData[i][POLARITY]
        if label in labelCountsGold: labelCountsGold[label] += 1
        else: labelCountsGold[label] = 1
    return(labelCountsGold)

labelCountsGold = countGoldLabels(fasttextData)
labelCountsGold

{'Positive': 2364,
 'Negative': 921,
 'Irrelevant': 716,
 'Neutral': 1061,
 'Not_clear': 706}

In [88]:
def getBaselineAccuracy(labelCountsGold):
    return(round(max(labelCountsGold.values())/sum(labelCountsGold.values()),3))

print("baseline accuracy:",getBaselineAccuracy(labelCountsGold))

baseline accuracy: 0.41


In [120]:
def countPredictedLabels(predictionLabels):
    labelCountsPredicted = {}
    for i in range(0,len(predictionLabels)):
        label = predictionLabels[i][0]
        if label in labelCountsPredicted: labelCountsPredicted[label] += 1
        else: labelCountsPredicted[label] = 1
    return(labelCountsPredicted)

labelCountsPredicted = countPredictedLabels(predictionLabels)
labelCountsPredicted

{'__label__Neutral': 1068,
 '__label__Negative': 946,
 '__label__Not_clear': 650,
 '__label__Irrelevant': 716,
 '__label__Positive': 2981}

In [90]:
def makeConfusionMatrix(fasttextData,predictionLabels):
    goldLabels = pd.Series([fasttextData[i][POLARITY] for i in range(0,len(fasttextData))])
    predictedLabels = pd.Series([predictionLabels[i][0] for i in range(0,len(predictionLabels))])
    return(pd.crosstab(goldLabels,predictedLabels))

makeConfusionMatrix(fasttextData,predictionLabels)

col_0,__label__Irrelevant,__label__Negative,__label__Neutral,__label__Not_clear,__label__Positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Irrelevant,192,94,111,84,235
Negative,72,224,144,88,393
Neutral,100,125,333,135,368
Not_clear,80,85,157,100,284
Positive,160,288,303,196,1417


In [91]:
print("total absolute deviation",sum([abs(labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l]) for l in labelCountsGold]))

total absolute deviation 666


In [92]:
print("relative deviation per label:",\
      {l:round(abs(labelCountsPredicted[LABELPREFIX+l]-labelCountsGold[l])/labelCountsGold[l],3) for l in labelCountsGold})

relative deviation per label: {'Positive': 0.141, 'Negative': 0.114, 'Irrelevant': 0.156, 'Neutral': 0.012, 'Not_clear': 0.146}


# Combine stance data sets with different topics

In [125]:
TOPIC = "mondkapje"
FILETWEETS = TOPIC+"-tweets.csv"
FILEANNOTATIONS = "human-labels-"+TOPIC+"-tweets.txt"
LABEL = "label"
TEXT = "text"
USER = "user"

In [126]:
def readAnnotations(inFileName):
    return(pd.read_csv(inFileName,header=None,sep=" "))

dfAnnotations = readAnnotations(FILEANNOTATIONS)

In [127]:
def readTweets(inFileName):
    return(pd.read_csv(inFileName,header=None,index_col=0))

dfTweets = readTweets(FILETWEETS)

In [128]:
def convertLabel(label):
    label = re.sub("NEUTRAL","Neutral",label)
    label = re.sub("POSITIVE","Positive",label)
    label = re.sub("NEGATIVE","Negative",label)
    label = re.sub("IRRELEVANT","Irrelevant",label)
    return(label)

def makeFasttextData(dfAnnotations,dfTweets):
    fasttextData = []
    seen = {}
    for i in range(0,len(dfAnnotations)):
        tweetId = dfAnnotations.iloc[i][2]
        if tweetId in list(dfTweets.index):
            tweetLabel = convertLabel(dfAnnotations.iloc[i][4])
            tweetUser = dfTweets.loc[tweetId][1]
            tokenizedText = " ".join(TweetTokenizer().tokenize(dfTweets.loc[tweetId][2])).lower()
            if not tokenizedText in seen:
                seen[tokenizedText] = True
                fasttextData.append({IDSTR:tweetId,POLARITY:tweetLabel,TOKENIZEDTEXT:tokenizedText})
    return(fasttextData)

fasttextDataTopic = makeFasttextData(dfAnnotations,dfTweets)

In [129]:
fasttextDataCombined = fasttextData+fasttextDataTopic

In [130]:
len(fasttextData),len(fasttextDataTopic),len(fasttextDataCombined)

(5768, 593, 6361)

In [131]:
predictionCounts,predictionLabels = runFasttext(fasttextDataCombined,dim=300,epoch=50)

finished


In [132]:
showOverallPrecision(predictionCounts)

cases: 6361; precision: 0.369; recall: 0.369


In [133]:
labelCountsGold = countGoldLabels(fasttextData)
labelCountsGold

{'Positive': 2364,
 'Negative': 921,
 'Irrelevant': 716,
 'Neutral': 1061,
 'Not_clear': 706}

In [134]:
print("baseline accuracy:",getBaselineAccuracy(labelCountsGold))

baseline accuracy: 0.41


In [135]:
labelCountsPredicted = countPredictedLabels(predictionLabels)
labelCountsPredicted

{'__label__Neutral': 1071,
 '__label__Negative': 938,
 '__label__Not_clear': 663,
 '__label__Irrelevant': 707,
 '__label__Positive': 2982}

In [138]:
makeConfusionMatrix(fasttextDataCombined[5768:],predictionLabels[5768:])

col_0,__label__Irrelevant,__label__Negative,__label__Neutral,__label__Not_clear,__label__Positive
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Irrelevant,38,44,21,35,98
Negative,19,54,21,32,146
Neutral,8,15,6,6,18
Positive,4,4,1,6,17


In [139]:
len(fasttextDataCombined)

6361

In [140]:
len(predictionLabels)

6361