# Fasttext tweet classification

## Data file creation

In [1]:
import os
import re

import pandas as pd
from nltk.tokenize import TweetTokenizer

In [2]:
# Topic keyword; the three file names below are derived from it.
TOPIC = "mondkapje"
# Input CSV with the tweets (read without a header row, first column as index).
FILETWEETS = "{}-tweets.csv".format(TOPIC)
# Input file with the human labels (read as space-separated, without a header row).
FILEANNOTATIONS = "human-labels-{}-tweets.txt".format(TOPIC)
# Output file holding one "<label> <user> <text>" line per tweet.
FILEFASTTEXT = "fasttext-{}.csv".format(TOPIC)
# Prefix that is prepended to every class label in the output file.
LABELPREFIX = "__label__"
# Keys of the per-tweet record dictionaries.
LABEL = "label"
TEXT = "text"
USER = "user"

In [3]:
# Tweets table: the file has no header row and its first column becomes the
# index, which the next cell uses to look tweets up by id.
tweets = pd.read_csv(
    FILETWEETS,
    index_col=0,
    header=None,
)
# Human annotations: space-separated fields, no header row, so the columns
# are labelled 0, 1, 2, ... by position.
annotations = pd.read_csv(
    FILEANNOTATIONS,
    sep=" ",
    header=None,
)

In [4]:
# Join the human annotations with the tweets and write one fastText line per
# unique tweet text, formatted as "<__label__X> <user> <tokenized text>".

tokenizer = TweetTokenizer()  # a single tokenizer instance serves all tweets
knownTweetIds = tweets.index  # Index membership is a lookup; no list is rebuilt per row

fasttextData = {}
# annotations was read without a header, so columns are labelled by position:
# column 2 is used as the tweet id and column 4 as the human label.
for tweetId, tweetLabel in zip(annotations[2], annotations[4]):
    if tweetId not in knownTweetIds:
        continue  # annotation for a tweet that is not in the tweets file
    # tweets columns are labelled by position too: 1 is used as the user
    # and 2 as the raw tweet text.
    tweetUser = tweets.loc[tweetId, 1]
    tweetText = " ".join(tokenizer.tokenize(tweets.loc[tweetId, 2]))
    # A tweet id that is annotated more than once keeps its last annotation.
    fasttextData[tweetId] = {LABEL: LABELPREFIX + tweetLabel, USER: tweetUser, TEXT: tweetText}

# Write each distinct tokenized text only once (the first occurrence wins).
# The context manager closes the file even when writing fails halfway.
seenTexts = set()
with open(FILEFASTTEXT, "w") as outFile:
    for tweetData in fasttextData.values():
        if tweetData[TEXT] in seenTexts:
            continue
        print(tweetData[LABEL], tweetData[USER], tweetData[TEXT], file=outFile)
        seenTexts.add(tweetData[TEXT])

## Fasttext run

In [5]:
import fasttext
from IPython.display import clear_output

In [6]:
# Directory that holds the pretrained vectors. It can be overridden with the
# FASTTEXT_PRETRAINED_DIR environment variable so the notebook also runs on
# other machines; the default is the original hard-coded location.
# os.path.join(..., "") guarantees a trailing separator, because the training
# cell concatenates this directory directly with WIKIFILENAME.
PRETRAINEDDIR = os.path.join(
    os.environ.get("FASTTEXT_PRETRAINED_DIR", "/home/erikt/projects/newsgac/fasttext-runs/"),
    "",
)
# File name of the pretrained vectors inside PRETRAINEDDIR.
WIKIFILENAME = "wiki.nl.vec"

In [7]:
def print_results(N, p, r, k=1):
    """Print an evaluation summary as three tab-separated lines.

    Parameters:
        N: number of evaluated examples.
        p: precision, printed with three decimals.
        r: recall, printed with three decimals.
        k: cut-off shown in the "P@k" / "R@k" row labels; defaults to 1,
           which reproduces the previously hard-coded labels.
    """
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(k, p))
    print("R@{}\t{:.3f}".format(k, r))

In [8]:
# Reload the fastText file as a list with one whitespace-stripped line per
# item. The context manager closes the file even if reading raises.
with open(FILEFASTTEXT, "r") as inFile:
    fasttextData = [line.strip() for line in inFile]

In [37]:
# N-fold cross-validation: every fold trains on all lines outside a contiguous
# test slice and evaluates on that slice.
DIM = 300        # vector size passed to fastText; presumably the dimension of the pretrained vectors (TODO confirm)
N = 10           # number of folds
TRAIN = "TRAIN"  # scratch file for the training lines, rewritten every fold
TEST = "TEST"    # scratch file for the test lines, rewritten every fold

results = []  # per fold: the values returned by model.test (cases, precision, recall as used by the summary cell)
labels = []   # per fold: the return value of model.predict for the test lines
for fold in range(0, N):
    clear_output(wait=True)
    print("starting fold", fold)
    # Rounded boundaries let the N slices cover the data without gaps or overlap.
    testStart = round(fold*len(fasttextData)/N)
    testEnd = round((fold+1)*len(fasttextData)/N)
    testData = fasttextData[testStart:testEnd]
    trainData = fasttextData[:testStart] + fasttextData[testEnd:]
    # Context managers make sure both files are flushed and closed before
    # fastText reads them, and also when writing fails.
    with open(TRAIN, "w") as trainFile:
        for line in trainData:
            print(line, file=trainFile)
    with open(TEST, "w") as testFile:
        for line in testData:
            print(line, file=testFile)
    model = fasttext.train_supervised(TRAIN,dim=DIM,epoch=100,pretrainedVectors=PRETRAINEDDIR+WIKIFILENAME)
    results.append([*model.test(TEST)])
    labels.append(model.predict(testData))
clear_output(wait=True)
print("finished")

finished


In [38]:
# Pool the per-fold scores: each fold's precision and recall is weighted by
# its number of test cases before dividing by the overall number of cases.
caseTotal = sum(foldResult[0] for foldResult in results)
pTotal = sum(foldResult[0] * foldResult[1] for foldResult in results)
rTotal = sum(foldResult[0] * foldResult[2] for foldResult in results)
print(
    "cases: {0}; precision: {1}; recall: {2}".format(
        caseTotal,
        round(pTotal / caseTotal, 3),
        round(rTotal / caseTotal, 3),
    )
)

cases: 593; precision: 0.663; recall: 0.663


In [35]:
# Tally the annotated labels: the label is the first whitespace-separated
# token of every fastText line.
labelCounts = {}
for line in fasttextData:
    label = line.split()[0]
    labelCounts[label] = labelCounts.get(label, 0) + 1

In [36]:
# Display the label distribution counted in the previous cell.
labelCounts

{'__label__NEUTRAL': 53,
 '__label__IRRELEVANT': 235,
 '__label__NEGATIVE': 273,
 '__label__POSITIVE': 32}

In [39]:
# Tally the predicted labels over all folds. The first element of each fold's
# prediction holds one sequence of labels per test line; only the first label
# of each sequence is counted.
labelCounts = {}
for foldPrediction in labels:
    for predicted in foldPrediction[0]:
        topLabel = predicted[0]
        labelCounts[topLabel] = labelCounts.get(topLabel, 0) + 1

In [40]:
# Display the label distribution counted in the previous cell.
labelCounts

{'__label__NEUTRAL': 30,
 '__label__NEGATIVE': 325,
 '__label__IRRELEVANT': 230,
 '__label__POSITIVE': 8}

Fasttext predicts 64% of the labels correctly without an external dictionary and 66% with a Wikipedia dictionary (baseline: 46%, the share of the most frequent label, NEGATIVE). It overestimates the number of negative labels and underestimates the number of positive and neutral labels. The number of irrelevant labels is about right.