# Double-check tweet language with TextCat

Source: http://www.let.rug.nl/~vannoord/TextCat/

In [2]:
import os
import pandas as pd
import pipes
import re
import tempfile
from nltk.classify import textcat
from IPython.display import clear_output
from langdetect import detect,DetectorFactory

DetectorFactory.seed = 0

In [19]:
# Directory that is listed and read for the input tweet files.
DATADIR = "/home/erikt/projects/puregome/data/text-202006/"
# Directory the language-annotated copies are written to (same file names).
DATADIRTEXTCAT = "/home/erikt/projects/puregome/data/textcat/"
# Column used as the DataFrame index when reading an input file.
IDSTR = "id_str"
# Column holding the tweet text that is passed to the language detectors.
TEXT = "text"
# TEXTCATDIR+TEXTCAT is the script that textcatLocal runs with PERL.
TEXTCATDIR = "/home/erikt/software/textcat/"
TEXTCAT = "text_cat"
PERL = "/usr/bin/perl"
# Exact output that textcatString2List maps to [UNKNOWN].
TEXTUNKNOWN = "I don't know; Perhaps this is a language I haven't seen before?"
# Label stored for tweets whose language could not be determined.
UNKNOWN = "unknown"
# Separator on which the text_cat output is split into candidate languages.
OR = " or "
# Label stored (and counted) for tweets recognised as Dutch.
DUTCH = "dutch"
# Name of the column added to the DataFrame with the detected language.
LANG = "lang"
# NOTE(review): OTHER and DUTCHTEXTCAT are not referenced in the visible
# cells -- confirm whether they are still needed.
OTHER = "other"
DUTCHTEXTCAT = "nld"

In [4]:
def squeal(text=None):
    """Clear the cell output and then print `text`, unless `text` is None.

    The output area is always cleared (clear_output is called with
    wait=True); nothing is printed when `text` is None.
    """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

In [5]:
def createPipe(command):
    """Return a new pipes.Template whose only step is `command`.

    The step is appended with kind "--", which the pipes module documents
    as a command that reads standard input and writes standard output.
    """
    template = pipes.Template()
    template.append(command, "--")
    return template

def writeAllPipeInput(pipe,text):
    """Feed `text` through `pipe` and return the name of the result file.

    A uniquely named temporary file is reserved, `pipe` is opened for
    writing with that file as its destination, and `text` is written to
    it. The file is not deleted here: the caller owns it and must remove
    it after reading.

    Parameters:
        pipe: object with an open(fileName, mode) method returning a
            writable, closable handle (createPipe returns a
            pipes.Template, which provides this).
        text: string written to the pipe.

    Returns:
        The path of the temporary file as a string.
    """
    # Only the unique name is needed; close our own handle immediately so
    # that no file descriptor is leaked on every call.
    with tempfile.NamedTemporaryFile(delete=False) as tmpFile:
        tmpFileName = tmpFile.name
    f = pipe.open(tmpFileName,"w")
    try:
        f.write(text)
    finally:
        # Always close the pipe handle, also when write() raises.
        f.close()
    return tmpFileName

def readPipeOutput(tmpFileName):
    """Return the complete contents of `tmpFileName` and delete the file.

    Parameters:
        tmpFileName: path of the file to read (as returned by
            writeAllPipeInput).

    Returns:
        The file contents as one string, line endings included.

    The file is removed even when reading fails, so no temporary file is
    left behind.
    """
    try:
        # One read() replaces the line-by-line string concatenation; the
        # with-block guarantees that the handle is closed.
        with open(tmpFileName,"r") as tmpFile:
            return tmpFile.read()
    finally:
        os.unlink(tmpFileName)

def textcatString2List(text):
    """Convert text_cat output into a list of language names.

    Returns [UNKNOWN] when `text` equals TEXTUNKNOWN exactly; otherwise
    returns `text` split on the OR separator.
    """
    if text != TEXTUNKNOWN:
        return text.split(OR)
    return [UNKNOWN]

def textcatLocal(textIn):
    """Run the local text_cat script on `textIn` and return its guesses.

    The command PERL followed by TEXTCATDIR+TEXTCAT is wrapped in a pipe,
    `textIn` is written to it, and the stripped output is converted to a
    list with textcatString2List.
    """
    command = " ".join([PERL, TEXTCATDIR+TEXTCAT])
    resultFileName = writeAllPipeInput(createPipe(command), textIn)
    rawOutput = readPipeOutput(resultFileName)
    return textcatString2List(rawOutput.strip())

## Language detection with TextCat

In [20]:
# Label the first MAXTWEETS tweets of every matching input file with the
# language reported by the local text_cat script, write the annotated copy
# to DATADIRTEXTCAT and print one summary line per file:
#   <tweets labelled> <tweets read> <fraction Dutch> <file name>
FILEPATTERN = "20200[6]..-07"
MAXTWEETS = 200

# NOTE(review): tc is not used anywhere in this cell -- confirm whether the
# NLTK TextCat object is still needed.
tc = textcat.TextCat()
inFiles = sorted(os.listdir(DATADIR))
# Only needed for the (currently disabled) skip-if-already-processed check.
outFiles = sorted(os.listdir(DATADIRTEXTCAT))
for inFileName in inFiles:
    # Skipping files already present in outFiles is disabled here, so
    # existing output files are overwritten.
    if not re.search(FILEPATTERN,inFileName): continue
    df = pd.read_csv(DATADIR+inFileName,index_col=IDSTR)[:MAXTWEETS]
    dutchCount = 0
    otherCount = 0
    lang = []
    # Cache per file: identical tweet texts are sent to text_cat only once,
    # because every call starts an external perl process.
    seen = {}
    for rawText in df[TEXT]:
        # The pattern matches a literal backslash followed by "n".
        text = re.sub(r"\\n"," ",rawText)
        if text not in seen:
            seen[text] = textcatLocal(text.lower())
        textcatOut = seen[text]
        if DUTCH in textcatOut:
            lang.append(DUTCH)
            dutchCount += 1
        elif UNKNOWN in textcatOut:
            # Unknown tweets are labelled but left out of the percentage.
            lang.append(UNKNOWN)
        else:
            lang.append(textcatOut[0])
            otherCount += 1
    # Computed once per file; stays 0 when no Dutch tweet was found, which
    # also covers files without any rows.
    dutchPercentage = 0
    if dutchCount > 0:
        dutchPercentage = round(dutchCount/(dutchCount+otherCount),3)
    df[LANG] = lang
    df.to_csv(DATADIRTEXTCAT+inFileName,compression="gzip")
    # len(lang) replaces the loop index, which was undefined (or left over
    # from the previous file) when a file contained no rows.
    print("{0} {1} {2:0.3f} {3}".format(len(lang),len(df),dutchPercentage,inFileName))

200 200 0.980 20200601-07.out.gz
200 200 0.956 20200602-07.out.gz
200 200 0.986 20200603-07.out.gz
200 200 0.960 20200604-07.out.gz
200 200 0.969 20200605-07.out.gz
200 200 0.979 20200606-07.out.gz
200 200 0.969 20200607-07.out.gz
200 200 0.954 20200608-07.out.gz
200 200 0.968 20200609-07.out.gz
200 200 0.922 20200610-07.out.gz
200 200 0.944 20200611-07.out.gz
200 200 0.908 20200612-07.out.gz
200 200 0.911 20200613-07.out.gz
200 200 0.902 20200614-07.out.gz
200 200 0.866 20200615-07.out.gz
200 200 0.901 20200616-07.out.gz
200 200 0.889 20200617-07.out.gz
200 200 0.854 20200618-07.out.gz
200 200 0.914 20200619-07.out.gz
200 200 0.873 20200620-07.out.gz
200 200 0.808 20200621-07.out.gz
200 200 0.844 20200622-07.out.gz
200 200 0.897 20200623-07.out.gz
200 200 0.894 20200624-07.out.gz
200 200 0.902 20200625-07.out.gz
200 200 0.928 20200626-07.out.gz
200 200 0.926 20200627-07.out.gz
200 200 0.941 20200628-07.out.gz
200 200 0.915 20200629-07.out.gz
200 200 0.858 20200630-07.out.gz


## Language detection with langdetect

In [None]:
# Label the first MAXTWEETS tweets of every matching, not yet processed
# input file with the language reported by langdetect, write the annotated
# copy to DATADIRTEXTCAT and print one summary line per file:
#   <tweets labelled> <tweets read> <fraction Dutch> <file name>
from langdetect.lang_detect_exception import LangDetectException

FILEPATTERN = "202006..-05"
MAXTWEETS = 200

# NOTE(review): tc is not used anywhere in this cell -- confirm whether the
# NLTK TextCat object is still needed.
tc = textcat.TextCat()
inFiles = sorted(os.listdir(DATADIR))
outFiles = sorted(os.listdir(DATADIRTEXTCAT))
for inFileName in inFiles:
    # Files that already have an output file are skipped.
    if not re.search(FILEPATTERN,inFileName) or inFileName in outFiles: continue
    df = pd.read_csv(DATADIR+inFileName,index_col=IDSTR)[:MAXTWEETS]
    dutchCount = 0
    otherCount = 0
    lang = []
    for rawText in df[TEXT]:
        # The pattern matches a literal backslash followed by "n".
        text = re.sub(r"\\n"," ",rawText)
        try:
            detected = detect(text.lower())
        except LangDetectException:
            # Every row must get a label: skipping the append made lang
            # shorter than df, so the column assignment below failed.
            # Failed detections are left out of the percentage.
            lang.append(UNKNOWN)
            continue
        if detected == "nl":
            lang.append(DUTCH)
            dutchCount += 1
        else:
            lang.append(detected)
            otherCount += 1
    # Computed once per file; stays 0 when no Dutch tweet was found, which
    # also covers files without any rows.
    dutchPercentage = 0
    if dutchCount > 0:
        dutchPercentage = round(dutchCount/(dutchCount+otherCount),3)
    df[LANG] = lang
    df.to_csv(DATADIRTEXTCAT+inFileName,compression="gzip")
    # len(lang) replaces the loop index, which was undefined (or left over
    # from the previous file) when a file contained no rows.
    print("{0} {1} {2:0.3f} {3}".format(len(lang),len(df),dutchPercentage,inFileName))