In [None]:
import os
import csv
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
import os
import json
import requests
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tagger.default import DefaultTagger

In [None]:
def read_tweets(path):
    """Read a token-per-line annotation file into per-tweet token/label lists.

    Each non-blank line is split on whitespace; the first field is taken as
    the token and the second as its label. A blank (empty or whitespace-only)
    line ends the current tweet.

    Args:
        path: Path of the UTF-8 encoded annotation file.

    Returns:
        A ``(tokens, labels)`` tuple of parallel lists; ``tokens[i]`` and
        ``labels[i]`` hold the tokens and labels of the i-th tweet.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    tokens = []
    labels = []
    t = []
    l = []

    # Use a context manager so the handle is closed even if parsing fails.
    with open(path, encoding='utf-8') as tweet_file:
        lines = tweet_file.read().splitlines()

    for line in lines:
        splits = line.split()
        # An empty split covers both '' and whitespace-only separator lines;
        # the latter used to crash on splits[0].
        if not splits:
            tokens.append(t)
            labels.append(l)
            t = []
            l = []
            continue
        t.append(splits[0])
        l.append(splits[1])

    # Flush the last tweet when the file does not end with a separator line.
    if len(t) > 0 and len(l) > 0:
        tokens.append(t)
        labels.append(l)

    return tokens, labels

In [None]:
# Long-form part-of-speech names mapped to short upper-case tags.
# tags: https://camel-tools.readthedocs.io/en/latest/reference/camel_morphology_features.html
# NOTE(review): tag_mapping is not referenced by any cell visible here.
tag_mapping = {
    "adj": "ADJ",
    "adposition": "ADP",
    "adverb": "ADV",
    "auxiliary": "AUX",
    "coordinating conjunction": "CCONJ",
    "determiner": "DET",
    "interjection": "INTJ",
    "noun": "NOUN",
    "numeral": "NUM",
    "particle": "PART",
    "pronoun": "PRON",
    "proper noun": "PROPN",
    "punc": "PUNCT",
    "subordinating conjunction": "SCONJ",
    "symbol": "SYM",
    "verb": "VERB",
    "other": "X",
}

# Pretrained MLE disambiguator wrapped in a tagger configured with the 'pos'
# feature; `tagger` is the module-level object used by get_ar_pos.
mled = MLEDisambiguator.pretrained()
tagger = DefaultTagger(mled, 'pos')
print(tagger.feature_list())

def get_ar_pos(tokens):
    """Tag every tweet with the module-level ``tagger``.

    Args:
        tokens: List of tweets, each a list of token strings.

    Returns:
        A list with one entry per tweet holding whatever ``tagger.tag``
        returned for that tweet's tokens.
    """
    tagged_tweets = []
    for sentence in tokens:
        sentence_tags = tagger.tag(sentence)
        # Only report a tag/token count mismatch; the result is kept as-is.
        if len(sentence_tags) != len(sentence):
            print("mismatch in length")
        tagged_tweets.append(sentence_tags)
    return tagged_tweets

def get_en_pos(tokens):
    """Tag every tweet with ``nltk.pos_tag``.

    Args:
        tokens: List of tweets, each a list of token strings.

    Returns:
        A list with one list of tag strings per tweet.
    """
    tagged_tweets = []
    for sentence in tokens:
        # Take the second element of each pair returned by pos_tag; a tag
        # containing newlines contributes one entry per line.
        sentence_tags = [
            piece
            for tagged in nltk.pos_tag(sentence)
            for piece in tagged[1].split("\n")
        ]
        # Only report a tag/token count mismatch; the result is kept as-is.
        if len(sentence_tags) != len(sentence):
            print("mismatch in length")
        tagged_tweets.append(sentence_tags)
    return tagged_tweets

In [None]:
def conver2crf(tokens, pos, labels, out_path):
    """Write tweets to a CSV file with one row per token.

    The file starts with the header ``Sentence #,Word,POS,Tag``. The first
    row of each tweet carries ``Sentence: <n>`` (1-based) in the first
    column; the remaining rows of that tweet leave it empty.

    Args:
        tokens: List of tweets, each a list of token strings.
        pos: Per-tweet tag lists, indexed in parallel with ``tokens``.
        labels: Per-tweet label lists, indexed in parallel with ``tokens``.
        out_path: Destination CSV path; an existing file is overwritten.
    """
    with open(out_path, mode='w', encoding="utf-8", newline="") as data_file:
        writer = csv.writer(data_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Sentence #","Word","POS","Tag"])
        for tweet_idx, words in enumerate(tokens):
            for word_idx, word in enumerate(words):
                # Only the first token of a tweet carries the sentence marker.
                marker = "Sentence: " + str(tweet_idx + 1) if word_idx == 0 else ""
                writer.writerow([marker, word, pos[tweet_idx][word_idx], labels[tweet_idx][word_idx]])

In [None]:
def convert2typeless(labels):
    """Collapse typed labels to a single ``LOC`` type.

    A label containing ``-`` keeps the part before its first ``-`` and gets
    ``-LOC`` appended (e.g. ``B-CITY`` becomes ``B-LOC``); a label without
    ``-`` is copied unchanged.

    Args:
        labels: List of tweets, each a list of label strings.

    Returns:
        A new list of lists with the same shape as ``labels``.
    """
    def _strip_type(label):
        # partition yields an empty separator when the label has no '-'.
        prefix, dash, _ = label.partition("-")
        return prefix + "-LOC" if dash else label

    return [[_strip_type(label) for label in tweet_labels] for tweet_labels in labels]

In [None]:
# Convert the Arabic (AR) BILOU files of every event into CRF-style CSV files,
# once with the original labels ('typefull') and once with labels collapsed by
# convert2typeless ('typeless').

# Replace the placeholder with the directory that contains the IDRISI folder.
# The trailing "" keeps a trailing separator on `path`, as in the original.
path = os.path.join("<path to IDRISI data directory>", "IDRISI", "data", "LMR", "")
events = ["beirut_explosion_2020", "cairo_bombing_2019", "covid_2019", "dragon_storms_2020",
          "hafr_albatin_floods_2019", "jordan_floods_2018", "kuwait_floods_2018"]

for case in ['random', 'timebased']:
    for event in events:
        # os.path.join avoids hard-coded backslashes and the invalid "\g" escape.
        in_path = os.path.join(path, "AR", "gold-" + case + "-bilou", event)
        for prt in ["train", "dev", "test"]:
            prt_in_path = os.path.join(in_path, prt + ".txt")
            # Read and POS-tag each split once; both label variants reuse the result.
            tokens, labels = read_tweets(prt_in_path)
            pos = get_ar_pos(tokens)
            for typ in ['typefull', 'typeless']:
                out_path = os.path.join(path, "AR", "gold-" + case + "-bilou-crf", typ, event)
                os.makedirs(out_path, exist_ok=True)
                typ_labels = convert2typeless(labels) if typ == 'typeless' else labels
                prt_out_path = os.path.join(out_path, prt + ".csv")
                conver2crf(tokens, pos, typ_labels, prt_out_path)


In [None]:
# Convert the English (EN) BILOU files of every event into CRF-style CSV files,
# once with the original labels ('typefull') and once with labels collapsed by
# convert2typeless ('typeless').

# Replace the placeholder with the directory that contains the IDRISI folder.
# The trailing "" keeps a trailing separator on `path`, as in the original.
path = os.path.join("<path to IDRISI data directory>", "IDRISI", "data", "LMR", "")
events = ["california_wildfires_2018", "canada_wildfires_2016", "cyclone_idai_2019", "ecuador_earthquake_2016", 
          "greece_wildfires_2018", "hurricane_dorian_2019", "hurricane_florence_2018", "hurricane_harvey_2017", 
          "hurricane_irma_2017", "hurricane_maria_2017", "hurricane_matthew_2016", "italy_earthquake_aug_2016", 
          "kaikoura_earthquake_2016", "kerala_floods_2018", "maryland_floods_2018", "midwestern_us_floods_2019", 
          "pakistan_earthquake_2019", "puebla_mexico_earthquake_2017", "srilanka_floods_2017"]

for case in ['random', 'timebased']:
    for event in events:
        # os.path.join avoids hard-coded backslashes and the invalid "\g" escape.
        in_path = os.path.join(path, "EN", "gold-" + case + "-bilou", event)
        for prt in ["train", "dev", "test"]:
            prt_in_path = os.path.join(in_path, prt + ".txt")
            # Read and POS-tag each split once; both label variants reuse the result.
            tokens, labels = read_tweets(prt_in_path)
            pos = get_en_pos(tokens)
            for typ in ['typefull', 'typeless']:
                out_path = os.path.join(path, "EN", "gold-" + case + "-bilou-crf", typ, event)
                os.makedirs(out_path, exist_ok=True)
                typ_labels = convert2typeless(labels) if typ == 'typeless' else labels
                prt_out_path = os.path.join(out_path, prt + ".csv")
                conver2crf(tokens, pos, typ_labels, prt_out_path)