# Running CAMeL and FARASA NER models

In [None]:
from camel_tools.ner import NERecognizer
from camel_tools.tokenizers.word import simple_word_tokenize

### 1. Load the data

In [None]:
def read_annotated_data(path):
    """Read a token-per-line annotated file into parallel sentence lists.

    Every non-blank line is split on whitespace; the first field is taken
    as the token and the second as its label. Blank or whitespace-only
    lines mark sentence boundaries.

    Args:
        path: Path of the UTF-8 encoded annotated file.

    Returns:
        A ``(tokens, labels)`` tuple of lists of lists, where
        ``tokens[i][j]`` is the j-th token of the i-th sentence and
        ``labels[i][j]`` is its label. Empty sentences are never emitted.

    Raises:
        IndexError: If a non-blank line has fewer than two
            whitespace-separated fields.
    """
    tokens = []
    labels = []
    sent_tokens = []
    sent_labels = []

    # The context manager guarantees the file is closed, even when a
    # malformed line raises below.
    with open(path, encoding='utf-8') as reader:
        lines = reader.read().splitlines()

    for line in lines:
        if not line.strip():
            # Sentence boundary. Repeated blank lines (or a leading blank
            # line) must not produce empty sentences.
            if sent_tokens:
                tokens.append(sent_tokens)
                labels.append(sent_labels)
                sent_tokens = []
                sent_labels = []
            continue
        splits = line.split()
        sent_tokens.append(splits[0])
        sent_labels.append(splits[1])

    # Flush the last sentence when the file does not end with a blank line.
    if sent_tokens:
        tokens.append(sent_tokens)
        labels.append(sent_labels)

    return tokens, labels

### 2. Running NER models

#### CAMeL

In [None]:
def run_camel(tokens, model_name):
    """Tag every sentence with a CAMeL Tools NER model.

    Each sentence in ``tokens`` is joined with single spaces, re-tokenized
    with ``simple_word_tokenize`` and handed to
    ``NERecognizer.predict_sentence``.

    Args:
        tokens: Iterable of sentences, each a sequence of token strings.
        model_name: Model identifier passed to ``NERecognizer``.

    Returns:
        A ``(y_tkns, y_pred)`` tuple: the re-tokenized sentences and the
        value returned by ``predict_sentence`` for each of them, in input
        order.
    """
    recognizer = NERecognizer(model_name)

    sentences, predictions = [], []
    for words in tokens:
        # Predictions are made on the re-tokenized sentence, so the returned
        # tokens (not the input ones) are what line up with the predictions.
        sentence = simple_word_tokenize(" ".join(words))
        sentences.append(sentence)
        predictions.append(recognizer.predict_sentence(sentence))
    return sentences, predictions

#### FARASA 
https://farasa.qcri.org/NER/

`TODO`: request an API key through the FARASA website and set it as `api_key` in the cell below.

In [None]:
import json
import requests
# Endpoint that detect_ner() posts to.
url = 'https://farasa.qcri.org/webapi/ner/'
# Sent with every request by detect_ner(); replace the placeholder with a real key.
api_key = "<PUT API KEY HERE>"

def detect_ner(text, timeout=60):
    """Send ``text`` to the FARASA NER web API and return its result.

    Args:
        text: The sentence to tag, as a single string.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole run.

    Returns:
        The value of the ``'text'`` field of the JSON response, or the
        literal string ``"could not NER"`` when the service answers with
        exactly that body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    payload = {'text': text, 'api_key': api_key}
    response = requests.post(url, data=payload, timeout=timeout)
    # The service reports failure as a plain-text body rather than JSON, so
    # it has to be checked before attempting to parse.
    if response.content.decode() == "could not NER":
        return "could not NER"
    return json.loads(response.text)['text']

def run_farasa(tokens):
    """Tag every sentence with the FARASA NER web service.

    Each sentence is joined with single spaces and sent to ``detect_ner``.
    Every item of the result is parsed as ``token/label``.

    Args:
        tokens: Iterable of sentences, each a list of token strings.

    Returns:
        A ``(y_tkns, y_pred)`` tuple with one entry per input sentence, in
        input order. When the service cannot tag a sentence, its original
        tokens are returned with an all-``'O'`` prediction so the output
        stays aligned with the input.

    Raises:
        IndexError: If a result item contains no ``/`` separator.
    """
    y_pred = []
    y_tkns = []
    for token in tokens:
        res = detect_ner(" ".join(token))
        if res == "could not NER":
            print("could not NER: " + " ".join(token))
            # Keep the sentence in the output with a neutral prediction;
            # dropping it would shift every later sentence against the gold
            # data.
            y_pred.append(['O'] * len(token))
            y_tkns.append(list(token))
            continue

        pred = []
        tkns = []
        for p in res:
            # Split on the LAST slash only, so tokens that themselves
            # contain '/' (e.g. URLs) keep their text and real label.
            splits = p.rsplit('/', 1)
            tkns.append(splits[0])
            pred.append(splits[1])
        y_pred.append(pred)
        y_tkns.append(tkns)
    return y_tkns, y_pred

### 3. Write Predictions to files in BIO-like format

In [None]:
def remove_entities(labels):
    """Replace every label other than B-LOC, I-LOC and O with "O".

    The nested lists are modified in place.

    Args:
        labels: List of per-sentence label lists.

    Returns:
        The same ``labels`` object, after modification.
    """
    kept = ("B-LOC", "I-LOC", "O")
    for sentence in labels:
        for position, tag in enumerate(sentence):
            if tag in kept:
                continue
            sentence[position] = "O"
    return labels

In [None]:
def dump_predictions(output_path, tokens, labels):
    """Write tokens and labels to ``output_path``, one pair per line.

    Each line is ``token<TAB>label``; an empty line follows every sentence.

    Args:
        output_path: Destination file; created or overwritten, UTF-8.
        tokens: List of per-sentence token lists.
        labels: List of per-sentence label lists. The number of sentences
            and of lines per sentence is taken from ``labels``.

    Raises:
        IndexError: If ``tokens`` has fewer sentences than ``labels``, or a
            sentence has fewer tokens than labels.
    """
    # newline="" disables newline translation so "\n" is written verbatim;
    # the context manager flushes and closes the file.
    with open(output_path, 'w', encoding='utf-8', newline="") as writer:
        for i, sentence_labels in enumerate(labels):
            sentence_tokens = tokens[i]
            for j, label in enumerate(sentence_labels):
                writer.write(sentence_tokens[j] + "\t" + label + "\n")
            writer.write("\n")

## Driver Code

In [None]:
# Root of the IDRISI LMR data; replace the placeholder with the local path.
path = "<path to IDRISI data directory>\\IDRISI\\data\\LMR\\"
events = ["beirut_explosion_2020", "cairo_bombing_2019", "covid_2019", "dragon_storms_2020",
          "hafr_albatin_floods_2019", "jordan_floods_2018", "kuwait_floods_2018"]

# Selects the tagger used below; also becomes part of the output file name.
ner = 'FARASA' #or 'CaMEL'

# Candidate CAMeL model names (see the TODO in the loop); the loop does not
# read this list.
models = ['bert-base-arabic-camelbert-ca-ner', 'bert-base-arabic-camelbert-da-ner',
          'bert-base-arabic-camelbert-mix-ner', 'bert-base-arabic-camelbert-msa-ner']

for case in ['random', 'timebased']:
    for event in events:
        # The backslash before "gold" is escaped explicitly: a bare "\g" is
        # an invalid escape sequence and triggers a SyntaxWarning on recent
        # Python versions. The resulting path is unchanged.
        in_path = path + "AR\\gold-" + case + "-bilou\\" + event
        test_path = in_path + "\\dev.txt"
        x_true, y_true = read_annotated_data(test_path)
        # TODO: you can pick any of the model names in the `models` list above
        if ner == 'CaMEL':
            x_pred, y_pred = run_camel(x_true, "CAMeL-Lab/bert-base-arabic-camelbert-mix-ner")
            y_pred = remove_entities(y_pred) #to remove non-LOC entities
        else:
            x_pred, y_pred = run_farasa(x_true)

        # One prediction file per (case, event, tagger), written into the AR directory.
        out_path = path + "AR\\gold-" + case + "-" + event + "-" + ner + "-predictions.txt"
        dump_predictions(out_path, x_pred, y_pred)
