# Training a CRF-based LMR model for the IDRISI-R datasets

### 1. Load the data

In [None]:
import pandas as pd
import numpy as np
import SentenceGetter as SG
from sklearn_crfsuite import CRF
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

import scipy.stats
import nltk
import sklearn
from sklearn.metrics import make_scorer

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


In [None]:
def read_data(file_name):
    """Load a token-per-row CSV and group its rows into sentences.

    Args:
        file_name: Path to a UTF-8 encoded CSV file that contains at least
            the columns ``Word`` and ``Tag``.

    Returns:
        A ``(sentences, words, tags)`` tuple where ``sentences`` is the
        ``sentences`` attribute of the ``SentenceGetter`` built from the
        data, ``words`` is the list of distinct ``Word`` values and ``tags``
        is the list of distinct ``Tag`` values (both in arbitrary set order).
    """
    data = pd.read_csv(file_name, encoding="utf-8")
    # Forward-fill empty cells with the value from the row above.
    # DataFrame.ffill() is the supported spelling; fillna(method="ffill")
    # is deprecated in recent pandas releases.
    data = data.ffill()
    words = list(set(data["Word"].values))
    tags = list(set(data["Tag"].values))
    getter = SG.SentenceGetter(data)
    # NOTE(review): the returned triple is never used. The call is kept
    # because SentenceGetter is defined elsewhere and get_next() may have
    # side effects -- confirm and drop it if it does not.
    _sent, _pos, _tag = getter.get_next()
    return getter.sentences, words, tags


### 2. Feature preparation

In [None]:
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of token records; element 0 of each record is the
            word and element 1 is its POS tag.
        i: Index of the token inside ``sent``.

    Returns:
        A dict with a constant bias, the word's last three and last two
        characters, an is-digit flag, the POS tag and its two-character
        prefix, plus the POS tag (and prefix) of the previous and next
        tokens when they exist. ``'BOS'`` is set for the first token of the
        sentence and ``'EOS'`` for the last one.
    """
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    if i > 0:
        # Only the neighbour's POS tag is used as context, not its word.
        prev_postag = sent[i - 1][1]
        features.update({
            '-1:postag': prev_postag,
            '-1:postag[:2]': prev_postag[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent) - 1:
        next_postag = sent[i + 1][1]
        features.update({
            '+1:postag': next_postag,
            '+1:postag[:2]': next_postag[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    positions = range(len(sent))
    return [word2features(sent, position) for position in positions]

def sent2labels(sent):
    """Return the label of every token in ``sent``, in order.

    Each element of ``sent`` must unpack into exactly three items:
    ``(token, postag, label)``.
    """
    return [tag for _word, _pos, tag in sent]

def sent2tokens(sent):
    """Return the surface token of every element in ``sent``, in order.

    Each element of ``sent`` must unpack into exactly three items:
    ``(token, postag, label)``.
    """
    return [word for word, _pos, _tag in sent]


### 3. Train and test the CRF model

In [None]:
def train_crf(X, y, cv, algorithm, c1, c2, max_iterations):
    """Cross-validate and then fit a CRF model on the full training data.

    Args:
        X: Training sequences, passed unchanged to the CRF.
        y: Label sequences aligned with ``X``.
        cv: Cross-validation setting forwarded to ``cross_val_predict``.
        algorithm: Value for the CRF ``algorithm`` argument.
        c1: Value for the CRF ``c1`` argument.
        c2: Value for the CRF ``c2`` argument.
        max_iterations: Value for the CRF ``max_iterations`` argument.

    Returns:
        The CRF instance after ``fit(X, y)`` has been called on it.
    """
    model = CRF(
        algorithm=algorithm,
        c1=c1,
        c2=c2,
        max_iterations=max_iterations,
    )
    # NOTE(review): the cross-validated predictions are computed but never
    # inspected or returned -- confirm whether a report was intended here.
    cross_val_predict(estimator=model, X=X, y=y, cv=cv)
    model.fit(X, y)
    return model
    

In [None]:
def run_predict(crf, X_test):
    """Return ``crf.predict(X_test)``.

    Args:
        crf: Any object exposing a ``predict`` method.
        X_test: Input handed to ``predict`` unchanged.
    """
    predictions = crf.predict(X_test)
    return predictions

### 4. Write Predictions to files in BIO-like format

In [None]:
def dump_predictions(output_path, tokens, labels):
    """Write predicted labels to a tab-separated text file.

    Every token becomes one ``token<TAB>label`` line, and each sentence is
    followed by an empty line.

    Args:
        output_path: Destination file; created or overwritten, UTF-8 encoded.
        tokens: Sentences as sequences of 3-item records whose first item is
            the token text.
        labels: Per-sentence label sequences aligned with ``tokens``.

    Raises:
        IndexError: If ``labels`` has more sentences than ``tokens``, or a
            sentence has more labels than tokens.
    """
    # The context manager guarantees the file is flushed and closed even if
    # writing fails part-way; newline="" keeps "\n" untranslated.
    with open(output_path, 'w', encoding='utf-8', newline="") as writer:
        for i, sentence_labels in enumerate(labels):
            words = [word for word, _pos, _tag in tokens[i]]
            for j, label in enumerate(sentence_labels):
                writer.write(f"{words[j]}\t{label}\n")
            writer.write("\n")

## Driver Code

In [None]:
def run(event, train_path, test_path, out_path):
    """Train a CRF on one training file and write predictions for a test file.

    Args:
        event: Event name supplied by the caller; not used inside this
            function.
        train_path: CSV file read with ``read_data`` for training.
        test_path: CSV file read with ``read_data`` for prediction.
        out_path: File that receives the predictions via
            ``dump_predictions``.
    """
    # Only the sentences are needed; the word/tag vocabularies are ignored.
    sentences_tr, _words_tr, _tags_tr = read_data(train_path)
    sentences_te, _words_te, _tags_te = read_data(test_path)

    X_train = [sent2features(s) for s in sentences_tr]
    y_train = [sent2labels(s) for s in sentences_tr]
    X_test = [sent2features(s) for s in sentences_te]

    # Hyper-parameters the original author labelled "default parameters".
    crf = train_crf(
        X_train,
        y_train,
        cv=5,
        algorithm='lbfgs',
        c1=0,
        c2=1,
        max_iterations=100,
    )
    y_pred = run_predict(crf, X_test)

    dump_predictions(out_path, sentences_te, y_pred)
    


### `TODO` run the `prepare-data-4CRF` notebook before running this code


In [None]:
# Root of the IDRISI LMR data; replace with <path to IDRISI data directory>.
path = "C:\\Users\\QCRI-IMMRAN\\Desktop\\IDRISI\\data\\LMR\\"
events = ["beirut_explosion_2020", "cairo_bombing_2019", "covid_2019", "dragon_storms_2020",
          "hafr_albatin_floods_2019", "jordan_floods_2018", "kuwait_floods_2018"]

# Train and predict once per (type setting, split, event) combination.
for typ in ['typefull', 'typeless']:
    for case in ['random']:  # 'timebased' was disabled by the original author
        for event in events:
            # The backslash before "gold" is escaped explicitly: a bare "\g"
            # is an invalid escape sequence that newer Python versions warn
            # about. The resulting string is unchanged.
            in_path = path + "AR\\gold-" + case + "-bilou-crf\\" + typ + "\\" + event
            train_path = in_path + "\\train.csv"
            test_path = in_path + "\\dev.csv"
            # Predictions are written next to the event directory.
            out_path = in_path + "-predictions.txt"
            run(event, train_path, test_path, out_path)