In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report


import pandas as pd

from collections import Counter

from utils import transform_label, transform_probs, CLASS_MAP

In [9]:
# Load the English training split; the file is JSON Lines (one record per line).
df = pd.read_json("./data/processed/en/train.json", orient="records", lines=True)
df.head()

Unnamed: 0,docid,tokens,tags,labels
0,file1,"[1, dead,, 18, hurt, in, explosion, at, natura...","[B-CASUALTIES-ARG, I-CASUALTIES-ARG, I-CASUALT...",{'MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT': 5}
1,file10,"[4, killed, in, mishap., content:Abohar,, Febr...","[B-CASUALTIES-ARG, I-CASUALTIES-ARG, O, B-MAN_...",{'MAN_MADE_EVENT.VEHICULAR_COLLISION': 3}
2,file100,"[Srinagar, gunfight, ends,, 2, Lashkar, men, k...","[B-PLACE-ARG, B-MAN_MADE_EVENT.SHOOT_OUT, O, B...","{'MAN_MADE_EVENT.SHOOT_OUT': 10, 'MAN_MADE_EVE..."
3,file101,"[Magnitude, 6.5, earthquake, strikes, Nicaragu...","[O, B-MAGNITUDE-ARG, B-NATURAL_EVENT.EARTHQUAK...",{'NATURAL_EVENT.EARTHQUAKE': 2}
4,file102,"[BSF, head, constable, killed, in, cross-borde...","[B-CASUALTIES-ARG, I-CASUALTIES-ARG, I-CASUALT...","{'MAN_MADE_EVENT.SHOOT_OUT': 10, 'MAN_MADE_EVE..."


In [10]:
# Row/column count of the full training frame.
df.shape

(828, 4)

In [11]:
# Hold out 10% of the documents for validation; the fixed seed keeps the split repeatable.
df_train, df_valid = train_test_split(df, test_size=0.10, random_state=42)

In [12]:
# Confirm the sizes of the two splits.
df_train.shape, df_valid.shape

((745, 4), (83, 4))

In [13]:
# Sanity check: adding two Counters sums the counts key by key.
Counter({"a": 5}) + Counter({"a": 2, "b": 3})

Counter({'a': 7, 'b': 3})

In [14]:
# Corpus-wide label frequencies: turn every document's label dict into a
# Counter and add them all up, starting from an empty Counter.
sum((Counter(doc_labels) for doc_labels in df.labels), Counter())

Counter({'MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT': 120,
         'MAN_MADE_EVENT.VEHICULAR_COLLISION': 643,
         'MAN_MADE_EVENT.SHOOT_OUT': 341,
         'MAN_MADE_EVENT.TERRORIST_ATTACK': 67,
         'NATURAL_EVENT.EARTHQUAKE': 256,
         'MAN_MADE_EVENT.SURGICAL_STRIKES': 106,
         'MAN_MADE_EVENT.RIOTS': 15,
         'MAN_MADE_EVENT.NORMAL_BOMBING': 153,
         'MAN_MADE_EVENT.FIRE': 806,
         'MAN_MADE_EVENT.TRANSPORT_HAZARDS': 323,
         'MAN_MADE_EVENT.TRAIN_COLLISION': 109,
         'NATURAL_EVENT.FLOODS': 158,
         'NATURAL_EVENT.EPIDEMIC': 46,
         'NATURAL_EVENT.VOLCANO': 88,
         'NATURAL_EVENT.AVALANCHES': 33,
         'NATURAL_EVENT.LIMNIC_ERRUPTIONS': 2,
         'NATURAL_EVENT.TORNADO': 52,
         'MAN_MADE_EVENT.AVIATION_HAZARD': 78,
         'NATURAL_EVENT.COLD_WAVE': 23,
         'NATURAL_EVENT.LAND_SLIDE': 64,
         'NATURAL_EVENT.STORM': 104,
         'NATURAL_EVENT.HURRICANE': 35,
         'NATURAL_EVENT.CYCLONE': 87,
         'NA

In [15]:
# Expand each document's label dict into one column per event type, then keep
# the event type with the highest count in every row.
df.labels.apply(pd.Series).idxmax(axis=1)

0      MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT
1      MAN_MADE_EVENT.VEHICULAR_COLLISION
2                MAN_MADE_EVENT.SHOOT_OUT
3                NATURAL_EVENT.EARTHQUAKE
4                MAN_MADE_EVENT.SHOOT_OUT
                      ...                
823                 NATURAL_EVENT.DROUGHT
824        MAN_MADE_EVENT.TRAIN_COLLISION
825        MAN_MADE_EVENT.TRAIN_COLLISION
826               NATURAL_EVENT.HEAT_WAVE
827              NATURAL_EVENT.AVALANCHES
Length: 828, dtype: object

In [16]:
# How many distinct event types each document carries: expand the label dicts
# into columns, count the non-missing cells per row, then tally those counts.
df.labels.apply(pd.Series).notnull().sum(axis=1).value_counts()

1    672
2    118
3     30
4      8
dtype: int64

In [17]:
# Inputs are the pre-tokenised documents; the target is each document's
# dominant event type (the label key with the highest count).
X_train = df_train["tokens"]
y_train = df_train.labels.apply(pd.Series).idxmax(axis=1)

X_valid = df_valid["tokens"]
y_valid = df_valid.labels.apply(pd.Series).idxmax(axis=1)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [19]:
clf = Pipeline([
    # Documents are already lists of tokens, so both the preprocessor and the
    # tokenizer are identity functions: TF-IDF is computed on the tokens as given.
    ("tfidf", TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x)),
    # Seed the forest (same seed as the train/validation split) so that
    # re-running the notebook reproduces the same model and the same reports.
    ("model", RandomForestClassifier(
        random_state=42,
        #min_samples_split=4, min_samples_leaf=4
    ))
])
clf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=<function <lambda> at 0x00000152D56A6C80>,
                                 smooth_idf=True, stop_words=None,
                                 strip_accents=None,...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0

In [20]:
# Per-event-type report on the training split itself.
y_pred = clf.predict(X_train)
print(classification_report(y_true=y_train, y_pred=y_pred))

                                    precision    recall  f1-score   support

          MAN_MADE_EVENT.ACCIDENTS       0.96      0.99      0.98       132
    MAN_MADE_EVENT.ARMED_CONFLICTS       1.00      1.00      1.00         1
    MAN_MADE_EVENT.AVIATION_HAZARD       1.00      1.00      1.00        10
              MAN_MADE_EVENT.CRIME       1.00      1.00      1.00        18
               MAN_MADE_EVENT.FIRE       0.98      1.00      0.99       102
MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT       1.00      1.00      1.00        14
      MAN_MADE_EVENT.MISCELLANEOUS       1.00      0.91      0.95        11
     MAN_MADE_EVENT.NORMAL_BOMBING       1.00      1.00      1.00        19
              MAN_MADE_EVENT.RIOTS       1.00      1.00      1.00         1
          MAN_MADE_EVENT.SHOOT_OUT       1.00      0.98      0.99        50
     MAN_MADE_EVENT.SUICIDE_ATTACK       1.00      1.00      1.00        10
   MAN_MADE_EVENT.SURGICAL_STRIKES       1.00      1.00      1.00        12
   MAN_MADE

In [21]:
# Per-event-type report on the held-out validation split.
y_pred = clf.predict(X_valid)
print(classification_report(y_true=y_valid, y_pred=y_pred))

                                    precision    recall  f1-score   support

          MAN_MADE_EVENT.ACCIDENTS       0.26      0.62      0.36        13
    MAN_MADE_EVENT.AVIATION_HAZARD       0.00      0.00      0.00         3
              MAN_MADE_EVENT.CRIME       0.00      0.00      0.00         2
               MAN_MADE_EVENT.FIRE       0.71      1.00      0.83        12
MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT       0.50      0.50      0.50         2
      MAN_MADE_EVENT.MISCELLANEOUS       1.00      0.50      0.67         2
     MAN_MADE_EVENT.NORMAL_BOMBING       0.00      0.00      0.00         3
              MAN_MADE_EVENT.RIOTS       0.00      0.00      0.00         1
          MAN_MADE_EVENT.SHOOT_OUT       0.67      1.00      0.80         2
     MAN_MADE_EVENT.SUICIDE_ATTACK       0.00      0.00      0.00         1
   MAN_MADE_EVENT.SURGICAL_STRIKES       0.00      0.00      0.00         2
    MAN_MADE_EVENT.TRAIN_COLLISION       0.75      1.00      0.86         3
  MAN_MADE_

  'precision', 'predicted', average, warn_for)


In [22]:
# Training-split report after passing both label sets through utils.transform_label.
# NOTE(review): judging by the report, transform_label collapses every label to
# its top-level class (MAN_MADE_EVENT / NATURAL_EVENT) — confirm in utils.
y_pred = clf.predict(X_train)
print(classification_report(y_true=transform_label(y_train), y_pred=transform_label(y_pred)))

                precision    recall  f1-score   support

MAN_MADE_EVENT       1.00      1.00      1.00       577
 NATURAL_EVENT       1.00      0.99      1.00       168

      accuracy                           1.00       745
     macro avg       1.00      1.00      1.00       745
  weighted avg       1.00      1.00      1.00       745



In [23]:
# Validation-split report after passing both label sets through utils.transform_label.
# NOTE(review): judging by the report, transform_label collapses every label to
# its top-level class (MAN_MADE_EVENT / NATURAL_EVENT) — confirm in utils.
y_pred = clf.predict(X_valid)
print(classification_report(y_true=transform_label(y_valid), y_pred=transform_label(y_pred)))

                precision    recall  f1-score   support

MAN_MADE_EVENT       0.90      1.00      0.95        64
 NATURAL_EVENT       1.00      0.63      0.77        19

      accuracy                           0.92        83
     macro avg       0.95      0.82      0.86        83
  weighted avg       0.92      0.92      0.91        83



In [24]:
# Lookup from every fine-grained class the forest knows to its top-level
# category, i.e. the text in front of the first ".".
clf_map = dict(
    (class_name, class_name.partition(".")[0])
    for class_name in clf.classes_
)

In [25]:
# NOTE(review): the predict_proba result is neither stored nor displayed; only
# the last expression (the classes known to the fitted pipeline) is shown.
clf.predict_proba(X_valid)
clf.classes_

array(['MAN_MADE_EVENT.ACCIDENTS', 'MAN_MADE_EVENT.ARMED_CONFLICTS',
       'MAN_MADE_EVENT.AVIATION_HAZARD', 'MAN_MADE_EVENT.CRIME',
       'MAN_MADE_EVENT.FIRE', 'MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT',
       'MAN_MADE_EVENT.MISCELLANEOUS', 'MAN_MADE_EVENT.NORMAL_BOMBING',
       'MAN_MADE_EVENT.RIOTS', 'MAN_MADE_EVENT.SHOOT_OUT',
       'MAN_MADE_EVENT.SUICIDE_ATTACK', 'MAN_MADE_EVENT.SURGICAL_STRIKES',
       'MAN_MADE_EVENT.TERRORIST_ATTACK',
       'MAN_MADE_EVENT.TRAIN_COLLISION',
       'MAN_MADE_EVENT.TRANSPORT_HAZARDS',
       'MAN_MADE_EVENT.VEHICULAR_COLLISION', 'NATURAL_EVENT.AVALANCHES',
       'NATURAL_EVENT.BLIZZARD', 'NATURAL_EVENT.COLD_WAVE',
       'NATURAL_EVENT.CYCLONE', 'NATURAL_EVENT.EARTHQUAKE',
       'NATURAL_EVENT.EPIDEMIC', 'NATURAL_EVENT.FLOODS',
       'NATURAL_EVENT.FOREST_FIRE', 'NATURAL_EVENT.HAIL_STORMS',
       'NATURAL_EVENT.HEAT_WAVE', 'NATURAL_EVENT.HURRICANE',
       'NATURAL_EVENT.LAND_SLIDE', 'NATURAL_EVENT.STORM',
       'NATURAL_EVENT.TORNADO', 

In [26]:
# Training-split report built from class probabilities instead of hard predictions.
# Despite its name, y_pred holds the predict_proba output here.
# NOTE(review): transform_probs comes from utils; presumably it aggregates the
# per-class probabilities into a top-level prediction — confirm there.
y_pred = clf.predict_proba(X_train)
print(classification_report(y_true=transform_label(y_train), y_pred=transform_probs(y_pred, clf)))

                precision    recall  f1-score   support

MAN_MADE_EVENT       0.99      1.00      1.00       577
 NATURAL_EVENT       1.00      0.98      0.99       168

      accuracy                           0.99       745
     macro avg       1.00      0.99      0.99       745
  weighted avg       0.99      0.99      0.99       745



In [27]:
# Validation-split report built from class probabilities instead of hard predictions.
# Despite its name, y_pred holds the predict_proba output here.
# NOTE(review): transform_probs comes from utils; presumably it aggregates the
# per-class probabilities into a top-level prediction — confirm there.
y_pred = clf.predict_proba(X_valid)
print(classification_report(y_true=transform_label(y_valid), y_pred=transform_probs(y_pred, clf)))

                precision    recall  f1-score   support

MAN_MADE_EVENT       0.93      1.00      0.96        64
 NATURAL_EVENT       1.00      0.74      0.85        19

      accuracy                           0.94        83
     macro avg       0.96      0.87      0.91        83
  weighted avg       0.94      0.94      0.94        83



In [28]:
import sklearn_crfsuite


In [64]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of per-token tuples whose first element is the token
            string. Only element ``[0]`` is read; whatever follows it (the
            BIO tag when called through ``df_to_Xy``) is ignored, so the gold
            label never reaches the features and token-only tuples work too.
        i: Index of the token to describe.

    Returns:
        dict: Features of the token itself (lower-cased form, 2- and
        3-character suffixes, upper/title/digit flags) plus the lower-cased
        form and case flags of the previous and the next token. ``'BOS'`` /
        ``'EOS'`` are set instead when the token is the first / last one.
    """
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }

    if i > 0:
        # Left context; only the token text is used.
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        # Right context; only the token text is used.
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    """Return the list of feature dicts of ``sent``, one per token position."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the label half of every ``(token, label)`` pair in ``sent``."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token half of every ``(token, label)`` pair in ``sent``."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens


def df_to_Xy(df):
    """Convert a frame with ``tokens`` and ``tags`` columns into CRF inputs.

    Returns:
        tuple: ``(X, y)`` where ``X[k]`` is the list of feature dicts and
        ``y[k]`` the list of tags of the k-th row of ``df``.
    """
    X, y = [], []
    for _, row in df.iterrows():
        # Pair each token with its tag: the (token, tag) shape the helpers expect.
        sentence = list(zip(row["tokens"], row["tags"]))
        X.append(sent2features(sentence))
        y.append(sent2labels(sentence))
    return X, y

In [42]:
# Peek at the tokens and tags of the row labelled 0.
# NOTE(review): .loc looks up index label 0, which exists in df_train only if
# that row landed in the training split; .iloc[0] would not depend on the split.
df_train.loc[0, ["tokens", "tags"]]

tokens    [1, dead,, 18, hurt, in, explosion, at, natura...
tags      [B-CASUALTIES-ARG, I-CASUALTIES-ARG, I-CASUALT...
Name: 0, dtype: object

In [43]:
# Pair every token of row 0 with its tag — the (token, tag) shape the feature helpers take.
list(zip(*df_train.loc[0, ["tokens", "tags"]]))

[('1', 'B-CASUALTIES-ARG'),
 ('dead,', 'I-CASUALTIES-ARG'),
 ('18', 'I-CASUALTIES-ARG'),
 ('hurt', 'I-CASUALTIES-ARG'),
 ('in', 'O'),
 ('explosion', 'B-MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT'),
 ('at', 'O'),
 ('natural', 'B-PLACE-ARG'),
 ('gas', 'I-PLACE-ARG'),
 ('plant', 'I-PLACE-ARG'),
 ('An', 'O'),
 ('explosion', 'B-MAN_MADE_EVENT.INDUSTRIAL_ACCIDENT'),
 ('on', 'O'),
 ('Tuesday', 'B-TIME-ARG'),
 ('at', 'O'),
 ('a', 'O'),
 ('natural', 'B-PLACE-ARG'),
 ('gas', 'I-PLACE-ARG'),
 ('facility', 'I-PLACE-ARG'),
 ('near', 'I-PLACE-ARG'),
 ('Austria’s', 'I-PLACE-ARG'),
 ('border', 'I-PLACE-ARG'),
 ('with', 'I-PLACE-ARG'),
 ('Slovakia', 'I-PLACE-ARG'),
 ('left', 'O'),
 ('one', 'B-CASUALTIES-ARG'),
 ('person', 'I-CASUALTIES-ARG'),
 ('dead,', 'I-CASUALTIES-ARG'),
 ('authorities', 'O'),
 ('said.', 'O'),
 ('A', 'O'),
 ('further', 'O'),
 ('18', 'B-CASUALTIES-ARG'),
 ('people', 'I-CASUALTIES-ARG'),
 ('were', 'I-CASUALTIES-ARG'),
 ('injured', 'I-CASUALTIES-ARG'),
 ('in', 'B-TIME-ARG'),
 ('the', 'I-TIME-A

In [44]:
# Debug aid: print the first training row and its (token, tag) pairs, then stop.
for i, row in df_train.iterrows():
    print(row)
    sent = list(zip(*row[["tokens", "tags"]]))
    print(sent)
    break

docid                                               file290
tokens    [10, die, in, Egypt, church, attack., Ten, per...
tags      [B-CASUALTIES-ARG, I-CASUALTIES-ARG, O, B-PLAC...
labels               {'MAN_MADE_EVENT.TERRORIST_ATTACK': 7}
Name: 212, dtype: object
[('10', 'B-CASUALTIES-ARG'), ('die', 'I-CASUALTIES-ARG'), ('in', 'O'), ('Egypt', 'B-PLACE-ARG'), ('church', 'I-PLACE-ARG'), ('attack.', 'B-MAN_MADE_EVENT.TERRORIST_ATTACK'), ('Ten', 'B-CASUALTIES-ARG'), ('persons', 'I-CASUALTIES-ARG'), ('were', 'I-CASUALTIES-ARG'), ('killed', 'I-CASUALTIES-ARG'), ('and', 'I-CASUALTIES-ARG'), ('eight', 'I-CASUALTIES-ARG'), ('others', 'I-CASUALTIES-ARG'), ('injured', 'I-CASUALTIES-ARG'), ('in', 'O'), ('a', 'O'), ('terrorist', 'B-MAN_MADE_EVENT.TERRORIST_ATTACK'), ('attack', 'I-MAN_MADE_EVENT.TERRORIST_ATTACK'), ('on', 'O'), ('a', 'O'), ('church', 'B-PLACE-ARG'), ('in', 'I-PLACE-ARG'), ('Egypt’s', 'I-PLACE-ARG'), ('capital', 'I-PLACE-ARG'), ('today,', 'B-TIME-ARG'), ('the', 'O'), ('government', 

In [45]:
# Feature dict of the first token of row 0.
sent2features(list(zip(*df_train.loc[0, ["tokens", "tags"]])))[0]

{'bias': 1.0,
 'word.lower()': '1',
 'word[-3:]': '1',
 'word[-2:]': '1',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': True,
 'BOS': True,
 '+1:word.lower()': 'dead,',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False}

In [65]:
%%time
# Build per-token feature dicts and tag sequences for the CRF.
# NOTE: this re-binds X_train / y_train / X_valid / y_valid, which earlier cells
# used for the random-forest document classifier.
X_train, y_train = df_to_Xy(df_train)
X_valid, y_valid = df_to_Xy(df_valid)

Wall time: 1.18 s


In [54]:
%%time
# Fit the sequence tagger on the training documents.
# 'ap' selects the averaged-perceptron trainer of sklearn-crfsuite.
crf = sklearn_crfsuite.CRF(
    algorithm='ap',
    max_iterations=100,
    all_possible_transitions=False
)
crf.fit(X_train, y_train)

Wall time: 3min 51s


CRF(algorithm='ap', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [66]:
# Predicted tag sequence for every validation document.
y_pred = crf.predict(X_valid)

In [67]:
# NOTE(review): this displays every predicted tag of every validation document;
# a slice such as y_pred[0][:20] would keep the output readable.
y_pred

[['O',
  'O',
  'O',
  'B-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'O',
  'B-PLACE-ARG',
  'B-NATURAL_EVENT.LAND_SLIDE',
  'B-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'O',
  'O',
  'B-NATURAL_EVENT.LAND_SLIDE',
  'O',
  'B-PLACE-ARG',
  'I-PLACE-ARG',
  'I-PLACE-ARG',
  'I-PLACE-ARG',
  'O',
  'B-TIME-ARG',
  'I-TIME-ARG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-NATURAL_EVENT.LAND_SLIDE',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MAN_MADE_EVENT.MISCELLANEOUS',
  'O',
  'O',
  'O',
  'O',
  'B-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG',
  'I-CASUALTIES-ARG'

In [57]:
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [68]:
# Score the tagger on every tag except 'O', presumably so the frequent
# outside-of-entity tag does not dominate the weighted F1.
labels = list(crf.classes_)
labels.remove('O')
#labels.remove('P')

metrics.flat_f1_score(y_valid, y_pred,
                      average='weighted', labels=labels)

0.4479497315700026

In [69]:
# Group the B- and I- results: sort by the tag text after its first character
# (the entity name), then by the B/I prefix, so both rows of an entity are adjacent.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_valid, y_pred, labels=sorted_labels, digits=3
))

                                      precision    recall  f1-score   support

                 B-AFTER_EFFECTS-ARG      0.500     0.032     0.061        31
                 I-AFTER_EFFECTS-ARG      1.000     0.021     0.042       283
                    B-CASUALTIES-ARG      0.635     0.506     0.563       261
                    I-CASUALTIES-ARG      0.834     0.480     0.609      1514
                         B-DEPTH-ARG      0.333     0.167     0.222         6
                         I-DEPTH-ARG      0.800     0.200     0.320        20
                     B-EPICENTRE-ARG      1.000     0.333     0.500         3
                     I-EPICENTRE-ARG      1.000     0.160     0.276        25
                     B-INTENSITY-ARG      1.000     0.500     0.667         6
                     I-INTENSITY-ARG      0.000     0.000     0.000         4
                     B-MAGNITUDE-ARG      0.500     0.400     0.444        10
                     I-MAGNITUDE-ARG      0.333     0.500     0

In [63]:
# Load the English test split (JSON Lines, one record per line).
df_test = pd.read_json("./data/processed/en/test.json", orient="records", lines=True)

In [71]:
import xml.etree.ElementTree as ET
from utils import get_entity_info

In [75]:
# Turn the predicted tags of the first validation document back into entity spans.
# get_entity_info comes from utils; the displayed output shows the dict layout it returns.
entity_info = get_entity_info(y_pred[0], df_valid.iloc[0]["tokens"])
entity_info

[{'tokens': ['5', 'children', 'killed'],
  'label': 'CASUALTIES-ARG',
  'start': 3,
  'end': 6,
  'entity_phrase': ' 5   children   killed '},
 {'tokens': ['Balochistan'],
  'label': 'PLACE-ARG',
  'start': 7,
  'end': 8,
  'entity_phrase': ' Balochistan '},
 {'tokens': ['landslide'],
  'label': 'NATURAL_EVENT.LAND_SLIDE',
  'start': 8,
  'end': 9,
  'entity_phrase': ' landslide '},
 {'tokens': ['A',
   'woman',
   'and',
   'her',
   'five',
   'children',
   'lost',
   'their',
   'lives'],
  'label': 'CASUALTIES-ARG',
  'start': 9,
  'end': 18,
  'entity_phrase': ' A   woman   and   her   five   children   lost   their   lives '},
 {'tokens': ['landslide'],
  'label': 'NATURAL_EVENT.LAND_SLIDE',
  'start': 20,
  'end': 21,
  'entity_phrase': ' landslide '},
 {'tokens': ['Harnai', 'town', 'of', 'Balochistan'],
  'label': 'PLACE-ARG',
  'start': 22,
  'end': 26,
  'entity_phrase': ' Harnai   town   of   Balochistan '},
 {'tokens': ['Sunday', 'night.'],
  'label': 'TIME-ARG',
  'start'

In [104]:
def generate_xml(entity_info, docid, simple=False):
    """Build the ``<DOCUMENT>`` XML element describing one document's entities.

    Args:
        entity_info: Iterable of dicts that each carry at least a ``"label"``
            and an ``"entity_phrase"`` key.
        docid: Document identifier; written as the text of ``<DOCID>``.
        simple: When true, only labels starting with one of the ``CLASS_MAP``
            keys are emitted and every other entity is skipped.

    Returns:
        xml.etree.ElementTree.Element: The ``DOCUMENT`` root. It holds a
        ``DOCID`` child followed by one child per emitted entity, in input
        order. Labels matching a ``CLASS_MAP`` key are tagged with the mapped
        class name and carry the part after the first ``"."`` (if any) as the
        ``TYPE`` attribute; ``*-ARG`` labels are tagged with the text before
        ``-ARG`` and have no ``TYPE``.

    Raises:
        KeyError: If the part of a matching label before its first ``"."`` is
            not itself a key of ``CLASS_MAP``.
    """
    root = ET.Element("DOCUMENT")
    ET.SubElement(root, "DOCID").text = f"{docid}"
    # The prefix tuple does not change between entities, so build it once.
    event_prefixes = tuple(CLASS_MAP.keys())
    for info in entity_info:
        label = info["label"]
        subtype = None
        if label.startswith(event_prefixes):
            # partition() copes with labels that have no "." (no subtype) or
            # several; unpacking split(".") raised ValueError in both cases.
            label, _, subtype = label.partition(".")
            label = CLASS_MAP[label]
        elif simple:
            continue
        if label.endswith("-ARG"):
            label = label.split("-ARG")[0]
            subtype = None
        elem = ET.SubElement(root, label.upper())
        if subtype:
            elem.attrib["TYPE"] = subtype.upper()
        elem.text = info["entity_phrase"]
    return root

In [111]:
# Build the event-only (simple=True) XML for the sample entities, using document id 124.
root = generate_xml(entity_info, 124, simple=True)

In [122]:
# Serialise the element to bytes to inspect the generated markup.
ET.tostring(root)

b'<DOCUMENT><DOCID>124</DOCID><NATURAL_EVENT TYPE="LAND_SLIDE"> landslide </NATURAL_EVENT><NATURAL_EVENT TYPE="LAND_SLIDE"> landslide </NATURAL_EVENT><NATURAL_EVENT TYPE="LAND_SLIDE"> landslide </NATURAL_EVENT><MAN_MADE_EVENT TYPE="MISCELLANEOUS"> fell </MAN_MADE_EVENT><MAN_MADE_EVENT TYPE="FIRE"> incident </MAN_MADE_EVENT></DOCUMENT>'

In [108]:
# Features for the test split.
# NOTE(review): y_test is not used by any of the cells shown here.
X_test, y_test = df_to_Xy(df_test)

In [110]:
# Tag the test documents. Note: this re-binds y_pred, which held the validation predictions.
y_pred = crf.predict(X_test)

In [113]:
from pathlib import Path

In [133]:
# Root folder of the generated submission files.
base_path = Path("./data/3Idiots/")
def create_files(df_test, lang):
    """Tag every document of ``df_test`` and write its Task 1 / Task 2 XML files.

    Uses the notebook-level ``crf`` model, so ``crf`` must already be fitted
    for ``lang`` when this is called.

    Args:
        df_test: Frame with ``docid`` and ``tokens`` columns, plus whatever
            ``df_to_Xy`` needs to build the features.
        lang: Language code; used as the sub-folder name below each task folder.

    Side effects:
        Writes ``<base_path>/Task_<n>/<lang>/<docid>.xml`` for n in (1, 2),
        creating the folder first when it does not exist yet.
    """
    X_test, _ = df_to_Xy(df_test)
    y_pred = crf.predict(X_test)
    for pred, (_, row) in zip(y_pred, df_test.iterrows()):
        entity_info = get_entity_info(pred, row["tokens"])
        docid = row["docid"]
        # Task 1 receives the event-only ("simple") XML, Task 2 the full one.
        for task, simple in enumerate([True, False], start=1):
            root = generate_xml(entity_info, docid, simple=simple)
            out_path = base_path / f"Task_{task}" / lang / f"{docid}.xml"
            # Create the target folder on demand so that writing does not
            # depend on a separate directory-creation cell having run first.
            out_path.parent.mkdir(parents=True, exist_ok=True)
            ET.ElementTree(root).write(out_path, encoding="utf-8")

In [124]:
# Write the XML files for the English test split.
lang = "en"
create_files(df_test, lang)

In [127]:
# Pre-create the output folder of every task/language pair.
for lang in ["en", "bn", "tm", "ma", "hn"]:
    for task in range(1, 3):
        out_path = base_path / f"Task_{task}" / lang
        # parents=True also creates base_path and the Task_<n> folder when they
        # are missing; without it mkdir raises FileNotFoundError on a fresh tree.
        out_path.mkdir(parents=True, exist_ok=True)

In [136]:
# Scratch check: slice out the first two language codes.
["en", "bn", "tm", "ma", "hn"][:2]

['en', 'bn']

In [137]:
# For every language: train a fresh CRF on that language's training split and
# write the XML files for its test split.
# NOTE: the loop re-binds the notebook-level df_train, X_train, y_train, crf and
# df_test; create_files reads the freshly fitted crf through that global name.
for lang in ["en", "bn", "tm", "ma", "hn"]:
    print(lang)
    df_train = pd.read_json(f"./data/processed/{lang}/train.json", orient="records", lines=True)
    X_train, y_train = df_to_Xy(df_train)
    print(df_train.shape)
    crf = sklearn_crfsuite.CRF(algorithm='ap', max_iterations=100, all_possible_transitions=False)
    print("Training")
    %time crf.fit(X_train, y_train)
    df_test = pd.read_json(f"./data/processed/{lang}/test.json", orient="records", lines=True)
    print("Writing")
    create_files(df_test, lang)

en
(828, 4)
Training
Wall time: 4min 16s
Writing
bn
(800, 4)
Training
Wall time: 4min 4s
Writing
