### Install requirements for smooth run

### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import nltk
import csv
import seaborn as sns

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

import scipy

import warnings
warnings.filterwarnings('ignore')
# Ignore warnings in the code execution

### Utility Functions

In [2]:
# function for creating the X and y for the CRF 
def transform_to_crf(dataframe, strategy = 'LO', feature_group = [], base_group = []):
  """Convert a token-level dataframe into CRF-ready feature and label sequences.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Entire train or development set, one row per token. Must contain the
      columns 'annotator', 'sentence_id', 'label' and every selected feature.
  strategy : str
      'LO'    - leave `feature_group` out and use all remaining known features.
      'ONLY'  - use solely the features listed in `feature_group`.
      'STACK' - use `base_group` followed by `feature_group` (duplicates removed,
                first occurrence kept).
  feature_group : list of str
      The features on which the strategy is performed.
  base_group : list of str
      Base features, only used by the 'STACK' strategy.

  Calling transform_to_crf(dataframe) uses all known features.

  Returns
  -------
  X_ready : list of list of dict
      One list per sentence holding one {feature: value} dict per token.
  y_ready : list of list of str
      The labels of each sentence, converted to str.
  cols : list of str
      The feature columns that were used.

  Raises
  ------
  ValueError
      If `strategy` is not one of 'LO', 'ONLY' or 'STACK'.

  Sentences are emitted annotator by annotator, both annotators and sentence
  ids in order of first appearance. Rows whose annotator or sentence_id is
  missing are skipped by the group-by.
  """
  total_cols = ['token_no_stop','lemma','pos','prev_lemma','next_lemma','prev_pos','next_pos','snowball_stemmer',
                'porter_stemmer','head','dependency','is_part_of_negation','has_prefix','has_postfix','has_infix',
                'base_in_dictionary','has_apostrophe']

  if strategy == 'LO':
    cols = [x for x in total_cols if x not in feature_group]
  elif strategy == 'ONLY':
    # Copy so the returned list is never the caller's own list object.
    cols = list(feature_group)
    print(cols)
  elif strategy == 'STACK':
    # dict.fromkeys removes duplicates while keeping a deterministic order
    # (a set would return the columns in arbitrary order).
    cols = list(dict.fromkeys(list(base_group) + list(feature_group)))
  else:
    # Fail early instead of hitting an unbound `cols` further down.
    raise ValueError("strategy must be 'LO', 'ONLY' or 'STACK', got {!r}".format(strategy))

  X_ready = []
  y_ready = []

  # sort=False keeps groups in order of first appearance; grouping walks the
  # data once instead of re-filtering the frame for every annotator/sentence.
  for _, annotator_data in dataframe.groupby('annotator', sort=False):
    for _, sentence in annotator_data.groupby('sentence_id', sort=False):
      X_ready.append([token_features[cols].to_dict()
                      for _, token_features in sentence.iterrows()])
      # Labels are handed on as plain strings.
      y_ready.append([str(label) for label in sentence['label'].values])

  return X_ready, y_ready, cols

In [3]:
# function for creating the CRF classifier and predict the labels
def predictions_crf(X_train, y_train, X_test):
    """Train a CRF with fixed hyperparameters and predict labels for X_test.

    X_train, X_test : feature sequences (one list of feature dicts per sentence)
    y_train         : label sequences matching X_train

    Returns the CRF object and its predictions for X_test.

    An AttributeError raised by `fit` is tolerated (best-effort), but it is
    reported instead of being swallowed silently. If training did not actually
    succeed, the following `predict` call is where the failure surfaces.
    """
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True,
    )

    try:
        crf.fit(X_train, y_train)
    except AttributeError as err:
        # print rather than warnings.warn: the notebook disables all warnings
        # globally, so a warning would never be shown.
        print('Warning: AttributeError during crf.fit was ignored:', err)

    predictions = crf.predict(X_test)

    return crf, predictions

In [4]:
def hyperparameter_search_crf(X_train, y_train, X_test, random_state=None):
    """Randomized search over the CRF regularisation terms c1 and c2.

    X_train, X_test : feature sequences (one list of feature dicts per sentence)
    y_train         : label sequences matching X_train
    random_state    : optional seed forwarded to RandomizedSearchCV so the
                      sampled c1/c2 candidates can be reproduced; the default
                      None keeps the search unseeded.

    Returns the fitted RandomizedSearchCV object and its predictions for X_test.
    """
    # Import the submodule explicitly; a bare `import scipy` is not guaranteed
    # to make `scipy.stats` available.
    from scipy.stats import expon

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=200,
        all_possible_transitions=True,
    )

    params_space = {
        'c1': expon(scale=0.5),
        'c2': expon(scale=0.05),
    }

    # Collect the label set straight from the training labels instead of
    # training a throw-away model just to read `classes_`. The order of the
    # labels does not influence a weighted average.
    labels = list(dict.fromkeys(label for sentence in y_train for label in sentence))

    # scorer used to rank the sampled candidates
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=20,
                            scoring=f1_scorer,
                            random_state=random_state)
    rs.fit(X_train, y_train)

    predictions = rs.predict(X_test)

    return rs, predictions

In [5]:
# function for extracting the f1 and confusion matrix from the predictions
def get_metrics(predictions, y_test, crf):
    """Compute the macro F1 score and the confusion matrix of the predictions.

    predictions : predicted label sequences (one list per sentence)
    y_test      : gold label sequences (one list per sentence)
    crf         : the fitted model; either a plain CRF or a fitted search
                  object exposing `best_params_` / `best_estimator_`

    Prints the best parameters (only when the model provides them) and the
    model size, and returns (confusion_matrix, f1).
    """
    labels = list(crf.classes_)
    f1 = metrics.flat_f1_score(y_test, predictions,
                               average='macro', labels=labels)

    # The confusion matrix works on token level, so flatten the sentences.
    y_test_flatten = [label for sentence in y_test for label in sentence]
    predictions_flatten = [label for sentence in predictions for label in sentence]
    # `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
    cm = confusion_matrix(y_test_flatten, predictions_flatten, labels=labels)

    # A plain CRF has no best_params_/best_estimator_; only search objects do.
    best_params = getattr(crf, 'best_params_', None)
    if best_params is not None:
        print('best params:', best_params)
    estimator = getattr(crf, 'best_estimator_', crf)
    print('model size: {:0.2f}'.format(estimator.size_ / 1000000))
    return cm, f1


In [6]:
# function for extracting all confusion matrices, f1's and prediction lists for every group of features
def run_feature_analysis(strat = 'LO', base_group = []):
    """Train and evaluate one fixed-parameter CRF per feature group.

    strat = 'LO' if you want to leave a certain group out of the analysis
    strat = 'ONLY' if you want to train solely on that list of features
    strat = 'STACK' if you want to add each group to the base_group
    default is 'LO'

    Reads the module-level dataframes `train` and `dev`.

    Returns four lists with one entry per feature group: the confusion
    matrices, the F1 scores, the predictions and the c1/c2 parameters of the
    model.
    """

    print('strategy: ', strat)

    feature_groups = [['token_no_stop'],['lemma','pos','snowball_stemmer','porter_stemmer'],['next_lemma','next_pos'],
    ['prev_lemma', 'prev_pos'], ['head','dependency'], ['is_part_of_negation'], ['has_prefix','has_postfix','has_infix'],
    ['base_in_dictionary'],['has_apostrophe']]

    cms = []
    f1s = []
    predictions_list = []
    cs  = []

    # NOTE(review): with strat='STACK' every group is stacked on the same,
    # unchanged base_group; confirm whether the base should grow per iteration.
    for group in feature_groups:
        # transform_to_crf only reads the dataframes, so no defensive copies are needed.
        X_train, y_train, cols = transform_to_crf(train, strategy = strat, feature_group=group, base_group= base_group)
        X_test, y_test, cols = transform_to_crf(dev, strategy = strat, feature_group=group, base_group = base_group)
        crf, predictions = predictions_crf(X_train, y_train, X_test)
        cm, f1 = get_metrics(predictions, y_test, crf)
        cms.append(cm)
        f1s.append(f1)
        predictions_list.append(predictions)
        # predictions_crf returns a plain CRF, which has no `best_params_`;
        # fall back to the regularisation values stored on the model itself.
        params = getattr(crf, 'best_params_', None)
        if params is None:
            params = {'c1': getattr(crf, 'c1', None), 'c2': getattr(crf, 'c2', None)}
        cs.append(params)
        print(cols, ' has f1 score of ', f1)

    return cms, f1s, predictions_list, cs

In [None]:
def run_feature_analysis_with_hyperparameter_opt_extra(strat = 'LO', base_group = []):
    """Run the hyperparameter search for the hard-coded feature group(s).

    strat = 'LO'    -> leave the group out of the analysis
    strat = 'ONLY'  -> train solely on the group
    strat = 'STACK' -> add the group to base_group (the base grows after each group)
    default is 'LO'

    Reads the module-level dataframes `train` and `dev`.

    Returns the confusion matrices, F1 scores, predictions and best parameters
    (one entry per feature group) followed by the search object of the last group.
    """

    print('strategy: ', strat)

    feature_groups = [
        ['lemma', 'pos', 'snowball_stemmer', 'porter_stemmer',
         'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix'],
    ]

    confusion_matrices, f1_scores, best_params, all_predictions = [], [], [], []

    for feature_group in feature_groups:
        X_train, y_train, cols = transform_to_crf(
            train, strategy=strat, feature_group=feature_group, base_group=base_group)
        X_test, y_test, cols = transform_to_crf(
            dev, strategy=strat, feature_group=feature_group, base_group=base_group)

        crf, predictions = hyperparameter_search_crf(X_train, y_train, X_test)
        cm, f1 = get_metrics(predictions, y_test, crf)

        confusion_matrices.append(cm)
        f1_scores.append(f1)
        best_params.append(crf.best_params_)
        all_predictions.append(predictions)

        # When stacking, the columns just used become the base for the next group.
        if strat == "STACK":
            base_group = cols
        print(cols, ' has f1 score of ', f1)

    return confusion_matrices, f1_scores, all_predictions, best_params, crf

## SEM 2012 Corpus

In [17]:
# Previous path: /content/drive/MyDrive/ATM /SEM2012_training_data_with_features.csv
# Change it to run it in a different drive
# Locations of the SEM 2012 feature files, relative to the working directory.
file_path = 'ATM/SEM2012_training_data_with_features.csv'
file_path_test = 'ATM/SEM2012_validation_data_with_features.csv'

# Both files are comma-separated with a header row.
train = pd.read_csv(file_path, sep=",", header=0)
dev = pd.read_csv(file_path_test, sep=",", header=0)

train.head(5)

Unnamed: 0,annotator,sentence_id,token_id,token,label,token_lower,token_no_punct,token_no_stop,lemma,pos,...,porter_stemmer,head,dependency,is_part_of_negation,has_prefix,has_postfix,has_infix,base,base_in_dictionary,has_apostrophe
0,baskervilles01,0,0,chapter,O,chapter,chapter,chapter,chapter,NOUN,...,chapter,5,nmod,0,False,False,False,chapter,True,False
1,baskervilles01,0,1,1.,O,1.,1,1,1,X,...,1.,1,nummod,0,False,False,False,1,False,False
2,baskervilles01,0,2,mr.,O,mr.,mr,mr,mr,PROPN,...,mr.,5,compound,0,False,False,False,mr,True,False
3,baskervilles01,0,3,sherlock,O,sherlock,sherlock,sherlock,sherlock,NOUN,...,sherlock,5,compound,0,False,False,False,sherlock,True,False
4,baskervilles01,0,4,holmes,O,holmes,holmes,holmes,holmes,PROPN,...,holm,0,ROOT,0,False,False,False,holmes,False,False


#### Run hyperparameter optimization with "ONLY" Strategy

In [None]:
# Run the hyperparameter search with the 'ONLY' strategy; `model` is the fitted search object.
(cms_only1, f1s_only1, predictions_list_only1,
 cs_ONLY1, model) = run_feature_analysis_with_hyperparameter_opt_extra(strat='ONLY')

best params: {'c1': 0.2160021395340066, 'c2': 0.0013076159888168548}
model size: 0.04
['lemma', 'pos', 'snowball_stemmer', 'porter_stemmer', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix']  has f1 score of  0.8941584132126073

In [11]:
model

RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True, c1=0.1, c2=0.1,
                                 keep_tempfiles=None, max_iterations=200),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f1aeb525520>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f1aea2af9a0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['O', 'B-NEG', 'I-NEG']),
                   verbose=1)

## Execution for BioCorpus

Adapt the functions to the new corpus, as it does not have the head/dependency feature pair.

In [None]:
# NOTE: this cell rebinds `train` and `dev`, which held the SEM 2012 data before;
# the run_feature_analysis* functions read these module-level names.

# Read both splits with the same dtypes so the id columns are typed consistently
# (annotator ids stay strings instead of being inferred as integers).
bio_dtypes = {"annotator": "string", "sentence_id": int, "token_id": int}

file_path = 'biocorpus_training_data_with_features.csv'
train = pd.read_csv(file_path, sep=",", header=0, dtype=bio_dtypes)

file_path_test = 'biocorpus_validation_data_with_features.csv'
dev = pd.read_csv(file_path_test, sep=",", header=0, dtype=bio_dtypes)

train.head(5)

Unnamed: 0,annotator,sentence_id,token_id,token,label,token_lower,token_no_punct,token_no_stop,lemma,pos,...,porter_stemmer,head,dependency,is_part_of_negation,has_prefix,has_postfix,has_infix,base,base_in_dictionary,has_apostrophe
0,10092801,6,10,adenovirus,O,adenovirus,adenovirus,adenovirus,adenovirus,PROPN,...,adenoviru,3,attr,0,True,False,False,denovirus,False,False
1,91187152,3,7,environmental,O,environmental,environmental,environmental,environmental,ADJ,...,environment,7,amod,0,False,False,False,environmental,True,False
2,7522257,10,7,cd14,O,cd14,cd14,cd14,cd14,PROPN,...,cd14,8,nmod,0,False,False,False,cd14,False,False
3,92179751,3,78,incubation,O,incubation,incubation,incubation,incubation,NOUN,...,incub,59,dobj,0,True,False,False,cubation,False,False
4,7541794,5,41,for,O,for,for,for,for,ADP,...,for,28,prep,0,False,False,False,for,True,False


In [12]:
# function for predicting the labels with an already fitted model
def get_predictions(X_train, y_train, X_test, model):
    """Return the labels `model` predicts for X_test.

    X_train and y_train are accepted but not used; no training happens here.
    """
    return model.predict(X_test)

In [18]:
# Same feature set the hyperparameter search behind `model` was run on.
group = [
    'lemma', 'pos', 'snowball_stemmer', 'porter_stemmer',
    'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix',
]

# Build the CRF inputs from whatever `train` / `dev` currently hold.
X_train, y_train, cols = transform_to_crf(train, strategy="ONLY", feature_group=group, base_group=[])
X_test, y_test, cols = transform_to_crf(dev, strategy="ONLY", feature_group=group, base_group=[])

# Predict with the already fitted model (no retraining) and report the metrics.
preds = get_predictions(X_train, y_train, X_test, model)
get_metrics(preds, y_test, model)

['lemma', 'pos', 'snowball_stemmer', 'porter_stemmer', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix']
['lemma', 'pos', 'snowball_stemmer', 'porter_stemmer', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix']


best params: {'c1': 0.2160021395340066, 'c2': 0.0013076159888168548}
model size: 0.04
0.33291237949333724