### Install requirements for smooth run

### Imports

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import nltk
import csv
import seaborn as sns

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

import scipy

### Utility Functions

In [11]:
# function for creating the X and y for the CRF 
def transform_to_crf(dataframe, strategy = 'LO', feature_group = [], base_group = []):
  """dataframe = entire train or development set as pandas dataframe
  strategy = 'LO' if you want to leave a certain group out of the analysis
  strategy = 'ONLY' if you want to train solely on that list of features
  strategy = 'STACK' if you want to stack the basegroup to the list of features 
  feature_group = the list of features on which the strategy is performed
  If you run transform_to_crf(dataframe) it will automatically perform on all features"""
  

  column_values = dataframe[['annotator']].values.ravel()
  annotator_ids = pd.unique(column_values)
  total_cols = ['token_no_stop','lemma','pos','prev_lemma','next_lemma','prev_pos','next_pos','snowball_stemmer',
                'porter_stemmer','head','dependency','is_part_of_negation','has_prefix','has_postfix','has_infix',
                'base_in_dictionary','has_apostrophe']

  # print(dataframe.columns)
  # print('total_cols', total_cols)
  if strategy == 'LO':
    cols = [x for x in total_cols if x not in feature_group]
  elif strategy == 'ONLY':
    cols = feature_group
    print(cols)
  elif strategy == 'STACK':
    cols_dupl = base_group + feature_group
    cols = list(set(cols_dupl))


  X_ready = []
  y_ready = []

  for annotator in annotator_ids:  # for each annotator
      # get the data for the annotator
      annotator_data = dataframe[dataframe['annotator'] == annotator]
      # get the sentence ids
      column_values = annotator_data[['sentence_id']].values.ravel()
      sentence_ids = pd.unique(column_values)  # get the unique sentence ids

      for sent_id in sentence_ids:
        new_sentence = []
        sentence = annotator_data.loc[annotator_data['sentence_id']
                                            == sent_id]        
        y = sentence['label'].values.ravel()
        # Make y element str
        y = [str(x) for x in y]
        for i, token_features in sentence.iterrows():
          new_sentence.append(token_features[cols].to_dict())
        X_ready.append(new_sentence)
        y_ready.append(list(y))


  return X_ready, y_ready, cols

In [12]:
# function for creating the CRF classifier and predict the labels
def predictions_crf(X_train, y_train, X_test):

    crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
    )

    try:
        crf.fit(X_train, y_train)
    except AttributeError:
        pass

    predictions = crf.predict(X_test)
    
    return crf, predictions

In [13]:
def hyperparameter_search_crf(X_train, y_train, X_test):

    crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=200, 
    all_possible_transitions=True
    )

    params_space = { 
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
    }

    crf.fit(X_train, y_train)

    from sklearn_crfsuite import metrics
    labels = list(crf.classes_)

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=20,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # try:
    #     crf.fit(X_train, y_train)
    # except AttributeError:
    #     pass

    predictions = rs.predict(X_test)
    
    return rs, predictions

In [14]:
# function for extracting the f1 and confusion matrix from the predictions
def get_metrics(predictions, y_test, crf):
    from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix
    labels = list(crf.classes_)
    f1 = metrics.flat_f1_score(y_test, predictions,
                        average='macro', labels=labels)
    y_test_flatten = [y for x in y_test for y in x]
    predictions_flatten = [y for x in predictions for y in x]
    cm = confusion_matrix(y_test_flatten, predictions_flatten, labels)
    print('best params:', crf.best_params_)
    print('model size: {:0.2f}'.format(crf.best_estimator_.size_ / 1000000))
    return cm, f1


In [15]:
# function for extracting all confusion matrices, f1's and prediction lists for every group of features
def run_feature_analysis(strat = 'LO', base_group = []):
    """strat = 'LO' if you want to leave a certain group out of the analysis
    strat = 'ONLY' if you want to train solely on that list of features 
    strat = 'STACK' if you want to increase the base_group 
    default is 'LO' """
    
    print('strategy: ', strat)
    
    feature_groups = [['token_no_stop'],['lemma','pos','snowball_stemmer','porter_stemmer'],['next_lemma','next_pos'], 
    ['prev_lemma', 'prev_pos'], ['head','dependency'], ['is_part_of_negation'], ['has_prefix','has_postfix','has_infix'],
    ['base_in_dictionary'],['has_apostrophe']]

    cms = []
    f1s = []
    predictions_list = []
    cs  = []

    t = train.copy()
    d = dev.copy()

    for i, group in enumerate(feature_groups):
        X_train, y_train, cols = transform_to_crf(t,  strategy = strat, feature_group=group, base_group= base_group)
        X_test, y_test, cols = transform_to_crf(d, strategy = strat, feature_group=group, base_group = base_group)
        crf, predictions = predictions_crf(X_train, y_train, X_test)
        cm, f1 = get_metrics(predictions, y_test, crf)
        cms.append(cm)
        f1s.append(f1)
        predictions_list.append(predictions)
        cs.append(crf.best_params_)
        print(cols, ' has f1 score of ', f1)
        
    return cms, f1s, predictions_list, cs

In [16]:
# function for extracting all confusion matrices, f1's and prediction lists for every group of features
def run_feature_analysis_with_hyperparameter_opt(strat = 'LO', base_group = []):
    """strat = 'LO' if you want to leave a certain group out of the analysis
    strat = 'ONLY' if you want to train solely on that list of features 
    strat = 'STACK' if you want to increase the base_group 
    default is 'LO' """
    
    print('strategy: ', strat)
    
    feature_groups = [['token_no_stop'],['lemma','pos','snowball_stemmer','porter_stemmer'],['next_lemma','next_pos'], 
    ['prev_lemma', 'prev_pos'], ['head','dependency'], ['is_part_of_negation'], ['has_prefix','has_postfix','has_infix'],
    ['base_in_dictionary'],['has_apostrophe']]

    cms = []
    f1s = []
    predictions_list = []
    cs = []

    for i, group in enumerate(feature_groups):
        X_train, y_train, cols = transform_to_crf(train,  strategy = strat, feature_group=group, base_group= base_group)
        X_test, y_test, cols = transform_to_crf(dev, strategy = strat, feature_group=group, base_group = base_group)
        crf, predictions = hyperparameter_search_crf(X_train, y_train, X_test)
        cm, f1 = get_metrics(predictions, y_test, crf)
        cms.append(cm)
        f1s.append(f1)
        cs.append(crf.best_params_)
        predictions_list.append(predictions)
        # Make sure we stack to the base group. If the base group is always empty the strategy is same as only.
        if strat == 'STACK': base_group = cols
        print(cols, ' has f1 score of ', f1)
        
    return cms, f1s, predictions_list, cs

## SEM 2012 Corpus

In [70]:
# Previous path: /content/drive/MyDrive/ATM /SEM2012_training_data_with_features.csv
# Change it to run it in a different drive
file_path =  'ATM/SEM2012_training_data_with_features.csv'
train = pd.read_csv(file_path, sep=",", header=0)


file_path_test =  'ATM/SEM2012_validation_data_with_features.csv'
dev = pd.read_csv(file_path_test, sep=",", header=0)

train.head(5)

Unnamed: 0,annotator,sentence_id,token_id,token,label,token_lower,token_no_punct,token_no_stop,lemma,pos,...,porter_stemmer,head,dependency,is_part_of_negation,has_prefix,has_postfix,has_infix,base,base_in_dictionary,has_apostrophe
0,baskervilles01,0,0,chapter,O,chapter,chapter,chapter,chapter,NOUN,...,chapter,5,nmod,0,False,False,False,chapter,True,False
1,baskervilles01,0,1,1.,O,1.,1,1,1,X,...,1.,1,nummod,0,False,False,False,1,False,False
2,baskervilles01,0,2,mr.,O,mr.,mr,mr,mr,PROPN,...,mr.,5,compound,0,False,False,False,mr,True,False
3,baskervilles01,0,3,sherlock,O,sherlock,sherlock,sherlock,sherlock,NOUN,...,sherlock,5,compound,0,False,False,False,sherlock,True,False
4,baskervilles01,0,4,holmes,O,holmes,holmes,holmes,holmes,PROPN,...,holm,0,ROOT,0,False,False,False,holmes,False,False


In [9]:
import warnings
warnings.filterwarnings('ignore')
# Ignore warnings in the code execution

In [10]:
train.dtypes

annotator              object
sentence_id             int64
token_id                int64
token                  object
label                  object
token_lower            object
token_no_punct         object
token_no_stop          object
lemma                  object
pos                    object
prev_lemma             object
next_lemma             object
prev_pos               object
next_pos               object
snowball_stemmer       object
porter_stemmer         object
head                    int64
dependency             object
is_part_of_negation     int64
has_prefix               bool
has_postfix              bool
has_infix                bool
base                   object
base_in_dictionary       bool
has_apostrophe           bool
dtype: object

### Run hyperparameter optimization with "ONLY" Strategy

In [None]:
# perform feature ablagation study
cms_only, f1s_only, predictions_list_only, cs_only = run_feature_analysis_with_hyperparameter_opt(strat='ONLY')
# cms_LO, f1s_LO, predictions_LO = run_feature_analysis(strat='LO')

The following table contains the feature used by Only.

| Features                                               | C1      | C2                  | Size(M) | F1    |
|--------------------------------------------------------|---------|---------------------|---------|-------|
| token_no_stop                                          | 0.09012 | 0.01588             | 0.07    | 0.619 |
| 'lemma', 'pos',  'snowball_stemmer',  'porter_stemmer' | 0.22044 | 0.01657             | 0.07    | 0.624 |
| 'next_lemma', 'next_pos'                               | 0.18115 | 0.00143             | 0.08    | 0.334 |
| 'prev_lemma', 'prev_pos'                               | 0.07587 | 0.05707             | 0.10    | 0.330 |
| 'head', 'dependency'                                   | 0.06720 | 0.09112 | 0.01    | 0.471 |
| is_part_of_negation                                    | 0.10008 | 0.00707             | 0.01    | 0.330 |
| 'has_prefix', 'has_postfix', 'has_infix'               | 0.12211 | 0.01605             | 0.01    | 0.330 |
| base_in_dictionary                                     | 0.21321 | 0.03591             | 0.01    | 0.330 |
| has_apostrophe                                         | 0.19722 | 0.06430             | 0.01    | 0.413 |

### Run hyperparameter optimization with "LO" Strategy

In [11]:
cms_LO, f1s_LO, predictions_LO, cs_LO = run_feature_analysis_with_hyperparameter_opt(strat='LO')

strategy:  LO
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    6.9s finished


best params: {'c1': 0.10629141526906866, 'c2': 0.004882483980495497}
model size: 0.26
['lemma', 'pos', 'prev_lemma', 'next_lemma', 'prev_pos', 'next_pos', 'snowball_stemmer', 'porter_stemmer', 'head', 'dependency', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix', 'base_in_dictionary', 'has_apostrophe']  has f1 score of  0.8907622420440887
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.0s finished


best params: {'c1': 0.09693077917222, 'c2': 0.021310041725049415}
model size: 0.20
['token_no_stop', 'prev_lemma', 'next_lemma', 'prev_pos', 'next_pos', 'head', 'dependency', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix', 'base_in_dictionary', 'has_apostrophe']  has f1 score of  0.8765855031667839
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    6.8s finished


best params: {'c1': 0.01767342977147185, 'c2': 0.03150171511240191}
model size: 0.56
['token_no_stop', 'lemma', 'pos', 'prev_lemma', 'prev_pos', 'snowball_stemmer', 'porter_stemmer', 'head', 'dependency', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix', 'base_in_dictionary', 'has_apostrophe']  has f1 score of  0.879189325071386
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.1s finished


best params: {'c1': 0.026096475001385, 'c2': 0.008929679274420236}
model size: 0.44
['token_no_stop', 'lemma', 'pos', 'next_lemma', 'next_pos', 'snowball_stemmer', 'porter_stemmer', 'head', 'dependency', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix', 'base_in_dictionary', 'has_apostrophe']  has f1 score of  0.8871951247556629
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    6.3s finished


best params: {'c1': 0.2761557267990402, 'c2': 0.019147574702611345}
model size: 0.10
['token_no_stop', 'lemma', 'pos', 'prev_lemma', 'next_lemma', 'prev_pos', 'next_pos', 'snowball_stemmer', 'porter_stemmer', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix', 'base_in_dictionary', 'has_apostrophe']  has f1 score of  0.891428737925351
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    6.9s finished


best params: {'c1': 0.06719845261256949, 'c2': 0.1098833466494944}
model size: 0.41
['token_no_stop', 'lemma', 'pos', 'prev_lemma', 'next_lemma', 'prev_pos', 'next_pos', 'snowball_stemmer', 'porter_stemmer', 'head', 'dependency', 'has_prefix', 'has_postfix', 'has_infix', 'base_in_dictionary', 'has_apostrophe']  has f1 score of  0.7867766019296178
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    6.6s finished


best params: {'c1': 0.22285700447746984, 'c2': 0.027019835149894367}
model size: 0.18
['token_no_stop', 'lemma', 'pos', 'prev_lemma', 'next_lemma', 'prev_pos', 'next_pos', 'snowball_stemmer', 'porter_stemmer', 'head', 'dependency', 'is_part_of_negation', 'base_in_dictionary', 'has_apostrophe']  has f1 score of  0.8326155308289542
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.3s finished


best params: {'c1': 0.0036793617187366925, 'c2': 0.02844097047558209}
model size: 1.23
['token_no_stop', 'lemma', 'pos', 'prev_lemma', 'next_lemma', 'prev_pos', 'next_pos', 'snowball_stemmer', 'porter_stemmer', 'head', 'dependency', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix', 'has_apostrophe']  has f1 score of  0.8880172033594285
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.3s finished


best params: {'c1': 0.3449365012412357, 'c2': 0.015932502958773038}
model size: 0.16
['token_no_stop', 'lemma', 'pos', 'prev_lemma', 'next_lemma', 'prev_pos', 'next_pos', 'snowball_stemmer', 'porter_stemmer', 'head', 'dependency', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix', 'base_in_dictionary']  has f1 score of  0.8769798061804136


| Features not included in the model                     | C1                   | C2                    | Size(M) | F1                 |
|--------------------------------------------------------|----------------------|-----------------------|---------|--------------------|
| 'token_no_stop'                                        | 0.22613  | 0.00798  | 0.16    | 0.842 |
| 'lemma', 'pos',  'snowball_stemmer',  'porter_stemmer' | 0.09043  | 0.03497   | 0.22    | 0.876 |
| 'next_lemma', 'next_pos'                               | 0.16699  | 0.00223 | 0.18    | 0.833 |
| 'prev_lemma', 'prev_pos'                               | 0.11191  | 0.01359   | 0.24    | 0.842 |
| 'head', 'dependency'                                   | 0.06136  | 0.01762  | 0.18    | 0.892 |
| is_part_of_negation                                    | 0.14129  | 0.07172   | 0.25    | 0.886 |
| 'has_prefix', 'has_postfix', 'has_infix'               | 0.01528 | 0.01286  | 0.98    | 0.877 |
| base_in_dictionary                                     | 0.06510  | 0.01988  | 0.34    | 0.841 |
| has_apostrophe                                         | 0.12878   | 0.02928  | 0.28    | 0.884 |

### Run hyperparameter optimization with "STACK" Strategy

In [12]:
cms_STACK, f1s_stack, predictions_STACK, cs_STACK = run_feature_analysis_with_hyperparameter_opt(strat='STACK')

strategy:  STACK
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.8s finished


best params: {'c1': 0.03775966062279422, 'c2': 0.024600777882316918}
model size: 0.15
['token_no_stop']  has f1 score of  0.6227473710985912
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.3s finished


best params: {'c1': 0.4241944136350176, 'c2': 0.02114690928104838}
model size: 0.06
['snowball_stemmer', 'lemma', 'pos', 'token_no_stop', 'porter_stemmer']  has f1 score of  0.6151032627440159
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.9s finished


best params: {'c1': 0.3812483132432386, 'c2': 0.0073486851653399715}
model size: 0.08
['snowball_stemmer', 'lemma', 'pos', 'token_no_stop', 'next_lemma', 'porter_stemmer', 'next_pos']  has f1 score of  0.6182201469748015
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.0s finished


best params: {'c1': 0.02680142176429649, 'c2': 0.0024798000699164185}
model size: 0.34
['snowball_stemmer', 'lemma', 'pos', 'prev_pos', 'token_no_stop', 'next_lemma', 'prev_lemma', 'porter_stemmer', 'next_pos']  has f1 score of  0.8922979615359795
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.7s finished


best params: {'c1': 0.2630212925158757, 'c2': 0.06316123880880954}
model size: 0.17
['head', 'snowball_stemmer', 'prev_pos', 'pos', 'lemma', 'token_no_stop', 'next_lemma', 'prev_lemma', 'dependency', 'porter_stemmer', 'next_pos']  has f1 score of  0.8353605726855364
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    5.8s finished


best params: {'c1': 0.052511398443426616, 'c2': 0.04323722077709664}
model size: 0.45
['head', 'snowball_stemmer', 'lemma', 'pos', 'prev_pos', 'token_no_stop', 'next_lemma', 'prev_lemma', 'dependency', 'porter_stemmer', 'is_part_of_negation', 'next_pos']  has f1 score of  0.8373482973797586
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.2s finished


best params: {'c1': 0.1239701630588222, 'c2': 0.024392593021445412}
model size: 0.24
['head', 'snowball_stemmer', 'prev_pos', 'pos', 'lemma', 'has_prefix', 'token_no_stop', 'has_postfix', 'next_lemma', 'prev_lemma', 'has_infix', 'dependency', 'porter_stemmer', 'is_part_of_negation', 'next_pos']  has f1 score of  0.8879088133443673
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.0s finished


best params: {'c1': 0.15739114392585127, 'c2': 0.05187534233578513}
model size: 0.25
['head', 'snowball_stemmer', 'base_in_dictionary', 'lemma', 'pos', 'prev_pos', 'has_postfix', 'token_no_stop', 'next_lemma', 'prev_lemma', 'dependency', 'has_infix', 'has_prefix', 'porter_stemmer', 'is_part_of_negation', 'next_pos']  has f1 score of  0.8787723785166243
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.4s finished


best params: {'c1': 0.11334909343289894, 'c2': 0.0008092100402595712}
model size: 0.31
['head', 'has_apostrophe', 'snowball_stemmer', 'base_in_dictionary', 'lemma', 'pos', 'prev_pos', 'has_postfix', 'token_no_stop', 'has_prefix', 'next_lemma', 'prev_lemma', 'has_infix', 'dependency', 'porter_stemmer', 'is_part_of_negation', 'next_pos']  has f1 score of  0.8803067556997263


| Features                                                          | C1                   | C2                   | Size(M) | F1                 |
|-------------------------------------------------------------------|----------------------|----------------------|---------|--------------------|
| 'token_no_stop'                                                   | 0.01641  | 0.020921  | 0.18    | 0.619 |
| Previous + 'lemma', 'pos',  'snowball_stemmer',  'porter_stemmer' | 0.16917  | 0.05348  | 0.08    | 0.619 |
| Previous + 'next_lemma', 'next_pos'                               | 0.01340 | 0.01784 | 0.56    | 0.890 |
| Previous + 'prev_lemma', 'prev_pos'                               | 0.03303 | 0.01359  | 0.28    | 0.892 |
| Previous + 'head', 'dependency'                                   | 0.0299   | 0.01027 | 0.68    | 0.610 |
| Previous + is_part_of_negation                                    | 0.02503  | 0.00552 | 0.70    | 0.838   |
| Previous + 'has_prefix', 'has_postfix', 'has_infix'               | 0.12343   | 0.02271 | 0.24    | 0.841 |
| Previous + base_in_dictionary                                     | 0.32099  | 0.02118 | 0.16    | 0.881 |
| Previous + has_apostrophe                                         | 0.27738   | 0.02182 | 0.28    | 0.875   |

### Extra not executed code

In [None]:
# f = sns.heatmap(cms_LO[0], annot=True, fmt='d', xticklabels=labels, yticklabels=labels)

In [None]:
feature_groups = [['token_no_stop'],['lemma','pos','snowball_stemmer','orter_stemmer'],['next_lemma','next_pos'], 
    ['prev_lemma', 'prev_pos'], ['head','dependency'], ['is_part_of_negation'], ['has_prefix','has_postfix','has_infix'],
    ['base_in_dictionary'],['has_apostrophe']]

prev_max_value = 0

max_value = max(f1s_only)
max_index = f1s_only.index(max_value)
base_group = feature_groups[max_index]
print("Basegroup: ", base_group)
print("F1 score: ", max_value)

while max_value > prev_max_value:
    prev_max_value = max_value
    cms_STACK, f1s_STACK, predictions_STACK = run_feature_analysis(strat='STACK', base_group= base_group)
    max_value = max(f1s_STACK)
    max_index = f1s_STACK.index(max_value)
    base_group = base_group + feature_groups[max_index]
    print("Basegroup: ", base_group)
    print("F1 score: ", max_value)

## Execution for BioCorpus

Adapt the functions to the new Corpus as it does not have the head, dependency pair

In [8]:
file_path =  '/Users/lauraalvarez/Documents/GitHub/ATM-OFC/ATM/data/BIO-CORPUS/biocorpus_training_data_with_features.csv'
train = pd.read_csv(file_path, sep=",", header=0, dtype={"annotator": "string", "sentence_id": int, "token_id":int})
train


file_path_test =  '/Users/lauraalvarez/Documents/GitHub/ATM-OFC/ATM/data/BIO-CORPUS/biocorpus_validation_data_with_features.csv'
dev = pd.read_csv(file_path_test, sep=",", header=0)

train.head(5)

Unnamed: 0,annotator,sentence_id,token_id,token,label,token_lower,token_no_punct,token_no_stop,lemma,pos,...,next_pos,snowball_stemmer,porter_stemmer,is_part_of_negation,has_prefix,has_postfix,has_infix,base,base_in_dictionary,has_apostrophe
0,8051172,2,16,involve,O,involve,involve,involve,involve,VERB,...,NOUN,involv,involv,0,True,False,False,volve,False,False
1,99634785,3,7,pain.,O,pain.,pain,pain,pain,NOUN,...,ADJ,pain.,pain.,0,False,False,False,pain,True,False
2,92107919,6,9,shift,O,shift,shift,shift,shift,VERB,...,DET,shift,shift,0,False,False,False,shift,True,False
3,7645208,7,25,infect,O,infect,infect,infect,infect,VERB,...,ADJ,infect,infect,0,True,False,False,fect,False,False
4,91237803,10,22,gs,O,gs,gs,gs,gs,PRON,...,NOUN,gs,gs,0,False,False,False,gs,False,False


### Utility Functions

In [9]:
def run_feature_analysis_with_hyperparameter_opt_extra(strat = 'LO', base_group = []):
    """strat = 'LO' if you want to leave a certain group out of the analysis
    strat = 'ONLY' if you want to train solely on that list of features 
    strat = 'STACK' if you want to increase the base_group 
    default is 'LO' """
    
    print('strategy: ', strat)
    
    feature_groups = [['lemma','pos','snowball_stemmer','porter_stemmer','is_part_of_negation', 'has_prefix','has_postfix','has_infix']]

    cms = []
    f1s = []
    css = []
    predictions_list = []

    for i, group in enumerate(feature_groups):
        X_train, y_train, cols = transform_to_crf(train,  strategy = strat, feature_group=group, base_group= base_group)
        X_test, y_test, cols = transform_to_crf(dev, strategy = strat, feature_group=group, base_group = base_group)
        crf, predictions = hyperparameter_search_crf(X_train, y_train, X_test)
        cm, f1 = get_metrics(predictions, y_test, crf)
        cms.append(cm)
        f1s.append(f1)
        css.append(crf.best_params_)
        predictions_list.append(predictions)
        if strat == "STACK": base_group = cols
        print(cols, ' has f1 score of ', f1)
        
    return cms, f1s, predictions_list, css, crf

### Run hyperparameter optimization with "ONLY" Strategy

In [None]:
# perform feature ablagation study
cms_only1, f1s_only1, predictions_list_only1, cs_ONLY1, model = run_feature_analysis_with_hyperparameter_opt_extra(strat='ONLY')

In [15]:
model

RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True, c1=0.1, c2=0.1,
                                 keep_tempfiles=None, max_iterations=200),
                   n_iter=20, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa478712130>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa472147400>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['O', 'B-NEG', 'I-NEG']),
                   verbose=1)

In [19]:
# function for creating the CRF classifier and predict the labels
def get_predictions(X_test, model):

    predictions = model.predict(X_test)
    
    return predictions

### Create CSV with predictions

In [None]:
group = ['lemma','pos','snowball_stemmer','porter_stemmer','is_part_of_negation', 'has_prefix','has_postfix','has_infix']
X_dev, y_dev, dev_cols = transform_to_crf(dev, strategy = "ONLY", feature_group=group, base_group = [])

preds = get_predictions(X_dev, model)

# Save development predictions to csv
for i in range(len(X_dev)):
    for j in range(len(X_dev[i])):
        X_dev[i][j]["prediction"] = preds[i][j]


csv_file = "copia_predictions_crf_only_features.csv"
csv_columns = ['lemma','pos','snowball_stemmer','porter_stemmer','is_part_of_negation','has_prefix','has_postfix','has_infix', 'prediction']
try:
    with open(csv_file, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()
        for data in X_dev:
            for d in data:
                writer.writerow(d)
except IOError:
    print("I/O error")

### Test model with data

In [79]:
file_path =  'SEM2012_test_cardboard_data_with_features.csv'
test_cardboard = pd.read_csv(file_path, sep=",", header=0, dtype={"annotator": "string", "sentence_id": int, "token_id":int})

file_path_test =  'SEM2012_test_circle_data_with_features.csv'
test_circle = pd.read_csv(file_path_test, sep=",", header=0)

group = ['lemma','pos','snowball_stemmer','porter_stemmer','is_part_of_negation', 'has_prefix','has_postfix','has_infix']
X_test_cardboard, y_test_cardboard, cols_cardboard = transform_to_crf(test_cardboard, strategy = "ONLY", feature_group=group, base_group = [])
x_test_circle, y_test_circle, cols_circle = transform_to_crf(test_circle, strategy = "ONLY", feature_group=group, base_group = [])


preds = get_predictions(x_test_circle, model)
results_circle = get_metrics(preds, y_test_circle, model)
print('circle', results_circle)

preds = get_predictions(X_test_cardboard, model)
results_cardboard = get_metrics(preds, y_test_cardboard, model)
print('cardboard', results_circle)

['lemma', 'pos', 'snowball_stemmer', 'porter_stemmer', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix']
['lemma', 'pos', 'snowball_stemmer', 'porter_stemmer', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix']
['lemma', 'pos', 'snowball_stemmer', 'porter_stemmer', 'is_part_of_negation', 'has_prefix', 'has_postfix', 'has_infix']


In [81]:
X_dev[0][0]

{'lemma': '1',
 'pos': 'X',
 'snowball_stemmer': '1.',
 'porter_stemmer': '1.',
 'is_part_of_negation': 0,
 'has_prefix': False,
 'has_postfix': False,
 'has_infix': False}

In [82]:
preds[0][0]

'O'

In [84]:
X_dev

[[{'lemma': '1',
   'pos': 'X',
   'snowball_stemmer': '1.',
   'porter_stemmer': '1.',
   'is_part_of_negation': 0,
   'has_prefix': False,
   'has_postfix': False,
   'has_infix': False,
   'prediction': 'O'},
  {'lemma': 'the',
   'pos': 'DET',
   'snowball_stemmer': 'the',
   'porter_stemmer': 'the',
   'is_part_of_negation': 0,
   'has_prefix': False,
   'has_postfix': False,
   'has_infix': False,
   'prediction': 'O'},
  {'lemma': 'singular',
   'pos': 'VERB',
   'snowball_stemmer': 'singular',
   'porter_stemmer': 'singular',
   'is_part_of_negation': 0,
   'has_prefix': False,
   'has_postfix': False,
   'has_infix': False,
   'prediction': 'O'},
  {'lemma': 'experience',
   'pos': 'NOUN',
   'snowball_stemmer': 'experi',
   'porter_stemmer': 'experi',
   'is_part_of_negation': 0,
   'has_prefix': False,
   'has_postfix': False,
   'has_infix': False,
   'prediction': 'O'},
  {'lemma': 'mr',
   'pos': 'PROPN',
   'snowball_stemmer': 'mr.',
   'porter_stemmer': 'mr.',
   'is_pa

In [61]:
X_test[0]

[{'lemma': 'choose',
  'pos': 'VERB',
  'snowball_stemmer': 'choos',
  'porter_stemmer': 'choos',
  'is_part_of_negation': 0,
  'has_prefix': False,
  'has_postfix': False,
  'has_infix': False,
  'prediction': 'O'},
 {'lemma': 'typical',
  'pos': 'ADJ',
  'snowball_stemmer': 'typic',
  'porter_stemmer': 'typic',
  'is_part_of_negation': 0,
  'has_prefix': False,
  'has_postfix': False,
  'has_infix': False,
  'prediction': 'O'},
 {'lemma': 'case',
  'pos': 'NOUN',
  'snowball_stemmer': 'case',
  'porter_stemmer': 'case',
  'is_part_of_negation': 0,
  'has_prefix': False,
  'has_postfix': False,
  'has_infix': False,
  'prediction': 'O'},
 {'lemma': 'illustrate',
  'pos': 'VERB',
  'snowball_stemmer': 'illustr',
  'porter_stemmer': 'illustr',
  'is_part_of_negation': 0,
  'has_prefix': True,
  'has_postfix': False,
  'has_infix': False,
  'prediction': 'O'},
 {'lemma': 'the',
  'pos': 'DET',
  'snowball_stemmer': 'the',
  'porter_stemmer': 'the',
  'is_part_of_negation': 0,
  'has_pref

best params: {'c1': 0.2385755741822089, 'c2': 0.0013081978849273242}
model size: 0.04


(array([[4702,   10,    0],
        [   4,   90,    0],
        [   3,    1,    0]]),
 0.64042414790743)

| Features                                               | C1      | C2      | Size(M) | F1      |
|--------------------------------------------------------|---------|---------|---------|---------|
| 'token_no_stop'                                        | 0.19007 | 0.00249 | 0.05    | 0.64365  |
| 'lemma', 'pos',  'snowball_stemmer',  'porter_stemmer' | 0.23494 | 0.15420 | 0.04    | 0.74844  |
| 'next_lemma', 'next_pos'                               | 0.24204 | 0.06845 | 0.08    | 0.49077 |
| 'prev_lemma', 'prev_pos'                               | 0.05233 | 0.05366 | 0.19    | 0.42104 |
| is_part_of_negation                                    | 0.21192 | 0.03721 | 0.01    | 0.46313 |
| 'has_prefix', 'has_postfix', 'has_infix'               | 0.31910 | 0.02809 | 0.01    | 0.33215 |
| base_in_dictionary                                     | 0.01493 | 0.01912 | 0.01    | 0.33216 |
| has_apostrophe                                         | 2.62169 | 0.00084 | 0.01    | 0.33214 |

### Run hyperparameter optimization with "LO" Strategy

In [None]:
cms_LO1, f1s_LO1, predictions_LO1 = run_feature_analysis_with_hyperparameter_opt_extra(strat='LO')

| Features not included                                  | C1      | C2       | Size(M) | F1                 |
|--------------------------------------------------------|---------|----------|---------|--------------------|
| 'token_no_stop'                                        | 0.33616 | 0.01467  | 0.06    | 0.882 |
| 'lemma', 'pos',  'snowball_stemmer',  'porter_stemmer' | 0.20129 | 0.00753  | 0.09    | 0.879 |
| 'next_lemma', 'next_pos'                               | 0.44736 | 0.08421  | 0.04    | 0.922 |
| 'prev_lemma', 'prev_pos'                               | 0.36862 | 0.020305 | 0.04    | 0.929 |
| is_part_of_negation                                    | 0.25601 | 0.03625  | 0.07    | 0.907 |
| 'has_prefix', 'has_postfix', 'has_infix'               | 0.24918 | 0.00747  | 0.06    | 0.883 |
| base_in_dictionary                                     | 0.33464 | 0.01944  | 0.06    | 0.882 |
| has_apostrophe                                         | 0.19956 | 0.00127  | 0.08    | 0.882 |

### Run hyperparameter optimization with "STACK" Strategy

In [None]:
cms_STACK1, f1s_STACK1, predictions_STACK1 = run_feature_analysis_with_hyperparameter_opt_extra(strat='STACK')

| Features                                                          | C1      | C2      | Size(M) | F1    |
|-------------------------------------------------------------------|---------|---------|---------|-------|
| 'token_no_stop'                                                   | 0.01641 | 0.10274 | 0.46    | 0.64  |
| Previous + 'lemma', 'pos',  'snowball_stemmer',  'porter_stemmer' | 0.28866 | 0.01594 | 0.03    | 0.751 |
| Previous + 'next_lemma', 'next_pos'                               | 0.00508 | 0.02211 | 0.75    | 0.76  |
| Previous + 'prev_lemma', 'prev_pos'                               | 0.28087 | 0.05389 | 0.06    | 0.908 |
| Previous + is_part_of_negation                                    | 0.25790 | 0.01331 | 0.06    | 0.883 |
| Previous + 'has_prefix', 'has_postfix', 'has_infix'               | 0.41458 | 0.01186 | 0.05    | 0.907 |
| Previous + base_in_dictionary                                     | 0.74933 | 0.02342 | 0.04    | 0.883 |
| Previous + has_apostrophe                                         | 0.75811 | 0.01813 | 0.04    | 0.883 |