In [1]:
import pandas as pd
import numpy as np
from data_utils import *
from feature_utils import *
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [2]:
train_path = '../Data/en_ewt-up-train.conllu'
dev_path = '../Data/en_ewt-up-dev.conllu'
test_path = '../Data/en_ewt-up-test.conllu'

In [3]:
train_data = prepare_data(train_path)
dev_data = prepare_data(dev_path)
test_data = prepare_data(test_path)

In [None]:
train_data.shape, dev_data.shape, test_data.shape

((1011069, 32), (103886, 32), (100431, 32))

In [5]:
# run the following code to see if the conll transform works
# select a sample sentence from train data
'''
train_data = read_conll(train_path)
train_slice = train_data[train_data['sent_id'] == 'weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002']
train_ex = conll_transform(train_slice)
#train_ex.to_csv('train_ex.csv', sep='\t', index=False)
'''

"\ntrain_data = read_conll(train_path)\ntrain_slice = train_data[train_data['sent_id'] == 'weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002']\ntrain_ex = conll_transform(train_slice)\n#train_ex.to_csv('train_ex.csv', sep='\t', index=False)\n"

In [4]:
train_data.head()

Unnamed: 0,sent_id,token_id,token,lemma,POS,Universal_POS,morph_type,distance_head,dep_label,dep_rel,...,token_bigram,POS_bigram,token_trigram,POS_trigram,ner,distance_to_predicate,is_before_predicate,is_token_predicate,is_token_argument,argument_label
0,answers-20090605110235AAALlCt_ans-0001,1,Where,where,ADV,WRB,PronType=Int,0,root,0:root,...,_ Where,_ ADV,_ _ Where,_ _ ADV,O,2,1,0,1,ARG2
1,answers-20090605110235AAALlCt_ans-0001,2,in,in,ADP,IN,_,4,case,4:case,...,Where in,ADV ADP,_ Where in,_ ADV ADP,O,2,1,0,0,O
2,answers-20090605110235AAALlCt_ans-0001,3,the,the,DET,DT,Definite=Def|PronType=Art,4,det,4:det,...,in the,ADP DET,Where in the,ADV ADP DET,O,1,1,0,0,O
3,answers-20090605110235AAALlCt_ans-0001,4,world,world,NOUN,NN,Number=Sing,1,obl,1:obl:in,...,the world,DET NOUN,in the world,ADP DET NOUN,O,1,1,0,0,O
4,answers-20090605110235AAALlCt_ans-0001,5,is,be,AUX,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,1,cop,1:cop,...,world is,NOUN AUX,the world is,DET NOUN AUX,O,1,0,1,0,O


In [7]:
train_data['argument_label'].unique()

array(['ARG2', 'O', 'ARG1', 'ARGM-MNR', 'ARGM-ADV', 'ARGM-LOC',
       'ARGM-MOD', 'ARGM-ADJ', 'ARG0', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-CAU',
       'ARGM-TMP', 'ARGM-PRD', 'ARGM-GOL', 'ARGM-EXT', 'ARGM-NEG',
       'ARGM-CXN', 'ARG3', 'ARG4', 'ARGM-PRP', 'ARGM-PRR', 'ARGM-LVB',
       'ARGM-COM', 'ARG5', 'ARGA', 'ARGM-REC', 'ARG1-DSP'], dtype=object)

In [8]:
target = ['is_token_argument', 'argument_label']

numeric_features = ['definite_ind', 'number_plur', 'gender_fem', 'case_nom', 'tense_pres',
                     'mood_ind', 'verbform', 'voice_passive', 'possesive', 'pron_type', 'person']

text_features = ['token', 'lemma', 'POS', 'Universal_POS', 'morph_type', 'dep_label', 'dep_rel',
                  'space', 'predicate', 'ner', 'token_bigram', 'token_trigram', 'POS_bigram', 'POS_trigram']

count_vectorizer = create_count_vectorizer(train_data, text_features)
X_train = process_data(train_data, count_vectorizer, numeric_features)
X_test = process_data(test_data, count_vectorizer, numeric_features)

encoder = OneHotEncoder()
y_train_encoded = encoder.fit_transform(train_data[target])
y_dev_encoded = encoder.transform(test_data[target])
y_train = y_train_encoded.toarray()
y_test = y_dev_encoded.toarray()

print("Shape of X_train:", X_train.shape)
print("Shape of y_train_encoded_array:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_dev_encoded_array:", y_test.shape)

Shape of X_train: (1011069, 227226)
Shape of y_train_encoded_array: (1011069, 30)
Shape of X_test: (100431, 227226)
Shape of y_dev_encoded_array: (100431, 30)


In [9]:
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=0))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
weighted_avg_precision_identification, weighted_avg_recall_identification, weighted_avg_f1_identification,\
    macro_avg_precision_identification, macro_avg_recall_identification, macro_avg_f1_identification = calculate_metrics(0, 2, y_test, y_pred)
print("For Arugment Identification:")
print(f"Weighted Average Precision: {weighted_avg_precision_identification:.4f}")
print(f"Weighted Average Recall: {weighted_avg_recall_identification:.4f}")
print(f"Weighted Average F1-Score: {weighted_avg_f1_identification:.4f}\n")
print(f"Macro Average Precision: {macro_avg_precision_identification:.4f}")
print(f"Macro Average Recall: {macro_avg_recall_identification:.4f}")
print(f"Macro Average F1-Score: {macro_avg_f1_identification:.4f}")

print("---"*10)

weighted_avg_precision_classification, weighted_avg_recall_classification, weighted_avg_f1_classification,\
      macro_avg_precision_classification, macro_avg_recall_classification, macro_avg_f1_classification = calculate_metrics(3, 30, y_test, y_pred)
print("\nFor Argument Classification:")
print(f"Weighted Average Precision: {weighted_avg_precision_classification:.4f}")
print(f"Weighted Average Recall: {weighted_avg_recall_classification:.4f}")
print(f"Weighted Average F1-Score: {weighted_avg_f1_classification:.4f}\n")
print(f"Macro Average Precision: {macro_avg_precision_classification:.4f}")
print(f"Macro Average Recall: {macro_avg_recall_classification:.4f}")
print(f"Macro Average F1-Score: {macro_avg_f1_classification:.4f}")

For Arugment Identification:
Weighted Average Precision: 0.8935
Weighted Average Recall: 0.9155
Weighted Average F1-Score: 0.8896

Macro Average Precision: 0.7637
Macro Average Recall: 0.5606
Macro Average F1-Score: 0.5843
------------------------------

For Argument Classification:
Weighted Average Precision: 0.9920
Weighted Average Recall: 0.9942
Weighted Average F1-Score: 0.9921

Macro Average Precision: 0.5929
Macro Average Recall: 0.5205
Macro Average F1-Score: 0.5269
