In [1]:
import pandas as pd
import numpy as np
from data_utils import *
from feature_utils import *
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [2]:
# CoNLL-U files for the three splits, given relative to the notebook's directory.
train_path, dev_path, test_path = (
    '../Data/en_ewt-up-train.conllu',
    '../Data/en_ewt-up-dev.conllu',
    '../Data/en_ewt-up-test.conllu',
)

In [3]:
# Run every split file through prepare_data, in train, dev, test order.
train_data, dev_data, test_data = (
    prepare_data(split_path)
    for split_path in (train_path, dev_path, test_path)
)

In [4]:
# Display the (rows, columns) shape of each split as one tuple.
tuple(split.shape for split in (train_data, dev_data, test_data))

((1011069, 37), (103886, 37), (100431, 37))

In [5]:
# Optional sanity check for the CoNLL transform. The code is kept disabled by
# wrapping it in a bare string literal, so running this cell only echoes the
# string as the cell output.
# To try it, paste the lines into a cell: they select one sample sentence from
# the training file by its sent_id and pass that slice to conll_transform.
# NOTE(review): the snippet rebinds train_data to the result of read_conll,
# replacing the prepare_data result loaded earlier; reload the splits
# afterwards before continuing with the rest of the notebook.
'''
train_data = read_conll(train_path)
train_slice = train_data[train_data['sent_id'] == 'weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002']
train_ex = conll_transform(train_slice)
#train_ex.to_csv('train_ex.csv', sep='\t', index=False)
'''

"\ntrain_data = read_conll(train_path)\ntrain_slice = train_data[train_data['sent_id'] == 'weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002']\ntrain_ex = conll_transform(train_slice)\n#train_ex.to_csv('train_ex.csv', sep='\t', index=False)\n"

In [6]:
# Preview the first five rows of the prepared training split.
train_data.head(n=5)

Unnamed: 0,sent_id,token_id,token,lemma,POS,Universal_POS,morph_type,distance_head,dep_label,dep_rel,...,pron_type,person,distance_to_predicate,is_before_predicate,is_auxiliary,is_main_verb,path_to_predicate,is_token_predicate,is_token_argument,argument_label
0,answers-20090605110235AAALlCt_ans-0001,1,Where,where,ADV,WRB,[PronType=Int],0,root,0:root,...,4,0,2,1,0,0,"[Where, is]",0,1,ARG2
1,answers-20090605110235AAALlCt_ans-0001,2,in,in,ADP,IN,[_],4,case,4:case,...,0,0,2,1,0,0,"[in, Where, is]",0,0,O
2,answers-20090605110235AAALlCt_ans-0001,3,the,the,DET,DT,"[Definite=Def, PronType=Art]",4,det,4:det,...,2,0,1,1,0,0,"[the, world, in, Where, is]",0,0,O
3,answers-20090605110235AAALlCt_ans-0001,4,world,world,NOUN,NN,[Number=Sing],1,obl,1:obl:in,...,0,0,1,1,0,0,"[world, in, Where, is]",0,0,O
4,answers-20090605110235AAALlCt_ans-0001,5,is,be,AUX,VBZ,"[Mood=Ind, Number=Sing, Person=3, Tense=Pres, ...",1,cop,1:cop,...,0,3,1,0,1,0,[is],1,0,O


In [7]:
# Distinct argument labels present in the training split, in order of first appearance.
train_data.loc[:, 'argument_label'].unique()

array(['ARG2', 'O', 'ARG1', 'ARGM-MNR', 'ARGM-ADV', 'ARGM-LOC',
       'ARGM-MOD', 'ARGM-ADJ', 'ARG0', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-CAU',
       'ARGM-TMP', 'ARGM-PRD', 'ARGM-GOL', 'ARGM-EXT', 'ARGM-NEG',
       'ARGM-CXN', 'ARG3', 'ARG4', 'ARGM-PRP', 'ARGM-PRR', 'ARGM-LVB',
       'ARGM-COM', 'ARG5', 'ARGA', 'ARGM-REC', 'ARG1-DSP'], dtype=object)

In [8]:
# Column index of the prepared training frame; the feature and target names
# selected in the next cell are all taken from this list.
train_data.columns

Index(['sent_id', 'token_id', 'token', 'lemma', 'POS', 'Universal_POS',
       'morph_type', 'distance_head', 'dep_label', 'dep_rel', 'space',
       'predicate', 'argument_type', 'token_bigram', 'POS_bigram',
       'token_trigram', 'POS_trigram', 'ner', 'definite_ind', 'number_plur',
       'gender_fem', 'case_nom', 'tense_pres', 'mood_ind', 'verbform',
       'voice_passive', 'possesive', 'pron_type', 'person',
       'distance_to_predicate', 'is_before_predicate', 'is_auxiliary',
       'is_main_verb', 'path_to_predicate', 'is_token_predicate',
       'is_token_argument', 'argument_label'],
      dtype='object')

In [9]:
# Columns to predict: the binary "is this token an argument" flag and the
# argument role label.
target = ['is_token_argument', 'argument_label']

# String-valued columns; these are the columns given to create_count_vectorizer.
text_features = [
    'token', 'lemma', 'POS', 'Universal_POS', 'morph_type', 'dep_label', 'dep_rel',
    'space', 'predicate', 'ner', 'token_bigram', 'token_trigram', 'POS_bigram', 'POS_trigram',
    'path_to_predicate',
]

# Columns passed to process_data alongside the vectorizer.
numeric_features = [
    'definite_ind', 'number_plur', 'gender_fem', 'case_nom', 'tense_pres',
    'mood_ind', 'verbform', 'voice_passive', 'possesive', 'pron_type', 'person',
    'is_before_predicate', 'distance_to_predicate', 'is_auxiliary', 'is_main_verb',
]

# The vectorizer is created from the training split only and then reused
# unchanged for both splits, so train and test share one feature space.
count_vectorizer = create_count_vectorizer(train_data, text_features)
X_train = process_data(train_data, count_vectorizer, numeric_features)
X_test = process_data(test_data, count_vectorizer, numeric_features)

# Identification target as an (n_rows, 1) integer column so it can be stacked
# next to the one-hot label block.
y_train_is_token_argument = train_data['is_token_argument'].astype(int).to_numpy()[:, np.newaxis]
y_test_is_token_argument = test_data['is_token_argument'].astype(int).to_numpy()[:, np.newaxis]

# One-hot encode the role label. The categories are pinned to the sorted
# training labels; handle_unknown='ignore' makes a label that only occurs in
# the test split encode as an all-zero row instead of raising.
unique_labels = np.sort(train_data['argument_label'].unique())
encoder = OneHotEncoder(handle_unknown='ignore', categories=[unique_labels])
y_train_argument_label_sparse = encoder.fit_transform(train_data[['argument_label']])
y_test_argument_label_sparse = encoder.transform(test_data[['argument_label']])

# Densify so the label block can be concatenated with the flag column.
y_train_argument_label = y_train_argument_label_sparse.toarray()
y_test_argument_label = y_test_argument_label_sparse.toarray()

# Final targets: column 0 is the identification flag, the remaining columns
# are the one-hot labels in unique_labels order.
y_train = np.concatenate((y_train_is_token_argument, y_train_argument_label), axis=1)
y_test = np.concatenate((y_test_is_token_argument, y_test_argument_label), axis=1)

# Report the shapes of the assembled matrices.
for caption, matrix in (
    ("Shape of X_train:", X_train),
    ("Shape of y_train_encoded_array:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(caption, matrix.shape)

Shape of X_train: (1011069, 244708)
Shape of y_train_encoded_array: (1011069, 29)
Shape of X_test: (100431, 244708)
Shape of y_test: (100431, 29)


In [10]:
# Fit one binary logistic-regression classifier per column of y_train (the
# is_token_argument flag followed by the one-hot argument labels) and predict
# the same multi-label indicator layout for the test split.
#
# Two settings differ from the earlier version of this cell:
# - multi_class='ovr' is no longer passed. Every sub-problem the wrapper hands
#   to LogisticRegression is binary, so the option had no effect, and the
#   parameter is deprecated in recent scikit-learn releases.
# - n_jobs=-1 is set on OneVsRestClassifier instead of LogisticRegression.
#   The wrapper fits the per-column problems in parallel; on the base
#   estimator the setting only parallelises over the classes of a multiclass
#   one-vs-rest fit, which never happens for a binary target.
# NOTE(review): each parallel worker needs access to X_train; lower n_jobs if
# memory becomes a problem.
model = OneVsRestClassifier(
    LogisticRegression(max_iter=5000, random_state=0),
    n_jobs=-1,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [11]:
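# Disabled alternative to the OneVsRestClassifier cell above: MultiOutputClassifier
# likewise fits one LogisticRegression per target column (note the lower
# max_iter here). Uncommenting these lines rebinds `model` and `y_pred`.
# base_lr = LogisticRegression(max_iter=1000, random_state=0, n_jobs=6)
# model = MultiOutputClassifier(base_lr, n_jobs=-1)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)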

In [12]:
# Score the predictions. calculate_metrics yields two tables: the
# argument-identification scores first, then the per-label classification scores.
metric_tables = calculate_metrics(y_test, y_pred, encoder)
arg_id_metrics_df, arg_class_metrics_df = metric_tables

In [13]:
# Show the argument-identification scores (precision, recall, F1) produced by
# calculate_metrics.
# NOTE(review): the recorded run reports a recall of about 0.08 for this task,
# which is surprisingly low; investigate before relying on these numbers.
print("Argument Identification Task:")
arg_id_metrics_df

Argument Identification Task:


Unnamed: 0,Metric,Score
0,Precision,0.39547
1,Recall,0.076586
2,F1 Score,0.128321


In [14]:
# Show the per-label precision, recall and F1 for argument classification
# produced by calculate_metrics.
# NOTE(review): in the recorded run several labels (e.g. ARG3, ARG4, ARG5)
# score 0.0 on every metric and the remaining recalls are below 0.1.
print("Argument Classification Task:")
arg_class_metrics_df

Argument Classification Task:


Unnamed: 0,Label,Precision,Recall,F1 Score
0,ARG0,0.398649,0.035351,0.064942
1,ARG1,0.361644,0.041852,0.075021
2,ARG1-DSP,0.0,0.0,0.0
3,ARG2,0.254237,0.013599,0.025818
4,ARG3,0.0,0.0,0.0
5,ARG4,0.0,0.0,0.0
6,ARG5,0.0,0.0,0.0
7,ARGA,0.0,0.0,0.0
8,ARGM-ADJ,0.681818,0.069124,0.125523
9,ARGM-ADV,0.12766,0.012552,0.022857
