In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np
import sys

In [2]:
def main():
    # Read tsv file using pandas and turn it into a dataframe
    # Read in dev / test set depeding on argument provided when running the python file
    
    #train_df 
    train_df = pd.read_csv('../data/intermediate/production_test_05_identifiedArguments.csv') 
    train_df = train_df.fillna('X')
    train_df[train_df['label_ident_prediction'] == True]
    
    #test_df
    test_df = pd.read_csv('../data/intermediate/production_test_05_identifiedArguments.csv')
    test_df = test_df.fillna('X')
    test_df[test_df['label_ident_prediction'] == True]
    
    train_instances = train_df[["sentenceId", "sentenceRepetition", "id", "form", "lemma", "upos", "xpos", "morph", "head", "dep", "head_dep", "space", "predicate_prediction", "label_ident_prediction", "passive", "full_constituent"]].to_dict('records')
    test_instances = test_df[["sentenceId", "sentenceRepetition", "id", "form", "lemma", "upos", "xpos", "morph", "head", "dep", "head_dep", "space", "predicate_prediction", "label_ident_prediction", "passive", "full_constituent"]].to_dict('records')
    
    vec = DictVectorizer()
    X_train = vec.fit_transform(train_instances)
    
    Y_train = train_df.label_gold.tolist()
    Y_test = test_df.label_gold.tolist()
    
    classifier = LinearSVC(max_iter = 10000)
    
    parameters = dict(C=(0.01, 0.1, 1.0), loss=('hinge', 'squared_hinge'), tol=(0.0001,0.001,0.01,0.1))
    
    grid = GridSearchCV(estimator=classifier, param_grid=parameters, cv=5, scoring='f1_macro')
    grid.fit(X_train, Y_train)
    classifier = grid.best_estimator_
    X_test = vec.transform(test_instances)
    predictions = classifier.predict(X_test)
    
    test_df['predictions'] = predictions
    test_df.to_csv('output_test_df.csv')
    report = pd.DataFrame(classification_report(y_true=test_df['label_gold'], y_pred=test_df['predictions'], output_dict=True)).transpose()
    print(report)

if __name__ == '__main__':
    main()





              precision    recall  f1-score      support
ARG0           0.877193  0.574713  0.694444    87.000000
ARG1           0.921053  0.526316  0.669856   133.000000
ARG2           0.956522  0.511628  0.666667    43.000000
ARG3           1.000000  0.600000  0.750000     5.000000
ARG4           1.000000  1.000000  1.000000     1.000000
ARGM-ADJ       0.000000  0.000000  0.000000    11.000000
ARGM-ADV       0.923077  0.428571  0.585366    28.000000
ARGM-CAU       1.000000  0.500000  0.666667     2.000000
ARGM-DIS       1.000000  0.857143  0.923077     7.000000
ARGM-GOL       1.000000  1.000000  1.000000     3.000000
ARGM-LOC       0.714286  0.500000  0.588235    10.000000
ARGM-LVB       1.000000  0.666667  0.800000     3.000000
ARGM-MNR       1.000000  0.666667  0.800000     6.000000
ARGM-MOD       0.800000  0.285714  0.421053    14.000000
ARGM-NEG       1.000000  0.666667  0.800000     9.000000
ARGM-PRD       1.000000  0.333333  0.500000     3.000000
ARGM-PRP       1.000000  1.0000

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
