In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multioutput import MultiOutputClassifier




In [2]:
file_train = 'en_ewt-up-train.conllu'
file_test = 'en_ewt-up-dev.conllu'
file_dev = 'en_ewt-up-test.conllu'

In [3]:
def read_file(file):
    '''
    This function reads the file, pre-processes it, turns it into a pd dataframe and creates .csv
    
    :param file: the filepath 
    :type file: string
    
    :return: dataframe 
    '''
    # read file line by line
    with open(file, "r") as infile:
        content = infile.readlines()
    
    # delete lines that start with # 
    content = [x for x in content if not x.startswith('#')]
    
    # delete empty lines 
    content = [x for x in content if not x.startswith('\n')]
    
    # create dataframe by separating on tab spaces
    df = pd.DataFrame([x.split('\t') for x in content])
    
    # create headers
    headers1 = ['ID','TOKEN','LEMMA','POS-UNIV','POS','MORPH','HEAD','BASIC DEP','ENHANCED DEP','SPACE','PREDICATE']
    total_columns = len(df.columns)
    headers2 = [*range(0, total_columns-11, 1)]
    headers2 = ['LABELS P' + str(x) for x in headers2]
    headers_complete = headers1 + headers2
    
    # add headers to df
    df = df.set_axis(headers_complete, axis=1)
    
    # create csv file
    outputfilename = file.replace('.conllu', '.csv')
    outputfile = df.to_csv(outputfilename, sep=',')
    
    return df

In [4]:
df_train = read_file(file_train)
df_dev = read_file(file_dev)
df_test = read_file(file_test)

df_train.head(10)

Unnamed: 0,ID,TOKEN,LEMMA,POS-UNIV,POS,MORPH,HEAD,BASIC DEP,ENHANCED DEP,SPACE,...,LABELS P25,LABELS P26,LABELS P27,LABELS P28,LABELS P29,LABELS P30,LABELS P31,LABELS P32,LABELS P33,LABELS P34
0,1,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,...,,,,,,,,,,
1,2,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,...,,,,,,,,,,
2,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,...,,,,,,,,,,
3,4,:,:,PUNCT,:,_,1,punct,1:punct,_,...,,,,,,,,,,
4,5,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,...,,,,,,,,,,
5,6,forces,force,NOUN,NNS,Number=Plur,7,nsubj,7:nsubj,_,...,,,,,,,,,,
6,7,killed,kill,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,parataxis,1:parataxis,_,...,,,,,,,,,,
7,8,Shaikh,Shaikh,PROPN,NNP,Number=Sing,7,obj,7:obj,_,...,,,,,,,,,,
8,9,Abdullah,Abdullah,PROPN,NNP,Number=Sing,8,flat,8:flat,_,...,,,,,,,,,,
9,10,al,al,PROPN,NNP,Number=Sing,8,flat,8:flat,SpaceAfter=No,...,,,,,,,,,,


In [5]:
def merge_labels(df):
    '''
    This function merges the label columns into one gold column with the labels as lists
    :param df: the dataframe 
    :type df: pandas dataframe
    
    :return: updated dataframe
    '''
    
    total_columns = len(df.columns)
    df['GOLD LABELS'] = df.iloc[:,11:total_columns].values.tolist()
    df = df.drop(df.iloc[:,11:total_columns],axis = 1)
    
    return df

In [6]:
df_train = merge_labels(df_train)
df_test = merge_labels(df_test)
df_dev = merge_labels(df_dev)

In [7]:
df_dev.head()

Unnamed: 0,ID,TOKEN,LEMMA,POS-UNIV,POS,MORPH,HEAD,BASIC DEP,ENHANCED DEP,SPACE,PREDICATE,GOLD LABELS
0,1,What,what,PRON,WP,PronType=Int,0,root,0:root,_,_,"[_\n, None, None, None, None, None, None, None..."
1,2,if,if,SCONJ,IN,_,4,mark,4:mark,_,_,"[_\n, None, None, None, None, None, None, None..."
2,3,Google,Google,PROPN,NNP,Number=Sing,4,nsubj,4:nsubj,_,_,"[ARG1\n, None, None, None, None, None, None, N..."
3,4,Morphed,morph,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,advcl,1:advcl:if,_,morph.01,"[V\n, None, None, None, None, None, None, None..."
4,5,Into,into,ADP,IN,_,6,case,6:case,_,_,"[_\n, None, None, None, None, None, None, None..."


In [8]:
def clean_goldlabels(labels):
    '''
    This function deletes irrelevant characters from the gold labels 
    :param labels: the gold labels 
    :type labels: list
    
    :return: cleaned labels
    '''                
            
    labels = ['-' if l is None or l=='_\n' or l=='_' else l for l in labels]
    
    pattern = r"\n$"
    labels = [re.sub(pattern, '', s) for s in labels]
    
    labels = np.array(labels)
    
    return labels

In [9]:
df_train['GOLD LABELS'] = df_train['GOLD LABELS'].apply(clean_goldlabels)
df_test['GOLD LABELS'] = df_test['GOLD LABELS'].apply(clean_goldlabels)
df_dev['GOLD LABELS'] = df_dev['GOLD LABELS'].apply(clean_goldlabels)

In [10]:
df_train.head(10)

Unnamed: 0,ID,TOKEN,LEMMA,POS-UNIV,POS,MORPH,HEAD,BASIC DEP,ENHANCED DEP,SPACE,PREDICATE,GOLD LABELS
0,1,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,_,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
1,2,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,_,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
2,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,_,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
3,4,:,:,PUNCT,:,_,1,punct,1:punct,_,_,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
4,5,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,_,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
5,6,forces,force,NOUN,NNS,Number=Plur,7,nsubj,7:nsubj,_,_,"[ARG0, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
6,7,killed,kill,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,parataxis,1:parataxis,_,kill.01,"[V, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
7,8,Shaikh,Shaikh,PROPN,NNP,Number=Sing,7,obj,7:obj,_,_,"[ARG1, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
8,9,Abdullah,Abdullah,PROPN,NNP,Number=Sing,8,flat,8:flat,_,_,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."
9,10,al,al,PROPN,NNP,Number=Sing,8,flat,8:flat,SpaceAfter=No,_,"[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -, ..."


In [11]:
def additional_features(df, file):
    '''
    This function extracts features additional features, adds them to the dataframe 
    
    :param df:
    :type df: pandas dataframe
    :return: df with additional feature columns
    '''
    
    # previous part-of-speech tag
    prev_pos = df['POS'].shift()
    df.insert(5, "PREV POS", prev_pos)
    
    # next part-of-speech tag
    next_pos = df['POS'].shift(-1)
    df.insert(6, "NEXT POS", next_pos)
    
    # create csv file
    outputfilename = file.replace('.conllu', '.addfeatures.csv')
    outputfile = df.to_csv(outputfilename, sep=',')
    
    return df

In [12]:
df_train = additional_features(df_train, 'en_ewt-up-train.conllu')
df_test = additional_features(df_test, 'en_ewt-up-test.conllu')
df_dev = additional_features(df_dev, 'en_ewt-up-dev.conllu')

In [13]:
file_add_train = 'en_ewt-up-train.addfeatures.csv'
file_add_test = 'en_ewt-up-test.addfeatures.csv'
file_add_dev = 'en_ewt-up-dev.addfeatures.csv'

In [14]:
### code used from the course Machine Learning for NLP 

def extract_features_and_labels(train_file):
    '''
    This function extracts the features and labels from the training and development dataset 
    
    :param inputfile: filepath to either training dataset or development dataset
    :type inputfile: a string with the filepath 
    
    :returns: a dict of the features and a list of the labels 
    '''
    
    data = []
    targets = []

    for line in open(train_file, encoding='utf-8'):
        content = line.split(',')
        if len(content) > 10:
            token = content[2]
            lemma = content[3]
            pos_univ = content[4]
            pos = content[5]
            prev_pos = content[6]
            next_pos = content[7]
            morph = content[8]
            head = content[9]
            basic_dep = content[10]
            enh_dep = content[11]
            predicate = content[13]

            # create dict
            feature_dict = {'token': token,
                            'lemma': lemma,
                            'pos_univ': pos_univ,
                            'pos': pos,
                            'prev_pos': prev_pos,
                            'next_pos': next_pos, 
                            'morph': morph, 
                            'head': head,
                            'basic_dep': basic_dep,
                            'enh_dep': enh_dep,
                            'predicate': predicate}

            gold_label = list(content[14:])

            data.append(feature_dict)
            targets.append(gold_label)
            
    mlb = MultiLabelBinarizer()
    targets = mlb.fit_transform(targets)
    
        
    return data, targets

In [15]:
def extract_features(test_file):
    '''
    This function extracts the features from the test dataset 
    
    :param inputfile: filepath to either training dataset or development dataset
    :type inputfile: a string with the filepath 
    
    :returns: a dict of the features and a list of the labels 
    '''

    data = []

    for line in open(test_file, encoding='utf-8'):
        content = line.split(',')
        if len(content) > 10:

            # define columns
            token = content[2]
            lemma = content[3]
            pos_univ = content[4]
            pos = content[5]
            prev_pos = content[6]
            next_pos = content[7]
            morph = content[8]
            head = content[9]
            basic_dep = content[10]
            enh_dep = content[11]
            predicate = content[13]    

            # create dict
            feature_dict = {'token': token,
                            'lemma': lemma,
                            'pos_univ': pos_univ,
                            'pos': pos,
                            'prev_pos': prev_pos,
                            'next_pos': next_pos, 
                            'morph': morph, 
                            'head': head,
                            'basic_dep': basic_dep,
                            'enh_dep': enh_dep,
                            'predicate': predicate}

            data.append(feature_dict)
        
    return data

In [16]:
def extract_predictions(file):
    '''
    This function extracts the predictions from the last column of the given file 
    :param file: filepath 
    :type file: string 
    
    :return: list of predictions 
    '''
    
    predictions = []
    for line in open(file):

        content = line.split(',')

        predictions.append(content[-1].rstrip('\n'))
    return predictions
     

In [17]:
def classify_data(model, vec, input_file, outputfile):
    
    '''
    This function 
    
    :param model: the model to run 
    :type model: string
    :param vec: gold labels 
    :type vec: 
    '''
    
    features = extract_features(input_file)
    vec_features = vec.transform(features)
    predictions = model.predict(vec_features)

    predictions[0] = "prediction"

    outfile = open(outputfile, 'w')
    counter = 0
                           
    for line in open(input_file):
        outfile.write(line.rstrip('\n') + ',' + predictions[counter] + '\n')
        counter += 1
                           
    outfile.close()
    
def run_machine_learning_models(train_file, test_file, outputfile, model):
    
    ''' 
    This function runs the model by making use of the functions create_classifier and classify_data 
    :param train_file: filepath to the training dataset
    :type train_file: string
    :param test_file: filepath to the test dataset
    :type test_file: string
    :param outputfile: filepath where the output will be created 
    :type outputfile: string
    :param model: model to run 
    :type model: string
    '''
    
    features, gold_labels = extract_features_and_labels(train_file)
    print(len(features), len(gold_labels))

    print("Loading...", model)
    ml_model, vec = create_classifier(features, gold_labels, model)
    classify_data(ml_model, vec, test_file, outputfile)
    print("Method", model, "is done!")

    return gold_labels

def create_classifier(features, targets, modelname):
    '''
    Function that takes feature-value pairs and gold labels as input and trains a logistic regression classifier
    
    :param features: feature-value pairs
    :type features: a list of dictionaries
    :param targets: gold labels
    :type targets: a list of strings
    :param modelname: ml modelname to execute
    :type modelname: string
    :return model: a trained classifier
    :return vec: a DictVectorizer to which the feature values are fitted. 
    '''
  
    if modelname == "logreg":
        model = LogisticRegression(max_iter=1000)
  
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(features)
    model = MultiOutputClassifier(model).fit(features_vectorized, targets)

    return model, vec

In [None]:
gold_labels = run_machine_learning_models(file_add_train, file_add_test, 'predictions_logreg.csv', 'logreg')

204610 204610
Loading... logreg


In [None]:
y_pred_logreg = extract_predictions('predictions_logreg.csv')