In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [27]:
def read_file(file):
    '''
    This function reads the file, pre-processes it, turns it into a pd dataframe and creates .csv
    
    :param file: the filepath 
    :type file: string
    '''
    # read file line by line
    with open(file, "r") as infile:
        content = infile.readlines()
    
    # delete lines that start with # 
    content = [x for x in content if not x.startswith('#')]
    
    # delete empty lines 
    content = [x for x in content if not x.startswith('\n')]
    
    # create dataframe by separating on tab spaces
    df = pd.DataFrame([x.split('\t') for x in content])
    
    # create headers
    headers1 = ['ID','TOKEN','LEMMA','POS-UNIV','POS','MORPH','HEAD','BASIC DEP','ENHANCED DEP','SPACE','PREDICATE']
    total_columns = len(df.columns)
    headers2 = [*range(0, total_columns-11, 1)]
    headers2 = ['LABELS P' + str(x) for x in headers2]
    headers_complete = headers1 + headers2
    
    # add headers to df
    df = df.set_axis(headers_complete, axis=1)
    
    # create csv file
    outputfilename = file.replace('.conllu', '.csv')
    outputfile = df.to_csv(outputfilename, sep=',')
    
    return df

In [29]:
df_train = read_file('en_ewt-up-train.conllu')
df_dev = read_file('en_ewt-up-dev.conllu')
df_test = read_file('en_ewt-up-test.conllu')

df_train.head(10)

Unnamed: 0,ID,TOKEN,LEMMA,POS-UNIV,POS,MORPH,HEAD,BASIC DEP,ENHANCED DEP,SPACE,...,LABELS P25,LABELS P26,LABELS P27,LABELS P28,LABELS P29,LABELS P30,LABELS P31,LABELS P32,LABELS P33,LABELS P34
0,1,Al,Al,PROPN,NNP,Number=Sing,0,root,0:root,SpaceAfter=No,...,,,,,,,,,,
1,2,-,-,PUNCT,HYPH,_,1,punct,1:punct,SpaceAfter=No,...,,,,,,,,,,
2,3,Zaman,Zaman,PROPN,NNP,Number=Sing,1,flat,1:flat,_,...,,,,,,,,,,
3,4,:,:,PUNCT,:,_,1,punct,1:punct,_,...,,,,,,,,,,
4,5,American,american,ADJ,JJ,Degree=Pos,6,amod,6:amod,_,...,,,,,,,,,,
5,6,forces,force,NOUN,NNS,Number=Plur,7,nsubj,7:nsubj,_,...,,,,,,,,,,
6,7,killed,kill,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,parataxis,1:parataxis,_,...,,,,,,,,,,
7,8,Shaikh,Shaikh,PROPN,NNP,Number=Sing,7,obj,7:obj,_,...,,,,,,,,,,
8,9,Abdullah,Abdullah,PROPN,NNP,Number=Sing,8,flat,8:flat,_,...,,,,,,,,,,
9,10,al,al,PROPN,NNP,Number=Sing,8,flat,8:flat,SpaceAfter=No,...,,,,,,,,,,


In [30]:
df_test.head()

Unnamed: 0,ID,TOKEN,LEMMA,POS-UNIV,POS,MORPH,HEAD,BASIC DEP,ENHANCED DEP,SPACE,...,LABELS P8,LABELS P9,LABELS P10,LABELS P11,LABELS P12,LABELS P13,LABELS P14,LABELS P15,LABELS P16,LABELS P17
0,1,What,what,PRON,WP,PronType=Int,0,root,0:root,_,...,,,,,,,,,,
1,2,if,if,SCONJ,IN,_,4,mark,4:mark,_,...,,,,,,,,,,
2,3,Google,Google,PROPN,NNP,Number=Sing,4,nsubj,4:nsubj,_,...,,,,,,,,,,
3,4,Morphed,morph,VERB,VBD,Mood=Ind|Tense=Past|VerbForm=Fin,1,advcl,1:advcl:if,_,...,,,,,,,,,,
4,5,Into,into,ADP,IN,_,6,case,6:case,_,...,,,,,,,,,,


In [31]:
df_dev.head()

Unnamed: 0,ID,TOKEN,LEMMA,POS-UNIV,POS,MORPH,HEAD,BASIC DEP,ENHANCED DEP,SPACE,...,LABELS P8,LABELS P9,LABELS P10,LABELS P11,LABELS P12,LABELS P13,LABELS P14,LABELS P15,LABELS P16,LABELS P17
0,1,From,from,ADP,IN,_,3,case,3:case,_,...,,,,,,,,,,
1,2,the,the,DET,DT,Definite=Def|PronType=Art,3,det,3:det,_,...,,,,,,,,,,
2,3,AP,AP,PROPN,NNP,Number=Sing,4,obl,4:obl:from,_,...,,,,,,,,,,
3,4,comes,come,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,0,root,0:root,_,...,,,,,,,,,,
4,5,this,this,DET,DT,Number=Sing|PronType=Dem,6,det,6:det,_,...,,,,,,,,,,


In [32]:
def additional_features(df, file):
    '''
    This function extracts features additional features, adds them to the dataframe 
    
    :param df:
    :type df: pandas dataframe
    '''
    
    # previous part-of-speech tag
    prev_pos = df['POS'].shift()
    df.insert(5, "PREV POS", prev_pos)
    
    # next part-of-speech tag
    next_pos = df['POS'].shift(-1)
    df.insert(6, "NEXT POS", next_pos)
    
    # create csv file
    outputfilename = file.replace('.conllu', '.addfeatures.csv')
    outputfile = df.to_csv(outputfilename, sep=',')
    
    return df

additional_features(df_train, 'en_ewt-up-train.conllu')
df_train.head()

Unnamed: 0,ID,TOKEN,LEMMA,POS-UNIV,POS,PREV POS,NEXT POS,MORPH,HEAD,BASIC DEP,...,LABELS P25,LABELS P26,LABELS P27,LABELS P28,LABELS P29,LABELS P30,LABELS P31,LABELS P32,LABELS P33,LABELS P34
0,1,Al,Al,PROPN,NNP,,HYPH,Number=Sing,0,root,...,,,,,,,,,,
1,2,-,-,PUNCT,HYPH,NNP,NNP,_,1,punct,...,,,,,,,,,,
2,3,Zaman,Zaman,PROPN,NNP,HYPH,:,Number=Sing,1,flat,...,,,,,,,,,,
3,4,:,:,PUNCT,:,NNP,JJ,_,1,punct,...,,,,,,,,,,
4,5,American,american,ADJ,JJ,:,NNS,Degree=Pos,6,amod,...,,,,,,,,,,


In [24]:
#Only return the columns with a substring in the column name
def return_columns(df, substring):
    '''
    This function returns the columns with a substring in the column name
    
    :param df:
    :type df: pandas dataframe
    
    :param substring:
    :type substring: string
    '''
    return df.loc[:, df.columns.str.contains(substring)]

labels = return_columns(df_train, 'LABELS')
labels.head()

Unnamed: 0,LABELS P0,LABELS P1,LABELS P2,LABELS P3,LABELS P4,LABELS P5,LABELS P6,LABELS P7,LABELS P8,LABELS P9,...,LABELS P25,LABELS P26,LABELS P27,LABELS P28,LABELS P29,LABELS P30,LABELS P31,LABELS P32,LABELS P33,LABELS P34
0,_\n,,,,,,,,,,...,,,,,,,,,,
1,_\n,,,,,,,,,,...,,,,,,,,,,
2,_\n,,,,,,,,,,...,,,,,,,,,,
3,_\n,,,,,,,,,,...,,,,,,,,,,
4,_\n,,,,,,,,,,...,,,,,,,,,,


In [34]:
#Return the columns without a substring in the column name
def return_columns_without(df, substring):
    '''
    This function returns the columns without a substring in the column name
    
    :param df:
    :type df: pandas dataframe
    
    :param substring:
    :type substring: string
    '''
    return df.loc[:, ~df.columns.str.contains(substring)]

features = return_columns_without(df_train, 'LABELS')

In [41]:
from sklearn.preprocessing import LabelEncoder

# assume X_train and X_test are numpy arrays with string features
encoder = LabelEncoder()
encoder.fit(features)
X_train_encoded = encoder.transform(features.values)

#Create logistic regression model and fit it to the data
def create_model(features, labels):
    '''
    This function creates a logistic regression model and fits it to the data
    
    :param df:
    :type df: pandas dataframe
    
    :param labels:
    :type labels: pandas dataframe
    '''
    #Create logistic regression model
    logreg = LogisticRegression()


    #Fit the model to the data
    logreg.fit(features, labels.values)
    
    return logreg

#Train the model
logreg = create_model(X_train_encoded, labels)

#Predict the labels for the dev set
predictions = logreg.predict(df_dev)

#Create a dataframe with the predictions
df_predictions = pd.DataFrame(predictions)

#Create a dataframe with the correct labels
df_correct_labels = return_columns(df_dev, 'LABELS')

#Calculate the accuracy
accuracy = (df_predictions == df_correct_labels).mean().mean()
print(accuracy)

