In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [3]:
def read_file(file):
    '''
    This function reads the file, pre-processes it, turns it into a pd dataframe and creates .csv
    
    :param file: the filepath 
    :type file: string
    '''
    # read file line by line
    with open(file, "r") as infile:
        content = infile.readlines()
    
    # delete lines that start with # 
    content = [x for x in content if not x.startswith('#')]
    
    # delete empty lines 
    content = [x for x in content if not x.startswith('\n')]
    
    # create dataframe by separating on tab spaces
    df = pd.DataFrame([x.split('\t') for x in content])
    
    # create headers
    headers1 = ['ID','TOKEN','LEMMA','POS-UNIV','POS','MORPH','HEAD','BASIC DEP','ENHANCED DEP','SPACE','PREDICATE']
    total_columns = len(df.columns)
    headers2 = [*range(0, total_columns-11, 1)]
    headers2 = ['LABELS P' + str(x) for x in headers2]
    headers_complete = headers1 + headers2
    
    # add headers to df
    df = df.set_axis(headers_complete, axis=1)
    
    # create csv file
    outputfilename = file.replace('.conllu', '.csv')
    outputfile = df.to_csv(outputfilename, sep=',')
    
    return df

In [4]:
df_train = read_file('en_ewt-up-train.conllu')
df_dev = read_file('en_ewt-up-dev.conllu')
df_test = read_file('en_ewt-up-test.conllu')

df_train.head(10)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 6770: character maps to <undefined>

In [None]:
df_test.head()

In [None]:
df_dev.head()

In [5]:
def additional_features(df, file):
    '''
    This function extracts features additional features, adds them to the dataframe 
    
    :param df:
    :type df: pandas dataframe
    '''
    
    # previous part-of-speech tag
    prev_pos = df['POS'].shift()
    df.insert(5, "PREV POS", prev_pos)
    
    # next part-of-speech tag
    next_pos = df['POS'].shift(-1)
    df.insert(6, "NEXT POS", next_pos)
    
    # create csv file
    outputfilename = file.replace('.conllu', '.addfeatures.csv')
    outputfile = df.to_csv(outputfilename, sep=',')
    
    return df

additional_features(df_train, 'en_ewt-up-train.conllu')
df_train.head()

NameError: name 'df_train' is not defined

In [6]:
#Only return the columns with a substring in the column name
def return_columns(df, substring):
    '''
    This function returns the columns with a substring in the column name
    
    :param df:
    :type df: pandas dataframe
    
    :param substring:
    :type substring: string
    '''
    return df.loc[:, df.columns.str.contains(substring)]

labels = return_columns(df_train, 'LABELS')
labels.head()

NameError: name 'df_train' is not defined

In [34]:
#Return the columns without a substring in the column name
def return_columns_without(df, substring):
    '''
    This function returns the columns without a substring in the column name
    
    :param df:
    :type df: pandas dataframe
    
    :param substring:
    :type substring: string
    '''
    return df.loc[:, ~df.columns.str.contains(substring)]

features = return_columns_without(df_train, 'LABELS')

In [1]:

# assume X_train and X_test are numpy arrays with string features
encoder = LabelEncoder()
encoder.fit(features)
X_train_encoded = encoder.transform(features.values)

#Create logistic regression model and fit it to the data
def create_model(features, labels):
    '''
    This function creates a logistic regression model and fits it to the data
    
    :param df:
    :type df: pandas dataframe
    
    :param labels:
    :type labels: pandas dataframe
    '''
    #Create logistic regression model
    logreg = LogisticRegression()


    #Fit the model to the data
    logreg.fit(features, labels.values)
    
    return logreg

#Train the model
logreg = create_model(X_train_encoded, labels)

#Predict the labels for the dev set
predictions = logreg.predict(df_dev)

#Create a dataframe with the predictions
df_predictions = pd.DataFrame(predictions)

#Create a dataframe with the correct labels
df_correct_labels = return_columns(df_dev, 'LABELS')

#Calculate the accuracy
accuracy = (df_predictions == df_correct_labels).mean().mean()
print(accuracy)



NameError: name 'features' is not defined