In [1]:
# IMPORT STATEMENTS
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# if the data files are gzipped, the libraries below can be used to unzip them
import gzip
import shutil
import xml.etree.ElementTree as XET

# These files come from PolEval 2017 and contain the data for the task (supervised learning models).
# Forward slashes are used as separators so the relative paths resolve on Windows
# as well as on POSIX systems (a backslash is only a separator on Windows).
training_file = '../data/training.xml'
validation_file = '../data/validate.xml'
testing_file = '../data/test.xml'

# feature_extraction(data_file, orth_or_lemma, ctag_or_full)
This function extracts the orth/lemma and the ctag/full ctag of every token in `data_file`, as selected by the second and third arguments.
It returns all the orths/lemmas, all the ctags/full ctags, and the number of tokens per chunk (sentence). In the two returned lists every chunk is padded with three `_` / `NONE` placeholders on each side.

Must pass 'orth' or 'lemma' for argument 2
Must pass 'ctag' or 'full' for argument 3

In [2]:
def feature_extraction(data_file, orth_or_lemma, ctag_or_full):
    """Read word forms and tags for every token of a chunked XML file.

    The file is parsed with ElementTree; every ``<chunk>`` directly under the
    root is treated as one sentence and every ``<tok>`` directly under a chunk
    as one token.

    Parameters
    ----------
    data_file : path or file object accepted by ``XET.parse``
        The XML document to read.
    orth_or_lemma : {'orth', 'lemma'}
        'orth' takes the text of the token's ``<orth>`` element, 'lemma' takes
        the text of ``<lex>/<base>``.
    ctag_or_full : {'ctag', 'full'}
        'full' takes the complete text of ``<lex>/<ctag>``; 'ctag' keeps only
        the part before the first ':'.

    Returns
    -------
    orths : list
        Word forms of all chunks in one flat list. Each chunk is surrounded
        by three "_" placeholders on either side.
    ctags : list
        Tags aligned index-by-index with ``orths``; the placeholder positions
        hold "NONE".
    tokens_in_chunk : list of int
        Number of real tokens (placeholders excluded) of each chunk, in
        document order.

    Raises
    ------
    ValueError
        If ``orth_or_lemma`` or ``ctag_or_full`` is not one of the supported
        values. Without this check an unsupported value would leave the
        per-token variables unassigned and fail later with an unrelated
        ``UnboundLocalError``.
    """
    if orth_or_lemma not in ('orth', 'lemma'):
        raise ValueError(
            "orth_or_lemma must be 'orth' or 'lemma', got {!r}".format(orth_or_lemma))
    if ctag_or_full not in ('ctag', 'full'):
        raise ValueError(
            "ctag_or_full must be 'ctag' or 'full', got {!r}".format(ctag_or_full))

    orths = []
    ctags = []
    tokens_in_chunk = []

    tree = XET.parse(data_file)
    root = tree.getroot()

    # highest level tag in the xml (one chunk is one sentence)
    for chunk in root.findall('chunk'):
        curr_chunk = 0

        # leading padding, so a context window around the first tokens of the
        # chunk stays inside this chunk's own section of the flat lists
        orths.extend(["_", "_", "_"])
        ctags.extend(["NONE", "NONE", "NONE"])
        for token in chunk.findall('tok'):
            # find() returns the first <lex> child only.
            # NOTE(review): if a token can carry several <lex> alternatives,
            # confirm that the first one is the intended reading.
            lex = token.find('lex')

            if orth_or_lemma == 'orth':
                orth = token.find('orth').text
            else:
                orth = lex.find('base').text

            ctag = lex.find('ctag').text
            if ctag_or_full == 'ctag':
                # keep only the part of the tag before the first ':'
                ctag = ctag.split(':', 1)[0]
            curr_chunk += 1

            orths.append(orth)
            ctags.append(ctag)
        # trailing padding, mirroring the leading one
        orths.extend(["_", "_", "_"])
        ctags.extend(["NONE", "NONE", "NONE"])
        tokens_in_chunk.append(curr_chunk)
    return orths, ctags, tokens_in_chunk

In [12]:
def generate_dataframe(orths_or_lemmas, ctags_or_fulls, in_chunks):
    """Turn the flat, padded token lists into a 7-word context-window table.

    For every real token one row is produced: the three preceding entries,
    the token itself and the three following entries of ``orths_or_lemmas``
    (columns ``a1`` .. ``a7``, with ``a4`` being the token), plus the entry of
    ``ctags_or_fulls`` at the token's position (column ``class``).

    The lists are expected to carry three placeholder entries before and
    after each chunk; ``in_chunks`` gives the number of real tokens per chunk
    and is used to step over those placeholders.

    Returns a ``pandas.DataFrame`` with columns
    ``a1, a2, a3, a4, a5, a6, a7, class``.
    """
    window = range(-3, 4)  # offsets of a1..a7 relative to the current token
    rows = []
    position = 3  # index of the first real token, just past the leading padding

    for chunk_len in in_chunks:
        token_positions = range(position, position + chunk_len)
        for idx in token_positions:
            context = [orths_or_lemmas[idx + offset] for offset in window]
            rows.append(tuple(context) + (ctags_or_fulls[idx],))
        # move past this chunk's tokens, its trailing padding (3) and the
        # next chunk's leading padding (3)
        position += len(token_positions) + 6

    column_names = ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7', 'class']
    return pd.DataFrame(rows, columns=column_names)

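Uses sklearn's `LabelEncoder` to convert the feature columns to integer codes; the `class` column is converted to integer codes through pandas' categorical dtype.
`df` is a dataframe as produced by `generate_dataframe`.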

In [4]:
def labels_and_forFitting(df):
    """Split ``df`` into integer class labels and an integer feature matrix.

    The ``class`` column is converted to pandas category codes. All remaining
    columns are flattened, encoded with the module-level ``LabelEncoder``
    ``le`` and reshaped back to one row per row of ``df``.

    Note that ``le`` is (re)fitted here on the values of this ``df`` only, so
    the shared encoder's previous fit is replaced by this call.

    Returns ``(labels, features)``: a 1-D array of class codes and a 2-D
    array of feature codes.
    """
    class_codes = df['class'].astype("category").cat.codes
    labels = np.asarray(class_codes.tolist())

    values = df.drop(columns=['class']).values  # just the feature columns
    flat_values = values.ravel()

    # learn one integer code per distinct feature value and apply it at once
    encoded = le.fit_transform(flat_values)
    # back to one row per data point
    features = encoded.reshape(values.shape[0], -1)

    return labels, features

In [5]:
def buildClassifiers(clf, X_train, X_test, y_train, y_test):
    """Train ``clf`` and score its predictions on the held-out data.

    ``X_train`` / ``X_test`` are 2-D feature arrays (one row per data point,
    one column per feature); ``y_train`` / ``y_test`` are the 1-D label arrays
    aligned with their rows. ``clf`` is fitted in place on the training data.

    Returns ``(f1, precision, recall)``, each micro-averaged over all classes.
    """
    # fit() trains the model on the training rows
    clf.fit(X_train, y_train)

    # predictions for the data the model has not seen
    y_pred = clf.predict(X_test)

    # same three metrics, in the order they are returned
    scorers = (f1_score, precision_score, recall_score)
    f1, precision, recall = [
        scorer(y_test, y_pred, average="micro") for scorer in scorers
    ]

    return f1, precision, recall

The following block is used to check that the models work as they should. The training set is used to train each model, which is then evaluated on the validation set.

In [13]:
# Build the context-window dataframes for the training and validation files,
# then encode both together so that they share one set of integer codes.

# all data for training
orths_train, ctags_train, in_chunks_train = feature_extraction(training_file, 'orth', 'ctag')
df_train = generate_dataframe(orths_train, ctags_train, in_chunks_train)
# NOTE: this call fits the shared encoder `le` on the training features only;
# `le` is refitted again further down in this cell.
labels_train, trained_vals = labels_and_forFitting(df_train)

# all data for validation
orths_validate, ctags_validate, in_chunks_validate = feature_extraction(validation_file, 'orth', 'ctag')
df_validate = generate_dataframe(orths_validate, ctags_validate, in_chunks_validate)
# NOTE: same as above, but fitted on the validation features only
labels_validate, validated = labels_and_forFitting(df_validate)

# Stack train on top of validate: the training rows come first, the validation
# rows follow, so the two parts can be recovered by position.
df_total = pd.concat([df_train, df_validate], axis=0)
# class labels as category codes computed over train + validate together
labels_total = np.asarray(df_total['class'].astype("category").cat.codes.tolist())
X_vals_total = df_total.drop(columns=['class']).values # to experiment with fewer features, drop further columns here (always drop 'class', plus e.g. 'a1', 'a3')

# Fit the encoder on every feature value of train + validate, so that a word
# gets the same code in both parts, then restore the row-per-token shape.
le.fit(X_vals_total.ravel())
X_total = le.transform(X_vals_total.ravel())
X_total = X_total.reshape(X_vals_total.shape[0], -1)

# Row counts of the two parts of df_total / X_total / labels_total:
# total_train_size is the number of training rows only (NOT train + validate),
# total_validate_size is the number of validation rows that follow them.
total_train_size = len(df_train)
total_validate_size = len(df_validate)
# train + validate combined, for training the final model before the test set:
# total_final_train = len(df_total)
# total_final_train = len(df_total)

In [14]:
# Train each classifier on the training rows and score it on the validation rows.
names = ['Naive_Bayes', 'Decision_Tree']
classifiers = [GaussianNB(), DecisionTreeClassifier(random_state=0)]

for name, clf in zip(names, classifiers):
    print('Now classifying', name)
    # score collectors; with a single train/validate split each holds one value
    aList, bList, cList = [], [], []

    # the first total_train_size rows are the training part, the following
    # total_validate_size rows are the validation part
    validate_end = total_train_size + total_validate_size
    X_train = X_total[0:total_train_size]
    X_test = X_total[total_train_size:validate_end]
    Y_train = labels_total[0:total_train_size]
    Y_test = labels_total[total_train_size:validate_end]

    f1, precision, recall = buildClassifiers(clf, X_train, X_test, Y_train, Y_test)
    for scores, value in ((aList, f1), (bList, precision), (cList, recall)):
        scores.append(value)

    print("\tAverage F1 for {}:\t\t".format(name), np.mean(aList))
    print("\tAverage Precision for {}:\t".format(name), np.mean(bList))
    print("\tAverage Recall for {}:\t\t".format(name), np.mean(cList))

Now classifying Naive_Bayes
	Average F1 for Naive_Bayes:		 0.40082198503251143
	Average Precision for Naive_Bayes:	 0.4008219850325114
	Average Recall for Naive_Bayes:		 0.4008219850325114
Now classifying Decision_Tree
	Average F1 for Decision_Tree:		 0.8246636404531141
	Average Precision for Decision_Tree:	 0.8246636404531141
	Average Recall for Decision_Tree:		 0.8246636404531141
