# Preprocessing
This part loads the data and prepares it for vectorization and classification.

In [2]:
########## import all required libraries ############

# could also use the nltk one, I cannot download any package from there somehow
from stop_words import get_stop_words
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm
from stop_words import get_stop_words
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from os.path import join
import string
from unidecode import unidecode
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import random
from gensim.models import Word2Vec

Global variables are loaded here. Among these are a library of english stop words that might need to be filtered out, a string of specific punctuation symbols that need to be filtered out, a library of contractions that can be seperated in their full words. 
The list of the 5 possible labels is created.

In [3]:
########## Global Variables ###########

# Define the stop_words library as english
stop_words = get_stop_words('english')

# Define a string with all punctuations
punctuations = '''!()-[]{};:'"\,=<>./?@#$%^&*_~'''

# Define a library of contractions
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

# create a list of the possible lables
labels = ["background", "objective", "methods", "results", "conclusions"]


Different functions are created to tailor the cleaning of the text. Based on the endgoal, in this case, classifying a sentence from a scientific abstract as being from which header/part of the abstract, you might want to keep or get rid of specific words/wordparts. We have focussed specifically on scientific symbols and specific numbers that might occur more often in results or conclusion sections. Furthermore we included a function for stop-words, the WordNetLemmatizer, contractions, punctuations and accentuations.

In [4]:
########## Functions used for data cleaning. ###########

# This function replaces specific symbols that are important for scientific context in strings so they are not removed.
def special_symbol_replacer(sentence_list):
    lemmatizer = {
        '%': 'percentage',
        '>': 'larger',
        '<': 'smaller',
        '+': 'plus',
        '=': 'equals',
        'n': 'amount',
        '/': 'slash'}
    new_sentence = []
    for word in sentence_list:
        if word in lemmatizer:
            word = lemmatizer[word]
        new_sentence.append(word)
    return new_sentence

# Function that replaces contractions with the two seperate words.
def replace_contraction(list):
    new_sentence = []
    for word in list:
        if word in contraction_dict:
            new_word = contraction_dict[word]
            new_sentence.append(new_word)
        else:
            new_sentence.append(word)
    return new_sentence


# Function to handle numbers. Turns them into a string defining a specific category: 'integer', 'float', 'fraction'.
# It ignores any letter/number combination words
def handle_nums(sentence_list):
    sentence_list = list(filter(lambda word: len(word) != 0, sentence_list))
    output = []
    for word in sentence_list:
        if any(char.isdigit() for char in word):  # if there is a number in the word
            if any(char.isalpha() for char in word):   # if there is also a letter in the word, ignore.
                continue
            if '.' in word:
                output.append('float')
            elif '/' in word:
                output.append('fraction')
            else:
                output.append('integer')
        else:
            output.append(word)
    return output


# Function to handle dashes. Removes the dash and returns a word splitted by a dash in two words
def handle_dash(sentence_list):
    output = []
    for word in sentence_list:
        output += word.split('-')
    return output


# Function removes any single letter words from the text.
def remove_singles(sentence_list):
    return list(filter(lambda word: not(len(word) == 0 and word.isalpha()), sentence_list))


# Function to perform lemmatization on the text. The lemmatizer needs to be defined elsewhere
def lemmatizer(list):
    # Define the lemmatizer as the WordNetLemmatizer from NLTK
    my_lemmatizer = WordNetLemmatizer()
    output = []
    for word in list:
        new_word = my_lemmatizer.lemmatize(word)
        output.append(new_word)
    return output


# Function that removes all remaining punctuations
def remove_punctuation(list):
    output=[]
    for word in list:
        new_word = ""
        for letter in word:
            if letter not in string.punctuation:
                new_word += letter
        output.append(new_word)
    return output


# Function that removes all accentuated characters.
def remove_accented_chars(list):
    output=[]
    for word in list:
        output.append(unidecode(word))
    return output
        

# Function to return all the words from each sentence back into one single string.
def list_to_string(sentence):
    return ' '.join(word for word in sentence)



The proprocessing function is defined. It takes as an input a text document and returns two lists of length n, containing labels and the cleaned sentences respectively in corresponding order.
The cleaning functions are placed in specific order. Particular cleaning functions can be included or excluded. After testing the cleaning functions on the baseline classifier, it showed that no cleaning at all resulted in the best weighted F1 scores.
A full cleaning would reduce the amount of unique words in the training text by roughly 11%.

In [5]:
# Function that reads whole text files, selects and splits labels and sentences, and cleans the sentences.
def preprocess_text(text):
    output_labels = []  # define an empty list to store the labels
    output_sentences = []  # define an empty list to store the sentences

    for line in tqdm(text):
        lowers = line.lower()  # puts all letters in text in lowercase
        splitted = lowers.split()  # splits the sentence in a list of words

        # select only the relevant parts of the text
        if len(splitted) > 0:  # ignores all empty lines
            # ignores all sentences that do not start with a predifined label
            if splitted[0] not in labels:
                continue
            else:
                # split the sentence into its label and the sentence:
                label = splitted[0]
                labelnum = labels.index(label)
                word_list = splitted[1:]

                # Cleaning functions 
                word_list = replace_contraction(word_list)  # replaces contractions with full words
                word_list = handle_nums(word_list)  # handeles the numbers in the text
                word_list = special_symbol_replacer(word_list) # replaces symbol for text
                word_list = handle_dash(word_list)  # handles words with dashes
                word_list = remove_punctuation(word_list)   # removes punctuations
                word_list = remove_accented_chars(word_list)   # removes accentuation
                word_list = [word for word in word_list if not word in stop_words]
                word_list = remove_singles(word_list)  # removes single letter words
                word_list = [word for word in word_list if not len(word) == 0]  # removes empty strings
#                 word_list = lemmatizer(word_list)  # Performs lemmatization

                
            # Put the obtained labels and processed text in corresponding lists.
            output_labels.append(labelnum)
            output_sentences.append(list_to_string(word_list))
    return output_labels, output_sentences

Defines the get_data function to load the different data files and run the preprocessing function linewise.

In [6]:

# Define the path that stores the text files.
path = "./PubMed_200k_RCT"


# Returns [labels, sentences] pair. set type: 'test', 'dev' or 'train'
def get_data(set_type='test'):
    with open(join(path, f'{set_type}.txt'), "r") as f:
        data = f.readlines()
    return preprocess_text(data)


Load the data of the train, test and dev file and save their labels and cleaned sentences as labels anc corpus respectively.

# Word2Vec


## Embedding model - training  

In [7]:
# w2v hyperparameters: 
vec_size = 100
window = 7
epochs = 15

In [8]:
print('Preprocessing train data...', end=' ', flush=True)
labels_train, corpus_train = get_data('train')
labels_valid, corpus_valid = get_data('dev')
labels_test, corpus_test = get_data('test')
print('done.', flush=True)

Preprocessing train data... 

100%|██████████| 2593169/2593169 [04:02<00:00, 10697.29it/s]
100%|██████████| 33932/33932 [00:03<00:00, 10759.31it/s]
100%|██████████| 34493/34493 [00:03<00:00, 10944.38it/s]

done.





In [9]:
# print('Training w2v model...', end=' ', flush=True)
# model = Word2Vec(sentences=corpus_train, vector_size=vec_size,
#                  window=window, min_count=1, workers=4, epochs=epochs)
# model.save(
#     "trained_models/word2vec.model")
# print('done.', flush=True)
model = Word2Vec.load("trained_models/word2vec.model")

## Embedding model - evaluation

We chose the hyperparameters for the w2v model based on the following evaluation, which yielded the parameters above

this was the output from the best model: 

word2vec_100_7_15.model: 

((0.3767671252007464, 3.123816278482829e-12), SpearmanrResult(correlation=0.3488502741199042, pvalue=1.3738505633123715e-10), 9.34844192634561)

In [37]:
# take a look at the eval metrics from word2vec word pairs: 
word_sim = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
print('output from word pairs: ')
print(word_sim)

NameError: name 'datapath' is not defined

## Classification with word2vec embedding - preprocessing


We first need to preprocess the data by embedding the words and averaging them, filtering out the sentences that could not be embedded, due to being empty or only having words that are not in the dictionary

In [38]:
# map text to vectors and average each sentence to one vector:
all_vector_texts = []
all_labels = []
not_in_model = []
for text, labels in zip([corpus_train, corpus_valid, corpus_test], [labels_train, labels_valid,labels_test]):
    delete_labels = []
    vector_text = []
    for i,sentence in enumerate(text):
        # assert that sentence is not empty:
        if sentence == []:
            delete_labels.append(i)
            continue

        feature_vec = []
        for word in sentence:
            try:
                feature_vec.append(w2v_model.wv[word])
            except Exception:
                not_in_model.append(word)

        mean = np.array(feature_vec).mean(axis=0)
        # also get rid of nan means due to only unknown words (example: sentence with 1 word not in dict)
        if np.shape(mean) != (vec_size,):
            delete_labels.append(i)
            continue

        vector_text.append(mean)
    # delete labels:
    for i in sorted(delete_labels, reverse=True):
        del labels[i]

    all_vector_texts.append(vector_text)
    all_labels.append(labels)

train_vector_text = all_vector_texts[0]
val_vector_text = all_vector_texts[1]
test_vector_text = all_vector_texts[2]

train_labels = all_labels[0]
val_labels = all_labels[1]
test_labels = all_labels[2]

  mean = np.array(feature_vec).mean(axis=0)
  ret = ret.dtype.type(ret / rcount)


## Classification with word2vec embedding - Logistic Regression



Trying out logistic regression on top of the averaged embeddings. We did a grid search to find the parameters that worked best. (l2 penalty , C=5, class weigths = None, F1_weighted score for val and test respectively: 0.746, 0.749) Sadly we did not manage to yield better result here, than on the baseline. 

We also tried varying the word2vec models here, finding that vec_size=300 worked best here, although it performed worse on the word pairs task. 

In [40]:
log_reg = LogisticRegression(solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
log_reg.fit(np.array(train_vector_text), np.array(train_labels))


ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
# evaluate on val and test: 
def evaluate(model, X, y):
    y_pred = model.predict(X)
    micro = f1_score(y, y_pred, average='micro')
    macro = f1_score(y, y_pred, average='macro')
    weighted = f1_score(y, y_pred, average='weighted')
    # samples = f1_score(y, y_pred, average='samples')
    print(f'F1 Score: micro {micro}, macro {macro}, weighted {weighted}')

In [None]:
evaluate(log_reg, val_vector_text, val_labels)
evaluate(log_reg, test_vector_text, test_labels)

# save the model
model_name_logreg = 'log_reg.sav'
pickle.dump(log_reg, open(model_name_logreg, 'wb'))


In [None]:
# load the model and evaluate: 
# you can also load the model here and evaluate the results: 
loaded_model = pickle.load(open(model_name_logreg, 'rb'))
evaluate(loaded_model, val_vector_text, val_labels)
evaluate(loaded_model, test_vector_text, test_labels)

## Classification with word2vec embedding - SVC

We also tried out SVCs on top of the averaged embedding. They performed very promising, when training them on the validation set and evaluating on test, but we did not manage to train them on the full dataset due to computational and time limits. 
For training on validation, we also did a small hyperparameter search yielding the following parameters: 
kernel = 'rbf', C=2
F1_weighted score on test set: 0.77

In [None]:
svc = SVC(gamma='auto', random_state=0, C=2,verbose=True, kernel='rbf')
svc.fit(np.array(val_vector_text), np.array(val_labels))

In [None]:
# evaluate as above
evaluate(svc, test_vector_text, test_labels)

#save the model
model_name_svc = 'svc_val.sav'
pickle.dump(svc, open(model_name_svc, 'wb'))

In [None]:
# load the provided model and evaluate: 
# load the model and evaluate: 
# you can also load the model here and evaluate the results: 
loaded_model = pickle.load(open(model_name_svc, 'rb'))
evaluate(loaded_model, test_vector_text, test_labels)

## Classification with word2vec embedding - Fully Connected Neural Network

Our last attempt was to feed the averaged embeddings into a neural network. This seemed very promising, but the results did not look very well here. We suspect that further fine tuning of the model would have been needed here. 
The resulting F1_weighted score was 0.29. We would have wished to further investigate the issue here, which was not possible due to time constraints. 

In [None]:
# custom f1_weighted: 
def f1_weighted(label, pred):
    label = K.cast(K.flatten(label), 'int32')
    true = K.one_hot(label, num_classes)
    pred_labels = K.argmax(pred, axis=-1)
    pred = K.one_hot(pred_labels, num_classes)

    ground_positives = K.sum(true, axis=0) + K.epsilon()  # = TP + FN
    pred_positives = K.sum(pred, axis=0) + K.epsilon()  # = TP + FP
    true_positives = K.sum(true * pred, axis=0) + K.epsilon()  # = TP
    # all with shape (4,)

    precision = true_positives / pred_positives
    recall = true_positives / ground_positives

    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())

    weighted_f1 = f1 * ground_positives / K.sum(ground_positives)
    weighted_f1 = K.sum(weighted_f1)

    return weighted_f1


In [None]:
# defining the nn
fc_model = Sequential()
fc_model.add(Dense(64,input_dim=vec_size, activation='relu'))
fc_model.add(Dense(16, activation='relu'))
fc_model.add(Dense(5, activation='softmax'))
fc_model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=[f1_weighted])

In [None]:
# training the nn with callbacks and checkpoints
model_name = 'fc_w2v_model'
timestr = time.strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f'./logs/{model_name}_{timestr}', update_freq='batch')
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath='fc_w2v_checkpoint', save_best_only=True, monitor='val_loss',
                                                save_format='tf')
fc_model.fit(np.array(train_vector_text).reshape(-1, 100), np.array(train_labels), epochs=50,
             validation_data=(np.array(val_vector_text).reshape(-1, 100), np.array(val_labels)),
             batch_size=64, callbacks=[tensorboard_callback, checkpoint])
fc_model.save(model_name, save_format='tf')


In [None]:
# load the trained model and evaluate: 
loaded_model = tf.keras.models.load_model(model_name)
# evaluate:
val_results = model.evaluate(np.array(val_vector_text).reshape(-1, 100), np.array(val_labels))
test_results = model.evaluate(np.array(test_vector_text).reshape(-1, 100), np.array(test_labels))

print(val_results)
print(test_results)

# LSTM training

### Import

In [10]:
import keras.backend as K
from gensim.models import Word2Vec
import numpy as np
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.utils import Sequence
from tensorflow.keras import backend as K

Removing sentences shorter than 10 words.

In [11]:
def clead_data(labels, corpus):
    for i, s in enumerate(corpus):
        if len(s) < 10:
            corpus.pop(i)
            labels.pop(i)
    assert(len(labels) == len(corpus))
    return labels, corpus

### Model
The model takes a strings as the input and outputs integer coressponding to the predicted classes. It consists of an encoder as the first layer, which converts strings into integer lists, word2vec embedding as the second one, one LSTM layer, one densely connected with ReLu activation and softmax classifier at the end. The embedding layes is initialized using the W matrix from a pretrained gensim model.

In [24]:
def create_LSTM_model():
    # Decoders transforms sentences into lists of integers.
    encoder = TextVectorization(max_tokens=model.wv.vectors.shape[0])
    encoder.adapt(corpus_valid)
    
    LSTM_model = Sequential()
    LSTM_model.add(encoder)
    LSTM_model.add(Embedding(input_dim=model.wv.vectors.shape[0], output_dim=model.wv.vectors.shape[1],
                        embeddings_initializer=Constant(model.wv.vectors), trainable=False, mask_zero=True))
    LSTM_model.add(LSTM(units=model.wv.vectors.shape[1]))
    LSTM_model.add(Dense(units=64, activation='relu'))
    LSTM_model.add(Dense(units=5, activation='softmax'))
    
    LSTM_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[])
    
    return LSTM_model

### Training

In [25]:
def train_LSTM(model):
    history = model.fit(x=corpus_train, y=labels_train, validation_data=(corpus_valid, labels_valid),
                        epochs=5)

    model.save('LSTM_model.h5')
    return history

In [26]:
labels_train, corpus_train = clead_data(labels_train, corpus_train)
labels_valid, corpus_valid = clead_data(labels_valid, corpus_valid)
LSTM_model = create_LSTM_model()
history = train_LSTM(LSTM_model)

MemoryError: Unable to allocate 12.2 GiB for an array with shape (2207185,) and data type <U1480

### Comments
The metrics and loss values indicates that the model is converging; however, after several thousands of samples it throws the following error: ###ERROR### LOG Due to a very long training time we didn't manage to find the source of these error. 