!pip install sklearn

# Part-of-speech tagging

### Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pickle
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences



### Define constant variables

In [2]:
train_path = './Dataset/train/'# Should we use: os.path.join(os.getcwd(), "OriginalDataset")
val_path = './Dataset/val/'
test_path = './Dataset/test/'
dataset_folder = os.path.join(os.getcwd(), "OriginalDataset")
original_path = dataset_folder +'/dependency_treebank/'

embedding_dimension = 50


### Download dataste

In [3]:
import urllib.request  #  download files
import zipfile  #  unzip files

def download_dataset():
    dataset_folder = os.path.join(os.getcwd(), "OriginalDataset")

    if not os.path.exists(dataset_folder):
        os.makedirs(dataset_folder)

    url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

    dataset_path = os.path.join(dataset_folder, "dependency_treebank.zip")

    if not os.path.exists(dataset_path):
        urllib.request.urlretrieve(url, dataset_path)
        print("Successful download")

    with zipfile.ZipFile(dataset_path,"r") as zip_ref:
        zip_ref.extractall(dataset_folder)
    print("Successful extraction")

### Split dataset

In [4]:
def split_dataset(train_path, val_path, test_path, original_path):
    if not os.path.exists(train_path):
        print("making directory")
        os.makedirs(train_path)
    if not os.path.exists(val_path):
        os.makedirs(val_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)

    original_dataset = os.listdir(original_path)
    
    original_dataset.sort()
    org_train = original_dataset[0:100]
    org_val = original_dataset[100:150]
    org_test = original_dataset[150:]

    for f in org_train:
        os.rename(original_path+f, train_path+f)
    for f in org_val:
        os.rename(original_path+f, val_path+f)
    for f in org_test:
        os.rename(original_path+f, test_path+f)
    print("Successful spilt")
        

### Preprocessing

In [5]:
def lower(lst): # DO WE NEED THIS?
    lowercase_list = [x.lower() for x in lst]
    return lowercase_list

### Create dataframe

In [6]:
def create_dataframe(dataset_path: str):
    dataframe_rows = []
    documents = os.listdir(dataset_path)
    for document in documents:
        path = os.path.join(dataset_path, document)
        with open(path, 'r') as f:
            a = [[x for x in ln.split()] for ln in f]
        a2 = [x for x in a if x != []] # OBS! Removing all empty lines in file so we can make an array
        arr = np.array(a2)
        text = arr[:, 0]
        tagg = arr[:, 1]
        #POStuple =  [tagg[x] for x in range(len(text))] #[(text[x], tagg[x]) for x in range(len(text))]
        document_id = int(document[4:8])
        dataframe_row = {
            "document_id": document_id,
            "text": text,
            "POS": tagg
        }
        dataframe_rows.append(dataframe_row)
    # transform the list of rows in a proper dataframe
    df = pd.DataFrame(dataframe_rows)
    df = df[["document_id", "text", "POS"]]

    return df

### Embed the words using GloVe embedding

embeddings_dict = {}

#### Load glove model

In [7]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    # Check download
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model

#### Get vocabulary

In [8]:
def get_word_list(data_frame, lable):
    big_list = []
    text = data_frame[lable]
    for row in data_frame[lable]:
        big_list += row.tolist()

    word_list = set(big_list)
    return word_list

#### Find OOV

In [9]:
# Function definition

def check_OOV_terms(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                    word_list):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms
    """
    embedding_vocabulary = set(embedding_model.index_to_key) # Was previously: set(embedding_model.vocab.keys())
    oov = set(word_list).difference(embedding_vocabulary)
    return list(oov)


### Create word_to_index (for one-hot encoding)

In [10]:
from collections import OrderedDict

def create_word_to_idx(vocabulary):
    word_to_idx = OrderedDict()
    idx_to_word = OrderedDict()
    word_to_idx["PADDING"] = 0
    idx_to_word[0] = "PADDING"
    # Start from one. Index 0 is reserved for padding
    current_idx = 1
    for word in vocabulary:
        if word not in word_to_idx:
            word_to_idx[word] = current_idx
            idx_to_word[current_idx] = word
            current_idx += 1
            
    return word_to_idx, idx_to_word

def update_word_to_idx(old_idx_to_word, old_word_to_idx, new_words):
    word_to_idx = old_word_to_idx.copy()
    idx_to_word = old_idx_to_word.copy()
    current_idx = len(word_to_idx)
    for word in new_words:
        if word not in word_to_idx:
            word_to_idx[word] = current_idx
            idx_to_word[current_idx] = word
            current_idx += 1
            
    return word_to_idx, idx_to_word
    

In [11]:
def save_idx_word_list(idx_to_word,word_to_idx, write_path):
    if not os.path.exists(os.path.dirname(write_path)):
        try:
            os.makedirs(os.path.dirname(write_path))
        except OSError as exc: # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    with open(write_path, 'wb') as f:
        pickle.dump(obj={"word_to_idx": words_lexicon, "idx_to_word":idx_to_word} , file=f)

### Create embedding matrix

In [12]:
# OBS Computes the OOV with random embeddings
def create_embedding_matrix(embedding_model, embedding_dimension, word_to_idx):
    embedding_matrix = np.zeros((len(word_to_idx), embedding_dimension), dtype=np.float32)
    for word, idx in word_to_idx.items():
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector
                                
    return embedding_matrix

def expand_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, old_embedding_matrix):
    embedding_matrix = np.zeros((len(word_to_idx), embedding_dimension), dtype=np.float32)
    embedding_matrix[0:len(old_embedding_matrix)] = old_embedding_matrix
    
    for word, idx in word_to_idx.items():
        if idx >= len(old_embedding_matrix):
            try:
                embedding_vector = embedding_model[word]
            except (KeyError, TypeError):
                embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)
                
            embedding_matrix[idx] = embedding_vector
    
    return embedding_matrix                                

### Transform documents to sequences of integers

In [13]:
def sequence_to_integers(sequence, word_to_idx):
    integers = []
    for element in sequence:
        integers.append(word_to_idx[element])
    return np.array(integers)
"""OBS
Takes only one list of words to integer, not the list with lists.
"""

'OBS\nTakes only one list of words to integer, not the list with lists.\n'

In [14]:
def pad_idx_seqs(idx_seqs, max_seq_len):
    # Keras provides a convenient padding function; 
    padded_idxs = pad_sequences(sequences=idx_seqs, maxlen=max_seq_len)
    return padded_idxs

In [15]:
def find_max_length(seqs):
    max_len = 0
    for seq in seqs:
        if len(seq) > max_len:
            max_len = len(seq)
    return max_len

In [16]:
def create_model(layers_info, compile_info) -> keras.Model:
    """
    Create a Keras model given a list of layer information

    :param layers_info: a list of dictionaries, one for each layer
    :param compile_info: dictionary containing compile information

    :return
        model: the built keras sequential model
    """

    print('Found {} total layers'.format(len(layers_info)))

    model = keras.Sequential()
    for info_idx, info in enumerate(layers_info):

        layer = info['layer'](**{key: value for key, value in info.items() if key != 'layer'})
        if info["name"] == "Bidirectional_LSTM_layer":
            model.add(layers.Bidirectional(layers.LSTM(64)))
        else:
            model.add(layer)

    # Debug
    model.summary()

    # Compile
    model.compile(**compile_info)

    return model

### Create baseline model

##### two layers architecture: a Bidirectional LSTM layer and a Dense/Fully-Connected layer on top


https://www.kaggle.com/tanyadayanand/pos-tagging-using-rnn/notebook

Not sure about the parameters etc.. but at least it works to train the model

In [17]:
def create_baseline_model():
    bidirect_model = keras.Sequential()
    bidirect_model.add(layers.Embedding(input_dim = len(word_to_idx_v3),
                                 output_dim    = embedding_dimension,
                                 input_length  = find_max_length(x_train),
                                 weights       = [embedding_matrix_v3],
                                 mask_zero     = True
                                ))
    bidirect_model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
    #bidirect_model.add(layers.TimeDistributed(layers.Dense(len(tag_to_idx), activation='softmax')))
    bidirect_model.add(layers.Dense(len(tag_to_idx), activation='softmax'))
    
    return bidirect_model

# Use the functions

In [18]:
download_dataset()
split_dataset(train_path, val_path, test_path, original_path)

Successful extraction
Successful spilt


In [19]:
df_train = create_dataframe(train_path)
df_val = create_dataframe(val_path)
df_test = create_dataframe(test_path)

df_train

Unnamed: 0,document_id,text,POS
0,95,"[In, reference, to, your, Oct., 9, page-one, a...","[IN, NN, TO, PRP$, NNP, CD, NN, NN, ``, NNP, N..."
1,37,"[Judging, from, the, Americana, in, Haruki, Mu...","[VBG, IN, DT, NNS, IN, NNP, NNP, POS, ``, DT, ..."
2,66,"[Sir, Peter, Walters, ,, 58-year-old, chairman...","[NNP, NNP, NNP, ,, JJ, NN, IN, NNP, NNP, NNP, ..."
3,52,"[PAPERS, :, Backe, Group, Inc., agreed, to, ac...","[NNS, :, NNP, NNP, NNP, VBD, TO, VB, NNP, NNP,..."
4,3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
...,...,...,...
95,82,"[Criticism, in, the, U.S., over, recent, Japan...","[NNP, IN, DT, NNP, IN, JJ, JJ, NNS, VBZ, VBG, ..."
96,71,"[When, Warren, Winiarski, ,, proprietor, of, S...","[WRB, NNP, NNP, ,, NN, IN, NNP, POS, NNP, NNP,..."
97,20,"[The, U.S., ,, claiming, some, success, in, it...","[DT, NNP, ,, VBG, DT, NN, IN, PRP$, NN, NN, ,,..."
98,14,"[Norman, Ricken, ,, 52, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, CC, NN,..."


In [20]:
embedding_model = load_embedding_model(embedding_dimension)

In [21]:
word_list_train = get_word_list(df_train, "text")
word_list_val = get_word_list(df_val, "text")
word_list_test = get_word_list(df_test, "text")

"""---------------------OBS------------------------
Where shoukld we get tags? Is it ok to assume they are in df_train?
"""
tags_list = get_word_list(df_train, "POS")

In [22]:
oov1_terms = check_OOV_terms(embedding_model, word_list_train)
#--------------------------OBS--------------------------------------------#
# To make sure OOV2 does not contain words from OOV1 a check is 
# implemented during the embedding in the word to index function. 
# It might be an idea to include this check in the chekc_OOV_terms as well. 
# Question for the professor?
#--------------------------OBS--------------------------------------------#
oov2_terms = check_OOV_terms(embedding_model, word_list_val)
oov3_terms = check_OOV_terms(embedding_model, word_list_test)


print("Total OOV1 terms: {0} ({1:.2f}%)".format(len(oov1_terms), float(len(oov1_terms)) / len(word_list_train)))
print("Total OOV2 terms: {0} ({1:.2f}%)".format(len(oov2_terms), float(len(oov2_terms)) / len(word_list_train)))
print("Total OOV3 terms: {0} ({1:.2f}%)".format(len(oov3_terms), float(len(oov3_terms)) / len(word_list_train)))

Total OOV1 terms: 2346 (0.29%)
Total OOV2 terms: 1524 (0.19%)
Total OOV3 terms: 957 (0.12%)


In [23]:
"""----------------------OBS------------------------------------
Should we also compute word to index for the test set and embed
these words, and then use all of them for training and validation
or should we keep the test set seperate and only compute this for
testing?
----------------------OBS------------------------------------"""
word_to_idx_v1, idx_to_word_v1 = create_word_to_idx(set(embedding_model.index_to_key))
word_to_idx_v2, idx_to_word_v2 = update_word_to_idx(idx_to_word_v1, word_to_idx_v1, oov1_terms)
word_to_idx_v3, idx_to_word_v3 = update_word_to_idx(idx_to_word_v2, word_to_idx_v2, oov2_terms)

tag_to_idx, idx_to_tag = create_word_to_idx(tags_list)

In [24]:
embedding_matrix_v1 = create_embedding_matrix(embedding_model, embedding_dimension, word_to_idx_v1)
embedding_matrix_v2 = expand_embedding_matrix(embedding_model, embedding_dimension, word_to_idx_v2, embedding_matrix_v1)
embedding_matrix_v3 = expand_embedding_matrix(embedding_model, embedding_dimension, word_to_idx_v3, embedding_matrix_v2)

print(embedding_matrix_v1.shape)
print(embedding_matrix_v2.shape)
print(embedding_matrix_v3.shape)

(400001, 50)
(402347, 50)
(403291, 50)


In [25]:
# Make train text and tags into sequence of integers
train_text = df_train["text"]
train_tags = df_train["POS"]

#---------------OBS----------------------------------------------------------------
#  Should x_train be numpy array?
#  Should we use Word to index for V3 or V2???
#---------------OBS----------------------------------------------------------------
x_train = [sequence_to_integers(sequence, word_to_idx_v3) for sequence in train_text]
y_train = [sequence_to_integers(sequence, tag_to_idx) for sequence in train_tags]


val_text = df_val["text"]
val_tags = df_val["POS"]
x_val = [sequence_to_integers(sequence, word_to_idx_v3) for sequence in val_text]
y_val = [sequence_to_integers(sequence, tag_to_idx) for sequence in val_tags]

In [26]:
"""-------------------------OBS--------------------------
Needs to transform y to binary matrix with to_categorical (not so sure exactly why)
----------------------------------------------------------"""

x_train = pad_idx_seqs(x_train, find_max_length(x_train))
y_train = to_categorical(pad_idx_seqs(y_train, find_max_length(y_train)))

x_val = pad_idx_seqs(x_val, find_max_length(x_val))
y_val = to_categorical(pad_idx_seqs(y_val, find_max_length(y_val)))

In [27]:
baseline_model = create_baseline_model()
baseline_model.summary()

2021-11-19 11:21:45.253102: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2900, 50)          20164550  
                                                                 
 bidirectional (Bidirectiona  (None, 2900, 128)        58880     
 l)                                                              
                                                                 
 dense (Dense)               (None, 2900, 46)          5934      
                                                                 
Total params: 20,229,364
Trainable params: 20,229,364
Non-trainable params: 0
_________________________________________________________________


In [29]:
baseline_model.compile(loss=keras.losses.BinaryCrossentropy(),
              optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['acc'])
baseline_model.fit(x=x_train, y=y_train, batch_size=20, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x179c93340>