!pip install sklearn

# Part-of-speech tagging

### Import libraries

In [2]:
import os
import pandas as pd
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pickle
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from collections import OrderedDict
from keras.preprocessing.text import Tokenizer




### Define constant variables

In [3]:
train_path = './Dataset/train/'# Should we use: os.path.join(os.getcwd(), "OriginalDataset")
val_path = './Dataset/val/'
test_path = './Dataset/test/'
dataset_folder = os.path.join(os.getcwd(), "OriginalDataset")
original_path = dataset_folder +'/dependency_treebank/'

embedding_dimension = 50


### Download dataste

In [4]:
import urllib.request  #  download files
import zipfile  #  unzip files

def download_dataset():
    dataset_folder = os.path.join(os.getcwd(), "OriginalDataset")

    if not os.path.exists(dataset_folder):
        os.makedirs(dataset_folder)

    url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

    dataset_path = os.path.join(dataset_folder, "dependency_treebank.zip")

    if not os.path.exists(dataset_path):
        urllib.request.urlretrieve(url, dataset_path)
        print("Successful download")

    with zipfile.ZipFile(dataset_path,"r") as zip_ref:
        zip_ref.extractall(dataset_folder)
    print("Successful extraction")

In [5]:
download_dataset()

Successful extraction


### Split dataset

In [6]:
def split_dataset(train_path, val_path, test_path, original_path):
    if not os.path.exists(train_path):
        print("making directory")
        os.makedirs(train_path)
    if not os.path.exists(val_path):
        os.makedirs(val_path)
    if not os.path.exists(test_path):
        os.makedirs(test_path)

    original_dataset = os.listdir(original_path)
    
    original_dataset.sort()
    org_train = original_dataset[0:100]
    org_val = original_dataset[100:150]
    org_test = original_dataset[150:]

    for f in org_train:
        os.rename(original_path+f, train_path+f)
    for f in org_val:
        os.rename(original_path+f, val_path+f)
    for f in org_test:
        os.rename(original_path+f, test_path+f)
    print("Successful spilt")
        

In [7]:
split_dataset(train_path, val_path, test_path, original_path)


Successful spilt


### Preprocessing

In [8]:
def lower(lst): # DO WE NEED THIS?
    lowercase_list = [x.lower() for x in lst]
    return lowercase_list

### Create dataframe

In [9]:
def create_dataframe(dataset_path: str):
    dataframe_rows = []
    documents = os.listdir(dataset_path)
    for document in documents:
        path = os.path.join(dataset_path, document)
        with open(path, 'r') as f:
            a = [[x for x in ln.split()] for ln in f]
        a2 = [x for x in a if x != []] # OBS! Removing all empty lines in file so we can make an array
        arr = np.array(a2)
        text = list(arr[:, 0])
        tagg = list(arr[:, 1])
        #POStuple =  [tagg[x] for x in range(len(text))] #[(text[x], tagg[x]) for x in range(len(text))]
        document_id = int(document[4:8])
        dataframe_row = {
            "document_id": document_id,
            "text": text,
            "POS": tagg
        }
        dataframe_rows.append(dataframe_row)
    # transform the list of rows in a proper dataframe
    df = pd.DataFrame(dataframe_rows)
    df = df[["document_id", "text", "POS"]]

    return df

In [10]:
df_train = create_dataframe(train_path)
df_val = create_dataframe(val_path)
df_test = create_dataframe(test_path)

df_train

Unnamed: 0,document_id,text,POS
0,95,"[In, reference, to, your, Oct., 9, page-one, a...","[IN, NN, TO, PRP$, NNP, CD, NN, NN, ``, NNP, N..."
1,37,"[Judging, from, the, Americana, in, Haruki, Mu...","[VBG, IN, DT, NNS, IN, NNP, NNP, POS, ``, DT, ..."
2,66,"[Sir, Peter, Walters, ,, 58-year-old, chairman...","[NNP, NNP, NNP, ,, JJ, NN, IN, NNP, NNP, NNP, ..."
3,52,"[PAPERS, :, Backe, Group, Inc., agreed, to, ac...","[NNS, :, NNP, NNP, NNP, VBD, TO, VB, NNP, NNP,..."
4,3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
...,...,...,...
95,82,"[Criticism, in, the, U.S., over, recent, Japan...","[NNP, IN, DT, NNP, IN, JJ, JJ, NNS, VBZ, VBG, ..."
96,71,"[When, Warren, Winiarski, ,, proprietor, of, S...","[WRB, NNP, NNP, ,, NN, IN, NNP, POS, NNP, NNP,..."
97,20,"[The, U.S., ,, claiming, some, success, in, it...","[DT, NNP, ,, VBG, DT, NN, IN, PRP$, NN, NN, ,,..."
98,14,"[Norman, Ricken, ,, 52, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, CC, NN,..."


### Make tokenizer

In [11]:
word_tokenizer_train = Tokenizer()

# UNCOMMENT IF WE WANT TO INCLUDE GOLVE vocabualry
# word_tokenizer_train.fit_on_texts(embedding_model.index_to_key) 
word_tokenizer_train.fit_on_texts(df_train["text"])

x_encoded_train = word_tokenizer_train.texts_to_sequences(df_train["text"])

In [12]:
tag_tokenizer_train = Tokenizer()

tag_tokenizer_train.fit_on_texts(df_train["POS"])
y_encoded_train = tag_tokenizer_train.texts_to_sequences(df_train["POS"])

In [13]:
def pad_idx_seqs(idx_seqs, max_seq_len):
    # Keras provides a convenient padding function; 
    padded_idxs = pad_sequences(sequences=idx_seqs, maxlen=max_seq_len)
    return padded_idxs

In [14]:
def find_max_length(seqs):
    max_len = 0
    for seq in seqs:
        if len(seq) > max_len:
            max_len = len(seq)
    return max_len

In [15]:
x_padded_train = pad_idx_seqs(x_encoded_train, find_max_length(x_encoded_train))
y_padded_train = pad_idx_seqs(y_encoded_train, find_max_length(y_encoded_train))

### Embed the words using GloVe embedding

embeddings_dict = {}

#### Load glove model

In [16]:
import gensim
import gensim.downloader as gloader

def load_embedding_model(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    # Check download
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Word2Vec: 300")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model

In [17]:
embedding_model = load_embedding_model(embedding_dimension)

# NEW METHODE TO GET WORD TO INDX

In [18]:
VOCABULARY_SIZE = len(word_tokenizer_train.word_index) + 1

word2idx_train = word_tokenizer_train.word_index
idx2word_train = word_tokenizer_train.index_word
print(len(word2idx_train))


7404


In [19]:
tag2idx_train = tag_tokenizer_train.word_index
idx2tag_train = tag_tokenizer_train.index_word
len(tag2idx_train)

45

In [20]:
TAG_VOCABULARY_SIZE = len(tag2idx_train) + 1

### Create embedding matrix

In [21]:
# OBS Computes the OOV with random embeddings
def create_embedding_matrix(embedding_model, embedding_dimension, word_to_idx):
    embedding_matrix = np.zeros((len(word_to_idx)+1, embedding_dimension), dtype=np.float32)
    for word, idx in word_to_idx.items():
        try:
            embedding_vector = embedding_model[word]
        except (KeyError, TypeError):
            embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)

        embedding_matrix[idx] = embedding_vector
                                
    return embedding_matrix

def expand_embedding_matrix(embedding_model, embedding_dimension, word_to_idx, old_embedding_matrix):
    embedding_matrix = np.zeros((len(word_to_idx), embedding_dimension), dtype=np.float32)
    embedding_matrix[0:len(old_embedding_matrix)] = old_embedding_matrix
    
    for word, idx in word_to_idx.items():
        if idx >= len(old_embedding_matrix):
            try:
                embedding_vector = embedding_model[word]
            except (KeyError, TypeError):
                embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)
                
            embedding_matrix[idx] = embedding_vector
    
    return embedding_matrix                                

In [22]:
embedding_matrix = create_embedding_matrix(embedding_model, embedding_dimension, word2idx_train)

#### Define x train and y train

In [23]:
x_train = x_padded_train
y_train = to_categorical(y_padded_train)

### Create baseline model

##### two layers architecture: a Bidirectional LSTM layer and a Dense/Fully-Connected layer on top


https://www.kaggle.com/tanyadayanand/pos-tagging-using-rnn/notebook

Not sure about the parameters etc.. but at least it works to train the model

In [24]:
def create_baseline_model():
    bidirect_model = keras.Sequential()
    bidirect_model.add(layers.Embedding(input_dim = VOCABULARY_SIZE,
                                 output_dim    = embedding_dimension,
                                 input_length  = find_max_length(x_train),
                                 weights       = [embedding_matrix],
                                 mask_zero     = True,
                                 trainable = False
                                ))
    bidirect_model.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))
    #bidirect_model.add(layers.TimeDistributed(layers.Dense(len(tag_to_idx), activation='softmax')))
    bidirect_model.add(layers.Dense(TAG_VOCABULARY_SIZE, activation='softmax'))
    
    return bidirect_model

In [25]:
baseline_model = create_baseline_model()
baseline_model.summary()

2021-11-19 14:26:00.385710: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2900, 50)          370250    
                                                                 
 bidirectional (Bidirectiona  (None, 2900, 128)        58880     
 l)                                                              
                                                                 
 dense (Dense)               (None, 2900, 46)          5934      
                                                                 
Total params: 435,064
Trainable params: 64,814
Non-trainable params: 370,250
_________________________________________________________________


In [26]:
baseline_model.compile(loss="mse",
              optimizer="Adam",
              metrics=['mae', "acc"])
baseline_model.fit(x=x_train, batch_size=25, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16a49fc10>