# Importing the packages

In [18]:
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import (
    LSTM, 
    Embedding,
    Dense,
    TimeDistributed,
    Dropout,
    Bidirectional,
    Activation
)
from keras.callbacks import ModelCheckpoint

# Setting up the data processing functions.

In [14]:
def load_data_and_labels_one(filename,encoding='utf-8'):
    """Load sentences and their POS tags from a four-column, CoNLL-style file.

    Every non-blank line must split (on whitespace) into exactly four fields:
    word, POS tag, and two further columns that this loader ignores. A blank
    line ends the current sentence. ``-DOCSTART-`` marker lines are skipped,
    and so is any token whose word or POS tag is exactly one of the special
    characters in the removal set below.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A pair ``(sent, label1)``: ``sent`` is a list of sentences (each a list
        of words) and ``label1`` is the parallel list of POS-tag lists.
        Sentences that end up empty after filtering are not returned.

    Raises:
        ValueError: If a non-blank line does not contain exactly four fields.
    """
    # The backslash is written as "\\" because "\|" is an invalid escape
    # sequence; the resulting set of characters is the same.
    removal = set("""(),.<>?$#@"!%&*:;'~`^=-_+\\|{}[]/""")

    sent, label1 = [], []
    words, pos = [], []
    with open(filename, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, pos_, _, _ = line.split()
                if word != "-DOCSTART-" and word not in removal and pos_ not in removal:
                    words.append(word)
                    pos.append(pos_)
            elif words:
                # Blank line: close the sentence collected so far.
                sent.append(words)
                label1.append(pos)
                words, pos = [], []

    # A file that does not end with a blank line would otherwise lose its
    # final sentence.
    if words:
        sent.append(words)
        label1.append(pos)

    return sent, label1

def load_data_and_labels_two(filename,encoding='utf-8'):
    """Load sentences with their third- and fourth-column tags from a CoNLL-style file.

    Every non-blank line must split (on whitespace) into exactly four fields:
    word, POS tag, parser (chunk) tag and NER tag. A blank line ends the
    current sentence. ``-DOCSTART-`` marker lines are skipped. Unlike
    ``load_data_and_labels_one``, words are not filtered against the
    special-character set; a token is dropped only when its parser tag or its
    NER tag is itself exactly one of those characters.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A triple ``(sent, label2, label3)``: the sentences (lists of words),
        the parallel parser-tag lists and the parallel NER-tag lists.

    Raises:
        ValueError: If a non-blank line does not contain exactly four fields.
    """
    # The backslash is written as "\\" because "\|" is an invalid escape
    # sequence; the resulting set of characters is the same.
    removal = set("""(),.<>?$#@"!%&*:;'~`^=-_+\\|{}[]/""")

    sent, label2, label3 = [], [], []
    words, parser, ner = [], [], []
    with open(filename, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, _, parser_, ner_ = line.split()
                if word != "-DOCSTART-" and parser_ not in removal and ner_ not in removal:
                    words.append(word)
                    parser.append(parser_)
                    ner.append(ner_)
            elif words:
                # Blank line: close the sentence collected so far.
                sent.append(words)
                label2.append(parser)
                label3.append(ner)
                words, parser, ner = [], [], []

    # A file that does not end with a blank line would otherwise lose its
    # final sentence.
    if words:
        sent.append(words)
        label2.append(parser)
        label3.append(ner)

    return sent, label2, label3

# Helper Functions

In [15]:
def build_uniques(arr_x, arr_y):
    """Collect the distinct items of two nested sequences.

    Items are returned in order of first occurrence, so the same input always
    yields the same lists. Ids derived from the list positions are therefore
    reproducible across interpreter runs, which ``list(set(...))`` does not
    guarantee for strings.

    Args:
        arr_x: Iterable of sequences (e.g. sentences as lists of words).
        arr_y: Iterable of sequences (e.g. tag lists).

    Returns:
        A pair of lists: the distinct items found in ``arr_x`` and the
        distinct items found in ``arr_y``.
    """
    # dict keys keep insertion order and drop duplicates in a single pass.
    unique_x = dict.fromkeys(item for seq in arr_x for item in seq)
    unique_y = dict.fromkeys(item for seq in arr_y for item in seq)
    return list(unique_x), list(unique_y)

# Word -> integer id lookup.
def word2idx(all_words):
    """Assign an integer id to every word in ``all_words``.

    Words are numbered from 2 in iteration order; id 1 is reserved for the
    ``"UNK"`` entry and id 0 for the ``"PAD"`` entry, both of which are
    written after the words.

    Returns:
        dict mapping each word (plus ``"UNK"`` and ``"PAD"``) to its id.
    """
    mapping = {}
    for word_id, word in enumerate(all_words, start=2):
        mapping[word] = word_id
    mapping["UNK"] = 1
    mapping["PAD"] = 0
    return mapping
# Tag -> integer id lookup.
def tag2idx(all_tags):
    """Assign an integer id to every tag in ``all_tags``.

    Tags are numbered from 1 in iteration order; id 0 is reserved for the
    ``"PAD"`` entry, which is written after the tags.

    Returns:
        dict mapping each tag (plus ``"PAD"``) to its id.
    """
    mapping = {}
    for tag_id, tag in enumerate(all_tags, start=1):
        mapping[tag] = tag_id
    mapping["PAD"] = 0
    return mapping
# Integer id -> word lookup.
def idx2word(word2idx):
    """Invert a word -> id mapping and return the resulting id -> word dict."""
    inverse = {}
    for word, word_id in word2idx.items():
        inverse[word_id] = word
    return inverse
# Integer id -> tag lookup.
def idx2tag(tag2idx):
    """Invert a tag -> id mapping and return the resulting id -> tag dict."""
    inverse = {}
    for tag, tag_id in tag2idx.items():
        inverse[tag_id] = tag
    return inverse


# Convert lists of words and tags into padded, model-ready arrays.
def parser_arrays(MAX_LEN,x_train, y_train, all_words, all_tags):
    """Encode sentences and tag sequences as padded arrays for the model.

    Args:
        MAX_LEN: Length every sequence is padded to.
        x_train: List of sentences, each a list of words.
        y_train: List of tag sequences parallel to ``x_train``.
        all_words: Vocabulary used to build the word -> id mapping.
        all_tags: Tag inventory used to build the tag -> id mapping.

    Returns:
        A pair ``(X_pad, y_onehot)``. ``X_pad`` holds the word ids, padded at
        the end with 0 (the "PAD" id). ``y_onehot`` holds, for every padded
        tag sequence, its one-hot encoding over ``len(all_tags) + 1`` classes.

    Raises:
        KeyError: If a tag in ``y_train`` is not part of ``all_tags``.
    """
    obj_word2idx = word2idx(all_words)
    obj_tag2idx = tag2idx(all_tags)

    # Words missing from the vocabulary map to the reserved "UNK" id instead
    # of raising KeyError, so unseen text can still be encoded.
    unk_id = obj_word2idx["UNK"]
    word_ids = [[obj_word2idx.get(word, unk_id) for word in sentence] for sentence in x_train]
    # Tags have no fallback entry; an unknown tag is an error.
    tag_ids = [[obj_tag2idx[tag] for tag in tags] for tags in y_train]

    # Third step: pad both sides to a common length with the "PAD" id.
    X_pad = pad_sequences(maxlen=MAX_LEN, sequences=word_ids, padding="post", value=0)
    y_pad = pad_sequences(maxlen=MAX_LEN, sequences=tag_ids, padding="post", value=0)

    num_classes = len(all_tags) + 1  # + 1 for the "PAD" class
    return X_pad, np.array([to_categorical(row, num_classes=num_classes) for row in y_pad])

# Inference helper: turn predicted and reference id sequences back into tag names.
def parser2categorical(pred, y_true, all_tags):
    """Decode two collections of tag-id rows into tag names.

    The id -> tag table is rebuilt from ``all_tags`` via ``tag2idx`` and
    ``idx2tag``, so ``all_tags`` must be the same list, in the same order,
    that was used when the ids were produced.

    Args:
        pred: Iterable of rows of predicted tag ids.
        y_true: Iterable of rows of reference tag ids.
        all_tags: Tag inventory.

    Returns:
        ``(pred_tag, y_true_tag)``: both inputs with every id replaced by its
        tag name.
    """
    id_to_tag = idx2tag(tag2idx(all_tags))

    def decode(rows):
        # Replace each id in each row with its tag name.
        return [[id_to_tag[tag_id] for tag_id in row] for row in rows]

    pred_tag = decode(pred)
    y_true_tag = decode(y_true)
    return pred_tag, y_true_tag

# Model Architecture

In [19]:
def simple_model(NUM_WORDS,MAX_LEN,NUM_TAGS, embedding_dim=None, lstm_units=None,
                 dropout=0.3, recurrent_dropout=0.1):
    """Build and compile the bidirectional-LSTM sequence tagger.

    Layers, in order: Embedding -> Dropout -> Bidirectional(LSTM, returning
    the full sequence) -> TimeDistributed(Dense) -> softmax. The model is
    compiled with the Adam optimizer, categorical cross-entropy loss and the
    accuracy metric.

    Args:
        NUM_WORDS: Size of the word-id space (``input_dim`` of the embedding).
        MAX_LEN: Length of the padded input sequences.
        NUM_TAGS: Number of output classes per time step.
        embedding_dim: Embedding vector size; defaults to ``MAX_LEN``.
        lstm_units: Units per LSTM direction; defaults to ``MAX_LEN``.
        dropout: Rate of the Dropout layer after the embedding.
        recurrent_dropout: Recurrent dropout rate of the LSTM.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # The defaults tie both sizes to the sequence length; the keyword
    # arguments allow them to be chosen independently of it.
    if embedding_dim is None:
        embedding_dim = MAX_LEN
    if lstm_units is None:
        lstm_units = MAX_LEN

    model = Sequential()
    # NOTE(review): the embedding is created without mask_zero, so padded
    # positions appear to count towards the loss and the accuracy - confirm
    # this is intended.
    model.add(Embedding(input_dim=NUM_WORDS, output_dim=embedding_dim, input_length=MAX_LEN))
    model.add(Dropout(dropout))
    model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True,
                                 recurrent_dropout=recurrent_dropout)))
    model.add(TimeDistributed(Dense(units=NUM_TAGS)))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# POS - Part of Speech - multi-class classification

## Loading the data and converting it into a model-consumable format.

In [16]:
# Read the three splits as (sentences, POS-tag sequences) pairs.
X_train, y_train = load_data_and_labels_one(filename="conll-2003/eng.train")
X_valid, y_valid = load_data_and_labels_one(filename="conll-2003/eng.testa")
X_teste, y_teste = load_data_and_labels_one(filename="conll-2003/eng.testb")

In [17]:
# NOTE: this cell rebinds X_train / X_valid / X_teste (and the y_* names) from
# Python lists to padded arrays, so re-run the loading cell before re-running it.

# The vocabulary covers all three splits; the tag inventory comes from the
# training labels only.
all_words, all_tags = build_uniques(X_train + X_valid + X_teste, y_train)

MAX_LEN = max(map(len, X_train + X_valid + X_teste))  # longest sentence in any split
NUM_WORDS = 2 + len(all_words)  # two extra ids: "UNK" and "PAD"
NUM_TAGS = 1 + len(all_tags)    # one extra id: "PAD"

X_train, y_train = parser_arrays(MAX_LEN, X_train, y_train, all_words, all_tags)
X_valid, y_valid = parser_arrays(MAX_LEN, X_valid, y_valid, all_words, all_tags)
X_teste, y_teste = parser_arrays(MAX_LEN, X_teste, y_teste, all_words, all_tags)

# Save the encoded test split and the objects needed to rebuild the mappings.
for pkl_path, pkl_obj in (
    ('encoding/X_teste_POS.pkl', X_teste),
    ('encoding/y_teste_POS.pkl', y_teste),
    ('encoding/all_words_POS.pkl', all_words),
    ('encoding/all_tags_POS.pkl', all_tags),
    ('encoding/MAX_LEN_POS.pkl', MAX_LEN),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(pkl_obj, f)

# Training the model and saving the weights for future use.

In [20]:
# Train the POS tagger and keep the weights with the lowest validation loss.
model = simple_model(NUM_WORDS, MAX_LEN, NUM_TAGS)
# POS-specific checkpoint path. The chunk-tag training cell writes to
# 'checkpoints/model_PAR.h5', so sharing that name here would let the later
# run overwrite the POS weights.
filename = 'checkpoints/model_POS.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data = [X_valid, y_valid],
    callbacks=[checkpoint],
    verbose=1
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 14011 samples, validate on 3242 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.39761, saving model to checkpoints/model_PAR.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.39761 to 0.24636, saving model to checkpoints/model_PAR.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.24636 to 0.12590, saving model to checkpoints/model_PAR.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.12590 to 0.07881, saving model to checkpoints/model_PAR.h5
Epoch 5/5

Epoch 00005: val_loss improved from 0.07881 to 0.06453, saving model to checkpoints/model_PAR.h5


<keras.callbacks.callbacks.History at 0x27e2f8255c0>

# Syntactic Chunk Tags - multi-class classification

## Loading the data, converting it into a model-consumable format, and training the model.

In [21]:
# Reload the splits with the second loader: its second return value holds the
# parser (chunk) tags and its third the NER tags. Only the chunk tags are
# used in this cell.
X_train1, y_train1, y_train2 = load_data_and_labels_two(filename="conll-2003/eng.train")
X_valid1, y_valid1, y_valid2 = load_data_and_labels_two(filename="conll-2003/eng.testa")
X_teste1, y_teste1, y_teste2 = load_data_and_labels_two(filename="conll-2003/eng.testb")

# The vocabulary covers all three splits; the tag inventory comes from the
# training chunk tags only.
all_words, all_tags = build_uniques(X_train1 + X_valid1 + X_teste1, y_train1)

MAX_LEN = max(map(len, X_train1 + X_valid1 + X_teste1))  # longest sentence in any split
NUM_WORDS = 2 + len(all_words)  # two extra ids: "UNK" and "PAD"
NUM_TAGS = 1 + len(all_tags)    # one extra id: "PAD"

X_train, y_train = parser_arrays(MAX_LEN, X_train1, y_train1, all_words, all_tags)
X_valid, y_valid = parser_arrays(MAX_LEN, X_valid1, y_valid1, all_words, all_tags)
X_teste, y_teste = parser_arrays(MAX_LEN, X_teste1, y_teste1, all_words, all_tags)

# Save the encoded test split and the objects needed to rebuild the mappings.
for pkl_path, pkl_obj in (
    ('encoding/X_teste_PAR.pkl', X_teste),
    ('encoding/y_teste_PAR.pkl', y_teste),
    ('encoding/all_words_PAR.pkl', all_words),
    ('encoding/all_tags_PAR.pkl', all_tags),
    ('encoding/MAX_LEN_PAR.pkl', MAX_LEN),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(pkl_obj, f)

# Train the chunk tagger and keep the weights with the lowest validation loss.
model = simple_model(NUM_WORDS, MAX_LEN, NUM_TAGS)
filename = 'checkpoints/model_PAR.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', mode='min', save_best_only=True, verbose=1)
model.fit(
    X_train,
    y_train,
    validation_data=[X_valid, y_valid],
    epochs=5,
    batch_size=64,
    callbacks=[checkpoint],
    verbose=1,
)

Train on 14041 samples, validate on 3250 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.14337, saving model to checkpoints/model_PAR.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.14337 to 0.04956, saving model to checkpoints/model_PAR.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.04956 to 0.03687, saving model to checkpoints/model_PAR.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.03687 to 0.03119, saving model to checkpoints/model_PAR.h5
Epoch 5/5

Epoch 00005: val_loss improved from 0.03119 to 0.02857, saving model to checkpoints/model_PAR.h5


<keras.callbacks.callbacks.History at 0x27e31974550>

# NER - multi-class classification

## Loading the data, converting it into a model-consumable format, and training the model.

In [22]:
# Reload the splits with the second loader: its second return value holds the
# parser (chunk) tags and its third the NER tags. Only the NER tags are used
# in this cell.
X_train1, y_train1, y_train2 = load_data_and_labels_two(filename="conll-2003/eng.train")
X_valid1, y_valid1, y_valid2 = load_data_and_labels_two(filename="conll-2003/eng.testa")
X_teste1, y_teste1, y_teste2 = load_data_and_labels_two(filename="conll-2003/eng.testb")

# The vocabulary covers all three splits; the tag inventory comes from the
# training NER tags only.
all_words, all_tags = build_uniques(X_train1 + X_valid1 + X_teste1, y_train2)

MAX_LEN = max(map(len, X_train1 + X_valid1 + X_teste1))  # longest sentence in any split
NUM_WORDS = 2 + len(all_words)  # two extra ids: "UNK" and "PAD"
NUM_TAGS = 1 + len(all_tags)    # one extra id: "PAD"

X_train, y_train = parser_arrays(MAX_LEN, X_train1, y_train2, all_words, all_tags)
X_valid, y_valid = parser_arrays(MAX_LEN, X_valid1, y_valid2, all_words, all_tags)
X_teste, y_teste = parser_arrays(MAX_LEN, X_teste1, y_teste2, all_words, all_tags)

# Save the encoded test split and the objects needed to rebuild the mappings.
for pkl_path, pkl_obj in (
    ('encoding/X_teste_NER.pkl', X_teste),
    ('encoding/y_teste_NER.pkl', y_teste),
    ('encoding/all_words_NER.pkl', all_words),
    ('encoding/all_tags_NER.pkl', all_tags),
    ('encoding/MAX_LEN_NER.pkl', MAX_LEN),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(pkl_obj, f)

# Train the NER tagger and keep the weights with the lowest validation loss.
model = simple_model(NUM_WORDS, MAX_LEN, NUM_TAGS)
filename = 'checkpoints/model_NER.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', mode='min', save_best_only=True, verbose=1)
model.fit(
    X_train,
    y_train,
    validation_data=[X_valid, y_valid],
    epochs=5,
    batch_size=64,
    callbacks=[checkpoint],
    verbose=1,
)

Train on 14041 samples, validate on 3250 samples
Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.07415, saving model to checkpoints/model_NER.h5
Epoch 2/5

Epoch 00002: val_loss improved from 0.07415 to 0.03645, saving model to checkpoints/model_NER.h5
Epoch 3/5

Epoch 00003: val_loss improved from 0.03645 to 0.02381, saving model to checkpoints/model_NER.h5
Epoch 4/5

Epoch 00004: val_loss improved from 0.02381 to 0.01984, saving model to checkpoints/model_NER.h5
Epoch 5/5

Epoch 00005: val_loss improved from 0.01984 to 0.01845, saving model to checkpoints/model_NER.h5


<keras.callbacks.callbacks.History at 0x27e2a9024a8>