In [57]:
# Import the required libraries.
import re
import math
import random
import collections
import operator
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict

random.seed(11)
np.random.seed(11)

In [33]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Conv1D, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences

In [3]:
def parse_sentence(sentence):
    '''
    Function for parsing the words and tags from the
    sentences of the input corpus.
    '''
    word_tag_pairs = sentence.split(" ")
    words = []
    tags = []

    for i, word_tag in enumerate(word_tag_pairs):
        word, tag = word_tag.strip().rsplit('/', 1)
        words.append(word)
        tags.append(tag)
        
    return words, tags

In [4]:
# Parse the sentences into a list.
parsed_sentences = []

with open('./Brown_train.txt', 'r') as file:
    sentences = file.readlines()

    for sentence in sentences:
        sentence = sentence.strip()
        parsed_sentences.append(parse_sentence(sentence))

In [5]:
def get_vocab(X_train, Y_train):
    '''
    Function for building the vocabulary from the training set of
    words and tags.
    '''
    vocabulary2id = dict()    
    tag2id = dict()
    vocabulary2id['UNK'] = 0
    vocabulary2id['PAD'] = 1

    for sent in X_train:
        for word in sent:
            if word not in vocabulary2id.keys():
                vocabulary2id[word] = len(vocabulary2id)
    
    tag2id['PAD'] = 0
    for sent in Y_train:
        for tag in sent:
            if tag not in tag2id.keys():
                tag2id[tag] = len(tag2id)
    
    return vocabulary2id, tag2id

def get_word_tag_counts(X_train, Y_train, vocabulary2id, tag2id):
    '''
    Function for calculating the counts pertaining to the
    individual word tags.
    '''
    wordcount = defaultdict(int)
    tagcount = defaultdict(int)
    tagpaircount = defaultdict(int)
    tagtriplecount = defaultdict(int)
    
    for sent in X_train:
        for word in sent:
            wordcount[word] += 1
    
    for sent in Y_train:
        for tag in sent:
            tagcount[tag] += 1
    
    for sent in Y_train:
        for i in range(len(sent) - 1):
            tagpaircount[sent[i], sent[i + 1]] += 1

    for sent in Y_train:
        for i in range(len(sent) - 2):
            tagtriplecount[sent[i], sent[i + 1], sent[i + 2]] += 1
    
    return wordcount, tagcount, tagpaircount, tagtriplecount

In [6]:
parsed_sentences[:5]

[(['At',
   'that',
   'time',
   'highway',
   'engineers',
   'traveled',
   'rough',
   'and',
   'dirty',
   'roads',
   'to',
   'accomplish',
   'their',
   'duties',
   '.'],
  ['ADP',
   'DET',
   'NOUN',
   'NOUN',
   'NOUN',
   'VERB',
   'ADJ',
   'CONJ',
   'ADJ',
   'NOUN',
   'PRT',
   'VERB',
   'DET',
   'NOUN',
   '.']),
 (['Using',
   'privately-owned',
   'vehicles',
   'was',
   'a',
   'personal',
   'hardship',
   'for',
   'such',
   'employees',
   ',',
   'and',
   'the',
   'matter',
   'of',
   'providing',
   'state',
   'transportation',
   'was',
   'felt',
   'perfectly',
   'justifiable',
   '.'],
  ['VERB',
   'ADJ',
   'NOUN',
   'VERB',
   'DET',
   'ADJ',
   'NOUN',
   'ADP',
   'ADJ',
   'NOUN',
   '.',
   'CONJ',
   'DET',
   'NOUN',
   'ADP',
   'VERB',
   'NOUN',
   'NOUN',
   'VERB',
   'VERB',
   'ADV',
   'ADJ',
   '.']),
 (['Once',
   'the',
   'principle',
   'was',
   'established',
   ',',
   'the',
   'increase',
   'in',
   'state-owned'

In [7]:
# Build the test and training sets of sentences.
kf = KFold(n_splits = 3, shuffle = False)
parsed_sentences = np.asarray(parsed_sentences)
scores = []
scores1 = []
y_pred_idx = []
y_pred_idx1 = []
y_test_idx = []
y_test_idx1 = []

preds = []

for train_index, test_index in kf.split(parsed_sentences):
    train_data = parsed_sentences[train_index]
    test_data = parsed_sentences[test_index]
    X_train = [a[0] for a in train_data]
    Y_train = [a[1] for a in train_data]
    X_test = [a[0] for a in test_data]
    Y_test = [a[1] for a in test_data]
    
    # Build the vocabulary and word counts.
    vocabulary2id, tag2id = get_vocab(X_train, Y_train)
    
    break


In [8]:
padlen = max(len(i) for i in X_train)
def pad(sentence, padid=vocabulary2id['PAD']):
    out = sentence[:padlen]
    padding = [padid for _ in range(padlen - len(out))]
    return out + padding

In [9]:
X_train_ids = np.asarray([pad([vocabulary2id[word] if word in vocabulary2id.keys() else vocabulary2id['UNK'] for word in sent]) for sent in X_train])
X_test_ids = np.array([pad([vocabulary2id[word] if word in vocabulary2id.keys() else vocabulary2id['UNK'] for word in sent]) for sent in X_test])

In [10]:
Y_train_ids = np.asarray([pad([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in sent], tag2id['PAD']) for sent in Y_train])
Y_test_ids = np.asarray([pad([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in sent], tag2id['PAD']) for sent in Y_test])

In [11]:
def id2onehot(Y, numtags):
    out = []
    for s in Y:
        categories = []
        for item in s:
            categories.append(np.zeros(numtags))
            categories[-1][item] = 1.0
        out.append(categories)
    return np.array(out)
 

In [12]:
Y_train_onehot = id2onehot(Y_train_ids, len(tag2id))
Y_test_onehot = id2onehot(Y_test_ids, len(tag2id))

In [13]:
model = Sequential()
model.add(InputLayer(input_shape=(padlen, )))
model.add(Embedding(len(vocabulary2id), 100))
model.add(Bidirectional(SimpleRNN(int((128+256)/2), return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2id))))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 172, 100)          2211500   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 172, 384)          112512    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 172, 13)           5005      
_________________________________________________________________
activation_1 (Activation)    (None, 172, 13)           0         
Total params: 2,329,017
Trainable params: 2,329,017
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(X_train_ids, Y_train_onehot, batch_size=128, epochs=5, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 14661 samples, validate on 3666 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f4c286dd6d8>

In [15]:
predictions = model.predict(X_test_ids)

In [16]:
test_accuracy = np.sum((Y_test_ids == np.argmax(predictions, axis=-1)) * (Y_test_ids != 0)) / np.sum((Y_test_ids != 0))
print(test_accuracy)

0.8796389249699855


In [17]:
predictions_argmax = np.argmax(predictions, axis=-1)

In [18]:
y_pred_nopad = []
y_true_nopad = []

for i in range(len(Y_test_ids)):
    for j in range(len(Y_test_ids[i])):
        if Y_test_ids[i][j] != 0 and predictions_argmax[i][j] != 0:
            y_true_nopad.append(Y_test_ids[i][j])
            if predictions_argmax[i][j] == 0:
                y_pred_nopad.append(1)
            else:
                y_pred_nopad.append(predictions_argmax[i][j])


In [19]:
y_pred_nopad = np.asarray(y_pred_nopad)
y_true_nopad = np.asarray(y_true_nopad)

In [20]:
(y_pred_nopad == y_true_nopad).mean()

0.8796929935992526

In [21]:
prec, rec, fscore, _ = precision_recall_fscore_support(y_true_nopad, y_pred_nopad, average = 'weighted')

  'precision', 'predicted', average, warn_for)


In [22]:
prec, rec, fscore

(0.8813291814812507, 0.8796929935992526, 0.8715809429532625)

In [47]:
model = Sequential()
model.add(InputLayer(input_shape=(padlen, )))
model.add(Embedding(len(vocabulary2id), 100))
model.add(Bidirectional(SimpleRNN(int((128+256)/2), return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2id))))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.003),
              metrics=['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 172, 100)          2211500   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 172, 384)          112512    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 172, 13)           5005      
_________________________________________________________________
activation_5 (Activation)    (None, 172, 13)           0         
Total params: 2,329,017
Trainable params: 2,329,017
Non-trainable params: 0
_________________________________________________________________


In [48]:
model.fit(X_train_ids, Y_train_onehot, batch_size=128, epochs=4, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 14661 samples, validate on 3666 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x7f4b147c9208>

In [49]:
predictions = model.predict(X_test_ids)

In [50]:
test_accuracy = np.sum((Y_test_ids == np.argmax(predictions, axis=-1)) * (Y_test_ids != 0)) / np.sum((Y_test_ids != 0))
print(test_accuracy)

0.9051051223319907


In [51]:
predictions_argmax = np.argmax(predictions, axis=-1)

In [52]:
y_pred_nopad = []
y_true_nopad = []

for i in range(len(Y_test_ids)):
    for j in range(len(Y_test_ids[i])):
        if Y_test_ids[i][j] != 0 and predictions_argmax[i][j] != 0:
            y_true_nopad.append(Y_test_ids[i][j])
            if predictions_argmax[i][j] == 0:
                y_pred_nopad.append(1)
            else:
                y_pred_nopad.append(predictions_argmax[i][j])


In [53]:
y_pred_nopad = np.asarray(y_pred_nopad)
y_true_nopad = np.asarray(y_true_nopad)

In [54]:
(y_pred_nopad == y_true_nopad).mean()

0.9053796495542576

In [55]:
prec, rec, fscore, _ = precision_recall_fscore_support(y_true_nopad, y_pred_nopad, average = 'weighted')

  'precision', 'predicted', average, warn_for)


In [56]:
prec, rec, fscore

(0.906992363947336, 0.9053796495542576, 0.9024334357862384)

In [62]:
print(classification_report(y_true_nopad, y_pred_nopad))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99      8039
           2       0.87      0.82      0.84     10635
           3       1.00      1.00      1.00     26236
           4       0.87      0.90      0.88      4667
           5       0.87      0.90      0.89     35919
           6       0.85      0.91      0.88     63554
           7       0.96      0.96      0.96     35048
           8       0.98      0.99      0.99     29546
           9       0.88      0.69      0.77     20469
          10       0.70      0.96      0.81      4858
          11       0.96      0.43      0.60      4712
          12       0.00      0.00      0.00       292

    accuracy                           0.91    243975
   macro avg       0.83      0.80      0.80    243975
weighted avg       0.91      0.91      0.90    243975



In [63]:
tag2id

{'PAD': 0,
 'CONJ': 1,
 'ADV': 2,
 '.': 3,
 'PRT': 4,
 'VERB': 5,
 'NOUN': 6,
 'ADP': 7,
 'DET': 8,
 'ADJ': 9,
 'PRON': 10,
 'NUM': 11,
 'X': 12}