In [36]:
import pyconll, keras, pickle, os, random, nltk, datetime, warnings, gc, urllib.request, zipfile
import numpy as np
import math
from scipy.sparse import hstack, vstack
import matplotlib.pyplot as plt
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import FastText
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV, learning_curve, cross_val_score
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, BatchNormalization, Dropout, Input, Activation
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.utils import np_utils
from numpy.random import seed
from sklearn.model_selection import train_test_split

In [2]:
ROOT_DIR = os.path.dirname("/home/chitrang/Documents/CSE-582/") # setting the root dir
POS_DIR = os.path.join(ROOT_DIR, 'dataset')
pos_train = os.path.join(POS_DIR, "train.txt")

In [3]:
def format_data(fname, include_y=True):
    sentences = [] # master list
    with open(fname) as f:
        content = f.readlines()
    
    sentence = [] # local list
    for line in content:
        if line !='\n':
            line = line.strip() # remove leading/trailing spaces
            word = line.split()[0].lower() # get the word
            if include_y:
                pos = ""
                pos = line.split()[1] # get the pos tag
                sentence.append((word, pos)) # create a pair and save to local list
            else:
                sentence.append(word)
        else:
            sentences.append(sentence) # once a \n is detected, append the local sentence to master sentence
            sentence = []
    return sentences

In [4]:
train_sentences = format_data(pos_train)

In [5]:
print("Tagged sentences in train set: ", len(train_sentences))
print("Tagged words in train set:", len([item for sublist in train_sentences for item in sublist]))

Tagged sentences in train set:  8936
Tagged words in train set: 211727


In [6]:
# # Dataset Split
# total_train = len(train_sentences)
# print(total_train)
# val_split = math.floor(0.2 * total_train)
# print(val_split)
# train_sentences, val_sentences = train_sentences[:(total_train-val_split)], train_sentences[(total_train-val_split):]
# print(len(train_sentences), len(val_sentences))
# print(len(train_sentences)+len(val_sentences))

In [7]:
# print(train_sentences[0])

In [8]:
def tag_sequence(sentences):
    return [[t for w, t in sentence] for sentence in sentences]

def text_sequence(sentences):
    return [[w for w, t in sentence] for sentence in sentences]

def id2word(sentences):
    wordlist = [item for sublist in text_sequence(sentences) for item in sublist]
    id2word = {k:v for k,v in enumerate(wordlist)}
    return id2word

def untag(tagged_sentence):
    return [w for w, _ in tagged_sentence]

def untag_pos(tagged_sentence):
    return [t for _, t in tagged_sentence]

def build_vocab(sentences):
    vocab =set()
    for sentence in sentences:
        for word in untag(sentence):
            vocab.add(word)
    return sorted(list(vocab))

In [11]:
embs_path = '/home/chitrang/Downloads/wiki-news-300d-1M.vec'
embeddings = KeyedVectors.load_word2vec_format(embs_path, binary=False)

In [19]:
w2c = dict()
for item in embeddings.key_to_index:
    w2c[item] = embeddings.key_to_index[item]

In [20]:
dim = embeddings.vectors.shape[1]
pad = np.zeros(dim)
np.random.seed(3)
oov = np.random.uniform(-0.25, 0.25, dim)

In [21]:
def features_embs(sentence, index, window=1):
    unknown=0
    vec = np.array([])
    for i in range(index-window, index+window+1):
#         if i < 0:
#             vec = np.append(vec, pad)
#         if i > len(sentence)-1:
#             vec = np.append(vec, pad)
        try:
            vec = np.append(vec, embeddings[sentence[i]])
        except:
            vec = np.append(vec, oov)
            unknown += 1
    return vec, unknown

In [22]:
def features_basic(sentence, index):
    return {
        'nb_terms': len(sentence),        
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'i-1_prefix-3': '' if index == 0 else sentence[index-1][:3],        
        'i-1_suffix-3': '' if index == 0 else sentence[index-1][-3:],        
        'i+1_prefix-3': '' if index == len(sentence) - 1 else sentence[index+1][:3],        
        'i+1_suffix-3': '' if index == len(sentence) - 1 else sentence[index+1][-3:],        
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
    }

In [23]:
def transform_to_dataset(tagged_sentences, window):
    i=0
    X, y = [], []
    for doc_index, tagged in enumerate(tagged_sentences):
        for index in range(len(tagged)):
            X.append([features_basic(untag(tagged), index),\
                      features_embs(untag(tagged), index, window)[0],\
                     ])
            y.append(tagged[index][1])
            k = features_embs(untag(tagged), index, window)[1]
            i += k
    return X, y, i

In [27]:
def transform_test_sentence(sentence, window):
    X = []
    for index in range(len(sentence)):
            X.append([
                      features_basic(sentence, index),\
                      features_embs(sentence, index, window),\
                     ])
    return X

In [32]:
def vectorize(train, window=1):
    print('Vectorizing Dataset...')
    X_train, y_train, unk_tr = transform_to_dataset(train, window=window)
    X_train = [x[1] for x in X_train]
    X_train = np.asarray(X_train)
    return X_train, y_train

In [39]:
X_train, y_train = vectorize(train_sentences)

Vectorizing Dataset...


In [40]:
X_train.shape

(211727, 900)

In [41]:
classes = sorted(list(set(y_train)))
print(classes)

['#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']


In [42]:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [43]:
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
# y_val = le.transform(y_val)
y_train = keras.utils.to_categorical(y_train)
# y_val = keras.utils.to_categorical(y_val)

print(y_train.shape)

(211727, 44)


In [35]:
# model = Sequential()
# # In the first layer, we specify the input data shape

# model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
# model.add(Dropout(0.5))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.4))
# model.add(Dense(y_train.shape[1], activation='softmax'))

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# model.summary()

In [36]:
# nb_epoch = 10
# batch_size = 128
# early_stopping = EarlyStopping(monitor = 'val_acc', patience = 5)
# history = model.fit(X_train, y_train,
#                     epochs=nb_epoch,
#                     batch_size=batch_size,
#                     shuffle=True,
#                     validation_data=(X_val, y_val),
#                     verbose=1,
#                     callbacks=[early_stopping])

In [37]:
# X_full_train = vstack((X_train, X_val)).tocsr()
# y_full_train = np.append(y_train, y_val, axis=0)

In [38]:
# X_train.shape

(169381, 70531)

In [118]:
model = Sequential()

model.add(Dense(512, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.4))
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 512)               461312    
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 dense_10 (Dense)            (None, 64)                32832     
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_11 (Dense)            (None, 44)                2860      
                                                                 
Total params: 497,004
Trainable params: 497,004
Non-trainable params: 0
_________________________________________________________________


In [119]:
model.fit(X_train, y_train,
                    epochs=20,
                    batch_size=32,
                    verbose=1,
                   ) 

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9d6a762910>

In [142]:
# model.save("my_model.h5")

In [45]:
# model = keras.models.load_model(save_format='h5', "my_model.h5")

2023-03-10 21:59:31.257721: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-10 21:59:31.257913: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-10 21:59:31.260761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-03-10 21:59:31.261050: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2023-03-10 21:59:31.261091: W tensorflow/c

In [120]:
pos_test = os.path.join(POS_DIR, "test.txt")

In [121]:
test_sentences = format_data(pos_test, False)

In [122]:
dim = embeddings.vectors.shape[1]
pad = np.zeros(dim)

In [123]:
pos_test_correct = '/home/chitrang/Downloads/test.txt'

In [124]:
correct_test_sen = format_data(pos_test_correct)

In [125]:
def compare_results(predicted_sen, correct_sen, acc = True):
    total = 0
    correct_total = 0
    if acc:
        for predict, correct in zip(predicted_sen, correct_sen):
                if predict[1] == correct[1]:
                    correct_total = correct_total + 1
                total = total + 1
        acc = correct_total / total
        return acc
    else:
        for predict, correct in zip(predicted_sen, correct_sen):
                if predict[1] == correct[1]:
                    print(f"{predict[1]} \t {correct[1]}")
                else:
                    print(f"{predict[1]} \t {correct[1]} <------ Error")

In [126]:
test_sentences = format_data(pos_test, False)
preprocessed_test_data = []
def embed_test_sentences(sentence):
    X_embs = [x[1][0] for x in sentence]
    X_embs = np.asarray(X_embs)
    return X_embs

def preprocess_unlabelled_test_data(test_sentences):
    for sentence in test_sentences:
        sentence = transform_test_sentence(sentence, 1)
        embedded = embed_test_sentences(sentence)
        preprocessed_test_data.append(embedded)


In [127]:
preprocess_unlabelled_test_data(test_sentences)

In [128]:
predicted_data = []
arg_max_dict = []
def test_set_predictions(preprocessed_test_data, test_sentences):
    for sentence in preprocessed_test_data:
        predict_x=model.predict(sentence, batch_size=1, verbose=0) 
        predict_x = np.argmax(predict_x, axis=1)
        arg_max_dict.append(predict_x)
        
    for index in range(len(test_sentences)):
        predicted_sen = list(zip(test_sentences[index], le.inverse_transform(arg_max_dict[index])))
        predicted_data.append(predicted_sen)


In [129]:
test_set_predictions(preprocessed_test_data, test_sentences)

In [130]:
predicted_data

[[('rockwell', 'NNP'),
  ('international', 'NNP'),
  ('corp.', 'NNP'),
  ("'s", 'POS'),
  ('tulsa', 'NNP'),
  ('unit', 'VBP'),
  ('said', 'VBD'),
  ('it', 'PRP'),
  ('signed', 'VBD'),
  ('a', 'DT'),
  ('tentative', 'JJ'),
  ('agreement', 'NN'),
  ('extending', 'VBG'),
  ('its', 'PRP$'),
  ('contract', 'NN'),
  ('with', 'IN'),
  ('boeing', 'NNP'),
  ('co.', 'NNP'),
  ('to', 'TO'),
  ('provide', 'VB'),
  ('structural', 'JJ'),
  ('parts', 'NNS'),
  ('for', 'IN'),
  ('boeing', 'NNP'),
  ("'s", 'POS'),
  ('747', 'CD'),
  ('jetliners', 'NNS'),
  ('.', '.')],
 [('rockwell', 'NNP'),
  ('said', 'VBD'),
  ('the', 'DT'),
  ('agreement', 'NN'),
  ('calls', 'VBZ'),
  ('for', 'IN'),
  ('it', 'PRP'),
  ('to', 'TO'),
  ('supply', 'VB'),
  ('200', 'CD'),
  ('additional', 'JJ'),
  ('so-called', 'JJ'),
  ('shipsets', 'NN'),
  ('for', 'IN'),
  ('the', 'DT'),
  ('planes', 'NNS'),
  ('.', '.')],
 [('these', 'DT'),
  ('include', 'VBP'),
  (',', ','),
  ('among', 'IN'),
  ('other', 'JJ'),
  ('parts', 'NNS'),


In [137]:
def compare_with_test_set(correct_set):
    total = 0
    correct = 0
    for predicted_sentence, correct_sentence in zip(predicted_data, correct_set):
        for predicted_word, correct_word in zip(predicted_sentence, correct_sentence):
            total = total + 1
            if predicted_word[1] == correct_word[1]:
                print(predicted_word[0], predicted_word[1], correct_word[1])
                correct = correct + 1
    
    accuracy = (correct / total) * 100
    return accuracy

In [138]:
compare_with_test_set(correct_test_sen)

rockwell NNP NNP
international NNP NNP
corp. NNP NNP
's POS POS
tulsa NNP NNP
said VBD VBD
it PRP PRP
signed VBD VBD
a DT DT
tentative JJ JJ
agreement NN NN
extending VBG VBG
its PRP$ PRP$
contract NN NN
with IN IN
boeing NNP NNP
co. NNP NNP
to TO TO
provide VB VB
structural JJ JJ
parts NNS NNS
for IN IN
boeing NNP NNP
's POS POS
747 CD CD
jetliners NNS NNS
. . .
rockwell NNP NNP
said VBD VBD
the DT DT
agreement NN NN
calls VBZ VBZ
for IN IN
it PRP PRP
to TO TO
supply VB VB
200 CD CD
additional JJ JJ
so-called JJ JJ
for IN IN
the DT DT
planes NNS NNS
. . .
these DT DT
include VBP VBP
, , ,
among IN IN
other JJ JJ
parts NNS NNS
, , ,
each DT DT
jetliner NN NN
's POS POS
two CD CD
major JJ JJ
bulkheads NNS NNS
, , ,
a DT DT
pressure NN NN
floor NN NN
, , ,
torque NN NN
box NN NN
, , ,
fixed VBN VBN
leading VBG VBG
edges NNS NNS
for IN IN
the DT DT
wings NNS NNS
and CC CC
an DT DT
beam NN NN
. . .
under IN IN
the DT DT
existing VBG VBG
contract NN NN
, , ,
rockwell NNP NNP
said VBD VBD
, 

years NNS NNS
old JJ JJ
, , ,
was VBD VBD
chief JJ JJ
executive NN NN
of IN IN
the DT DT
distributor NN NN
of IN IN
bearings NNS NNS
and CC CC
power-transmission JJ JJ
products NNS NNS
from IN IN
1982 CD CD
to TO TO
1988 CD CD
. . .
he PRP PRP
will MD MD
continue VB VB
as IN IN
a DT DT
director NN NN
. . .
mr. NNP NNP
lamore NNP NNP
, , ,
63 CD CD
, , ,
a DT DT
48-year JJ JJ
veteran NN NN
at IN IN
, , ,
has VBZ VBZ
been VBN VBN
president NN NN
since IN IN
1983 CD CD
. . .
mr. NNP NNP
dannemiller NNP NNP
, , ,
51 CD CD
, , ,
joined VBD VBD
in IN IN
august NNP NNP
1988 CD CD
from IN IN
transportation NNP NNP
corp. NNP NNP
, , ,
where WRB WRB
he PRP PRP
was VBD VBD
president NN NN
and CC CC
chief JJ JJ
operating VBG VBG
officer NN NN
. . .
he PRP PRP
has VBZ VBZ
been VBN VBN
a DT DT
director NN NN
since IN IN
1985 CD CD
. . .
the DT DT
appointments NNS NNS
are VBP VBP
part NN NN
of IN IN
a DT DT
planned VBN VBN
succession NN NN
at IN IN
the DT DT
company NN NN
. . .
soviet JJ JJ
leader NN

95.74688139814678

In [133]:
def generate_labelled_data(file_name):
    f = open(file_name, "w")
    for sentence in predicted_data:
        for word, pos in sentence:
            f.write(f"{word} {pos}\n")
        f.write(f"\n")
    f.close()

In [171]:
# Ensemble
predicted_data_from_MLP = format_data('labelled_from_MLP.txt')
predicted_data_from_logR = format_data('./../LR/output.txt')
# predicted_data_from_LogR = load_ensemble_model()

FileNotFoundError: [Errno 2] No such file or directory: './../LR/output.txt'

In [169]:
def ensemble_accuracy(predicted_data_from_MLP, predicted_data_from_logR, correct_test_set):
    total = 0
    correct = 0
    for MLP_data, logR_data, correct_data in zip(predicted_data_from_MLP, predicted_data_from_logR, correct_test_set):
        for MLP_tuple, logR_tuple, correct_data_tuple in zip(MLP_data, logR_data, correct_data):
            if MLP_tuple[1] == correct_data_tuple[1] or logR_tuple[1] == correct_data_tuple[1]:
                correct = correct + 1
            total = total + 1
    acc = ( correct / total ) * 100
    return acc

In [170]:
ensemble_accuracy(predicted_data_from_MLP, predicted_data_from_logR, correct_test_sen)

96.57217637250143