In [1]:
import nltk
# nltk.download() # Go to corpora tab and download brown and conll2000

In [2]:
from nltk.corpus import brown
from gensim.models import Word2Vec
import multiprocessing

In [3]:
# Load the Brown corpus as a sequence of tokenised sentences (each sentence is a list of
# string tokens) and print one sentence as a sample.
sentences = brown.sents()
print(sentences[1])

['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']


In [4]:
# Dimensionality of the learned embeddings. Any value works; around 300-500 is a common choice.
emb_dim = 300

# NOTE(review): `size` and `iter` are the gensim < 4.0 keyword names (gensim 4 renamed them to
# `vector_size` and `epochs`) -- keep them in sync with the installed gensim version.
w2v = Word2Vec(
    sentences,                            # corpus to train on: the Brown sentences
    size=emb_dim,                         # embedding dimensions
    window=5,                             # this many words on the left and on the right form the context
    min_count=5,                          # a word must occur at least this often to enter the vocabulary
    negative=15,                          # number of samples considered for negative sampling
    iter=10,                              # number of training iterations
    workers=multiprocessing.cpu_count(),  # number of threads put to work
)

In [5]:
# Keep a handle on the trained word vectors exposed through the model's `wv` attribute.
word_vectors = w2v.wv

In [12]:
# The Word2Vec model object has no `shape`; the embedding matrix is `word_vectors.vectors`.
# Its shape is (vocabulary size, embedding dimension).
word_vectors.vectors.shape

(15173, 300)

In [21]:
# Sanity-check the embeddings: look up the words most similar to "boy" and show the top three
# as (word, similarity score) pairs.
result = word_vectors.similar_by_word("boy")
print("Most similar words are:\n ",result[:3])

Most similar words are:
  [('girl', 0.9193911552429199), ('woman', 0.8401344418525696), ('man', 0.7483998537063599)]


We now have our own word embeddings ready, and the nearest neighbours shown above suggest that the vectors are of quite good quality!

Let's try using these word embeddings for a task, say Part-of-Speech (POS) tagging, wherein every word of a sentence is tagged with its part of speech. We will use the conll2000 dataset for this purpose.

In [22]:
from nltk.corpus import conll2000
from keras.layers import Dense, Embedding, Activation, Flatten
from keras.models import Sequential
from keras.utils import to_categorical
import numpy as np
import collections

In [23]:
# Load the CoNLL-2000 train and test splits as flat sequences of (word, POS tag) pairs
# and print the first few training pairs as a sample.
train_words = conll2000.tagged_words("train.txt")
test_words = conll2000.tagged_words("test.txt")
print(train_words[:5])

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ')]


We will convert our data into a more model-friendly format. To do so, let's split the words from their tags and then map each word to the index at which it is present in our word2vec vocabulary. But first, let's convert all POS tags to unique integer IDs as well.

In [27]:
def get_tag_vocabulary(tagged_words):
    """
    Build a POS -> integer id lookup from text given as (word, pos) items.

    Ids are handed out in order of first appearance, starting at 0, so every
    distinct POS ends up with its own unique id.
    """
    pos_ids = {}
    for entry in tagged_words:
        pos = entry[1]
        # Only a POS we have not met yet receives the next free id.
        if pos not in pos_ids:
            pos_ids[pos] = len(pos_ids)
    return pos_ids

# `word_vectors.vocab` maps every word to a gensim vocabulary object, but we want a plain
# word -> integer mapping, so we keep only each object's `.index`.
# NOTE(review): `.vocab` is the gensim < 4.0 API (gensim 4 exposes `key_to_index` instead) --
# confirm against the installed gensim version.
word2id = {k:v.index for k,v in word_vectors.vocab.items()}
tag2id = get_tag_vocabulary(train_words)  # POS -> unique id dictionary, built from the training split

We now have all POS tags mapped to unique IDs, and we also have our word2vec vocabulary in the form of a word->int mapping. We can now map all the words in the training dataset to their ids using word2id. However, there is one issue: there can be words in the training dataset which aren't present in our dictionary. To all such words we can assign the unknown token (UNK). To do so, we must first add a UNK entry to our word2vec dictionary. Let's do that!

In [32]:
def add_new_word(new_word,new_vector,new_index,embedding_matrix,word2id):
    """
    Adds a new word to an existing set of word embeddings.

    The vector is inserted as a new row at position `new_index` and every word that was at
    that position or later is moved one id up, so the matrix rows and the ids stay aligned.
    Neither input is modified; a new matrix and a new dictionary are returned.

    Parameters
    ----------
    new_word : word to add; must not already be a key of `word2id`
    new_vector : embedding of the new word (one value per embedding dimension)
    new_index : row at which the word is inserted, 0 <= new_index <= number of rows
    embedding_matrix : 2-D array holding one embedding per row
    word2id : dict mapping each word to its row in `embedding_matrix`

    Returns
    -------
    (embedding_matrix, word2id) with the new word included.

    Raises
    ------
    ValueError
        If `new_word` is already present (inserting it again would leave a stale row behind
        and shift every id once more) or if `new_index` lies outside the matrix.
    """
    if new_word in word2id:
        raise ValueError("%r is already present in word2id" % (new_word,))
    row_count = len(embedding_matrix)
    if not 0 <= new_index <= row_count:
        raise ValueError("new_index must be between 0 and %d, got %r" % (row_count, new_index))

    # inserting vector before given index, along axis 0
    embedding_matrix = np.insert(embedding_matrix, [new_index], [new_vector], axis=0)
    # Shift the ids of all words at or after the insertion point by 1 to make room for the new word
    word2id = {word: (index + 1) if index >= new_index else index for word, index in word2id.items()}
    word2id[new_word] = new_index
    return embedding_matrix, word2id

UNK_index = 0 # we will keep the unknown word at index 0
UNK_token = "UNK"

# Start from the pristine word2vec matrix AND the pristine word -> id mapping every time, so
# that re-running this cell cannot shift the ids in `word2id` a second time while the matrix
# is reset (which would silently misalign words and vectors).
embedding_matrix = word_vectors.vectors
word2id = {k: v.index for k, v in word_vectors.vocab.items()}
unk_vector = embedding_matrix.mean(0) # The value of UNK vector is average of all vectors
embedding_matrix, word2id = add_new_word(UNK_token,unk_vector, UNK_index, embedding_matrix,word2id)

Now let's associate all our training words with our word2id dictionary!

In [33]:
def get_int_data(tagged_words, word2id, tag2id, unk_index=None):
    """
    Replace all words and tags with their corresponding ids from our dictionaries.

    Parameters
    ----------
    tagged_words : iterable of (word, pos) pairs
    word2id : dict mapping known words to integer ids
    tag2id : dict mapping every POS tag to an integer id
    unk_index : id used for words missing from `word2id`; when omitted, the notebook-level
        `UNK_index` is used

    Returns
    -------
    (x, y) : two 1-D integer arrays holding the word ids and the tag ids, in input order.

    Raises
    ------
    KeyError
        If a tag is missing from `tag2id`. Previously such tags became `None` labels and only
        failed later, inside the one-hot encoding, with an unrelated error.

    Also prints the fraction of words that had to be mapped to the unknown id.
    """
    if unk_index is None:
        unk_index = UNK_index

    x, y = [], []
    unk_count = 0

    for word, tag in tagged_words:
        if tag not in tag2id:
            raise KeyError("POS tag %r is missing from tag2id" % (tag,))
        y.append(tag2id[tag])
        if word in word2id:
            x.append(word2id[word])
        else:
            x.append(unk_index)
            unk_count += 1

    # Guard the ratio so that an empty input does not raise ZeroDivisionError.
    total = len(x)
    unk_ratio = unk_count / total if total else 0.0
    print("Data created. Unknown data percentage: %.3f" % unk_ratio)
    # dtype=int keeps the arrays integer-typed even when the input is empty.
    return np.array(x, dtype=int), np.array(y, dtype=int)

x_train, y_train = get_int_data(train_words, word2id, tag2id)
x_test, y_test = get_int_data(test_words, word2id, tag2id)
# Pin the one-hot width to the number of known tags. Without `num_classes`, to_categorical
# infers the width from the largest id present in each split separately, so the train and
# test labels could end up with different widths (and differ from the model's output size).
y_train = to_categorical(y_train, num_classes=len(tag2id))
y_test = to_categorical(y_test, num_classes=len(tag2id))

Data created. Unknown data percentage: 0.143
Data created. Unknown data percentage: 0.149


Now we have model friendly data. Let's define our model to classify words now. Our model will take as input an index into the word embedding matrix, which will be used to look up the appropriate embedding. It will have one hidden layer with the tanh activation function and at the final layer will use the softmax activation — outputting a probability distribution over all possible tags.

In [34]:
hidden_size = 50  # number of units in the hidden Dense layer of the tagger models
batch_size = 128  # batch size passed to fit() when training the models

def define_model(embedding_matrix, class_count):
    """
    Takes one word as input and returns its part of speech.

    Builds and compiles a small feed-forward classifier: an embedding lookup initialised with
    `embedding_matrix`, a tanh hidden layer of `hidden_size` units and a softmax layer over
    `class_count` tags.

    Parameters
    ----------
    embedding_matrix : 2-D array of shape (vocabulary size, embedding dimension)
    class_count : number of POS tags to predict

    Returns
    -------
    The compiled Keras model.
    """
    # Read both dimensions from the matrix itself instead of hard-coding the embedding size,
    # so the model keeps working when the embeddings are trained with a different dimension.
    vocab_length, embedding_dim = np.asarray(embedding_matrix).shape
    model = Sequential()
    # Input dimension is the length of our vocabulary, output is the embedding dimension
    # We load our pretrained word2vec weights and set the input size to be 1
    model.add(Embedding(input_dim=vocab_length,output_dim=embedding_dim,weights=[embedding_matrix],input_length=1))
    model.add(Flatten())
    model.add(Dense(hidden_size,activation="tanh"))
    model.add(Dense(class_count,activation="softmax"))
    model.compile(optimizer="Adam",loss="categorical_crossentropy",metrics=["accuracy"])
    return model

# Build the single-word tagger with one output class per POS tag and print its layer summary.
pos_model = define_model(embedding_matrix, len(tag2id))
pos_model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 300)            4552200   
_________________________________________________________________
flatten_1 (Flatten)          (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                15050     
_________________________________________________________________
dense_2 (Dense)              (None, 44)                2244      
Total params: 4,569,494
Trainable params: 4,569,494
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Train the single-word tagger for one epoch on the training split.
pos_model.fit(x_train,y_train,epochs=1,verbose=1,batch_size=batch_size)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1


<keras.callbacks.History at 0x7f25443e7f98>

In [36]:
# evaluate() yields the loss followed by the metrics the model was compiled with (accuracy);
# the loss is discarded and only the test accuracy is reported.
_, acc = pos_model.evaluate(x_test, y_test)
print("Accuracy: %.2f" % acc)

Accuracy: 0.85


Voila! 85% accuracy on the testing dataset with our model, which was given one word at a time. What if we passed multiple words at a time to provide some context?

 Something like {word1, word2, word3, word4, word5}, where the tag to be predicted is that of word3. So, in a way, we are passing word3 to our model along with its 2 neighboring words on either side. We will need to make certain changes in our model for this purpose.

 Also, we will need a token to be used for padding in case a word does not have 2 neighboring words on a side. We call this the EOS (end of sentence) word. We will add it to our dictionary in the same way as the UNK that we added.

In [37]:
eos_index = 1
eos_tag = "EOS"
# Only insert the padding word once: re-running this cell must not add a second EOS row and
# shift every word id again, which would misalign `word2id` and `embedding_matrix`.
if eos_tag not in word2id:
    # Vector value will be kept random; its length follows the embedding matrix instead of a
    # hard-coded 300 so it stays valid for any embedding dimension.
    eos_vector = np.random.standard_normal(embedding_matrix.shape[1])
    embedding_matrix, word2id = add_new_word(eos_tag,eos_vector,eos_index,embedding_matrix,word2id)

Let's prepare the data for our context-dependent model.

In [38]:
context_size = 2  # number of neighbouring words taken on each side of the centre word

def get_window_int_data(tagged_words, word2id, tag2id, window=None, pad_token=None, unk_index=None):
    """
    Turn (word, pos) pairs into fixed-size windows of word ids centred on each word.

    One sample is produced per word: the ids of the `window` words before it, the word itself
    and the `window` words after it. Positions before the first or after the last word of
    `tagged_words` are filled with the id of `pad_token`. The label of a sample is the tag id
    of its centre word.

    NOTE(review): `tagged_words` is treated as one continuous stream, so windows next to a
    sentence boundary contain words of the neighbouring sentence; padding only occurs at the
    two ends of the whole sequence.

    Parameters
    ----------
    tagged_words : iterable of (word, pos) pairs
    word2id : dict mapping known words (including the padding word) to integer ids
    tag2id : dict mapping every POS tag to an integer id
    window : words taken on each side; when omitted, the notebook-level `context_size`
    pad_token : padding word; when omitted, the notebook-level `eos_tag`
    unk_index : id for words missing from `word2id`; when omitted, the notebook-level `UNK_index`

    Returns
    -------
    (x, y) : `x` is an integer array of shape (number of words, 2 * window + 1) and `y` a 1-D
    integer array with the tag id of each centre word.

    Raises
    ------
    KeyError
        If a tag is missing from `tag2id`. Previously such tags became `None` labels and only
        failed later, inside the one-hot encoding, with an unrelated error.

    Also prints the fraction of words that are missing from `word2id`.
    """
    if window is None:
        window = context_size
    if pad_token is None:
        pad_token = eos_tag
    if unk_index is None:
        unk_index = UNK_index

    span = 2 * window + 1  # centre word plus `window` neighbours on each side

    # Look every word up exactly once instead of once per window it appears in.
    word_ids, tag_ids = [], []
    unk_count = 0
    for word, tag in tagged_words:
        if tag not in tag2id:
            raise KeyError("POS tag %r is missing from tag2id" % (tag,))
        tag_ids.append(tag2id[tag])
        if word in word2id:
            word_ids.append(word2id[word])
        else:
            word_ids.append(unk_index)
            unk_count += 1

    # Pad both ends and slide a window over the padded ids. Unlike filling a rolling buffer,
    # this yields exactly one full-width window per word even when the input is shorter than
    # `window`.
    pad_id = word2id.get(pad_token, unk_index)
    edge = [pad_id] * window
    padded = edge + word_ids + edge
    x = [padded[start:start + span] for start in range(len(word_ids))]

    # Guard the ratio so that an empty input does not raise ZeroDivisionError.
    total = len(word_ids)
    unk_ratio = unk_count / total if total else 0.0
    print("Data created. Percentage of unknown words: %.3f" % unk_ratio)
    # reshape keeps the (samples, span) layout even when there are no samples at all.
    return np.array(x, dtype=int).reshape(-1, span), np.array(tag_ids, dtype=int)

Let's now define the model that we will train. The only thing that changes is that the embedding layer now takes an input of length 5 instead of 1.

In [39]:
def define_context_model(embedding_matrix, class_count):
    """
    Takes a word and its context as input and returns its part of speech.

    Builds and compiles the same classifier as the single-word model, except that the
    embedding layer receives a window of 2 * context_size + 1 word ids whose embeddings are
    flattened into one vector before the hidden layer.

    Parameters
    ----------
    embedding_matrix : 2-D array of shape (vocabulary size, embedding dimension)
    class_count : number of POS tags to predict

    Returns
    -------
    The compiled Keras model.
    """
    inp = 2*context_size + 1  # centre word plus context_size neighbours on each side
    # Read both dimensions from the matrix itself instead of hard-coding the embedding size,
    # so the model keeps working when the embeddings are trained with a different dimension.
    vocab_length, embedding_dim = np.asarray(embedding_matrix).shape
    model = Sequential()
    # Input dimension is the length of our vocabulary, output is the embedding dimension
    # We load our pretrained word2vec weights and set the input size to the window length
    model.add(Embedding(input_dim=vocab_length,output_dim=embedding_dim,weights=[embedding_matrix],input_length=inp))
    model.add(Flatten())
    model.add(Dense(hidden_size,activation="tanh"))
    model.add(Dense(class_count,activation="softmax"))
    model.compile(optimizer="Adam",loss="categorical_crossentropy",metrics=["accuracy"])
    return model

In [40]:
x_train2, y_train2 = get_window_int_data(train_words, word2id, tag2id)
x_test2, y_test2 = get_window_int_data(test_words, word2id, tag2id)
# Pin the one-hot width to the number of known tags. Without `num_classes`, to_categorical
# infers the width from the largest id present in each split separately, so the train and
# test labels could end up with different widths (and differ from the model's output size).
y_train2 = to_categorical(y_train2, num_classes=len(tag2id))
y_test2 = to_categorical(y_test2, num_classes=len(tag2id))

Data created. Percentage of unknown words: 0.143
Data created. Percentage of unknown words: 0.149


In [41]:
# Build the context-window tagger with one output class per POS tag and train it for one
# epoch on the windowed training data.
cs_pos_model = define_context_model(embedding_matrix,len(tag2id))
cs_pos_model.fit(x_train2,y_train2,batch_size=batch_size,epochs=1,verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7f2551303cc0>

In [42]:
# evaluate() yields the loss followed by the metrics the model was compiled with (accuracy);
# the loss is discarded and only the test accuracy of the context model is reported.
_, acc2 = cs_pos_model.evaluate(x_test2,y_test2)
print("Accuracy: %.2f" % acc2)

Accuracy: 0.91


91% accuracy, holy moly! Just by training for 1 epoch and with 2 context words on each side. Not to forget that we used our very own derived embeddings for this. Party time! But before that, let's save the models with their weights to avoid retraining in the future :)

In [43]:
# Persist both trained models to .h5 files so they do not have to be retrained.
# The file names record the test accuracy reached by each model in this run (85% and 91%).
pos_model.save('single_word_85test.h5')
cs_pos_model.save('context_word_91test.h5')

See you in the next episode, hasta la vista!