## Function to compute word frequencies, the maximum line length (in tokens), and the number of lines in a given file

In [9]:
import collections
def parse_sentences(filename):
    """Collect vocabulary statistics from a whitespace-tokenised text file.

    Every line is treated as one record: it is stripped, lower-cased and
    split on whitespace before being counted.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(words_freq, max_len, num_records)`` where ``words_freq`` is
        a ``collections.Counter`` mapping each lower-cased token to its number
        of occurrences, ``max_len`` is the token count of the longest line and
        ``num_records`` is the number of lines read (empty lines included).
    """
    words_freq=collections.Counter()
    num_records,max_len=0,0
    # The context manager closes the file even if reading fails part-way.
    with open(filename,'r') as fin:
        for line in fin:
            words=line.strip().lower().split()
            words_freq.update(words)
            max_len=max(max_len,len(words))
            num_records+=1
    return words_freq,max_len,num_records

## Compute word frequencies for the treebank sentences and for the treebank tags

In [10]:
import os
# Directory holding treebank_sents.txt and treebank_poss.txt. The original
# location stays the default, but it can be overridden through the
# TREEBANK_DATA_DIR environment variable so the notebook also runs elsewhere.
data_dir=os.environ.get("TREEBANK_DATA_DIR","/home/santhosh/keras/many-to-many(NER)/data")
# Token counts, longest line (in tokens) and number of lines for the sentence
# file and for the tag file.
s_words_freq,s_max_len,s_num_records=parse_sentences(os.path.join(data_dir,"treebank_sents.txt"))
t_words_freq,t_max_len,t_num_records=parse_sentences(os.path.join(data_dir,"treebank_poss.txt"))
print(len(s_words_freq),s_max_len,s_num_records)
print(len(t_words_freq),t_max_len,t_num_records)

10947 249 3914
45 249 3914


## Based on the above results, we set the maximum sequence length to 20 (the longest line has 249 tokens, so longer sentences will be truncated), the sentence vocabulary size to 5000, and the tag vocabulary size to 45

In [36]:
# Number of tokens per sentence fed to the model; used as the padding length
# in build_tensor and as the model's input length.
max_seqlen=20
# Upper bound on how many of the most frequent sentence words get their own index.
s_max_feature=5000
# Upper bound on how many of the most frequent tags get their own index.
t_max_feature=45

In [37]:
# +2 leaves room for the two reserved sentence ids: 0 ('PAD') and 1 ('UNK').
s_vocabsize=min(len(s_words_freq),s_max_feature)+2
# +1 leaves room for the single reserved tag id: 0 ('PAD').
t_vocabsize=t_max_feature+1

## Generate word2index and index2word for the 5000 most common words in the sentences

In [67]:
# Real words are numbered from 2 in decreasing order of frequency; ids 0 and 1
# are reserved for the padding and unknown-word markers added below.
s_word2index={word:rank for rank,(word,_) in enumerate(s_words_freq.most_common(s_max_feature),start=2)}
s_word2index['PAD']=0
s_word2index['UNK']=1
# Reverse lookup (id -> word), used to turn model inputs back into text.
s_index2word=dict(zip(s_word2index.values(),s_word2index.keys()))
s_index2word

{0: 'PAD',
 1: 'UNK',
 2: ',',
 3: 'the',
 4: '.',
 5: 'of',
 6: 'to',
 7: 'a',
 8: 'in',
 9: 'and',
 10: "'s",
 11: 'for',
 12: 'that',
 13: '$',
 14: '``',
 15: "''",
 16: 'is',
 17: 'said',
 18: 'it',
 19: 'on',
 20: '%',
 21: 'by',
 22: 'at',
 23: 'as',
 24: 'with',
 25: 'from',
 26: 'million',
 27: 'mr.',
 28: 'are',
 29: 'was',
 30: 'be',
 31: 'its',
 32: 'has',
 33: 'an',
 34: 'new',
 35: 'have',
 36: "n't",
 37: 'but',
 38: 'he',
 39: 'or',
 40: 'will',
 41: 'they',
 42: 'company',
 43: '--',
 44: 'which',
 45: 'this',
 46: 'u.s.',
 47: 'says',
 48: 'year',
 49: 'about',
 50: 'would',
 51: 'more',
 52: 'were',
 53: 'market',
 54: 'their',
 55: 'than',
 56: 'stock',
 57: ';',
 58: 'trading',
 59: 'who',
 60: 'had',
 61: 'also',
 62: 'president',
 63: 'billion',
 64: 'up',
 65: 'one',
 66: 'been',
 67: 'some',
 68: ':',
 69: 'program',
 70: 'other',
 71: 'not',
 72: 'his',
 73: 'because',
 74: 'if',
 75: 'could',
 76: 'share',
 77: 'all',
 78: 'corp.',
 79: 'years',
 80: 'i',
 81

## Generate word2index and index2word for the tags

In [65]:
# Tags are numbered from 1 in decreasing order of frequency; id 0 is reserved
# for the padding marker added below.
t_word2index={tag:rank for rank,(tag,_) in enumerate(t_words_freq.most_common(t_max_feature),start=1)}
t_word2index['PAD']=0
# Reverse lookup (id -> tag), used to decode the model's predictions.
t_index2word=dict(zip(t_word2index.values(),t_word2index.keys()))
t_index2word

{0: 'PAD',
 1: 'nn',
 2: 'in',
 3: 'nnp',
 4: 'dt',
 5: 'nns',
 6: 'jj',
 7: ',',
 8: '.',
 9: 'cd',
 10: 'vbd',
 11: 'rb',
 12: 'vb',
 13: 'cc',
 14: 'to',
 15: 'vbn',
 16: 'vbz',
 17: 'prp',
 18: 'vbg',
 19: 'vbp',
 20: 'md',
 21: 'pos',
 22: 'prp$',
 23: '$',
 24: '``',
 25: "''",
 26: ':',
 27: 'wdt',
 28: 'jjr',
 29: 'nnps',
 30: 'wp',
 31: 'rp',
 32: 'jjs',
 33: 'wrb',
 34: 'rbr',
 35: '-rrb-',
 36: '-lrb-',
 37: 'ex',
 38: 'rbs',
 39: 'pdt',
 40: '#',
 41: 'wp$',
 42: 'ls',
 43: 'fw',
 44: 'uh',
 45: 'sym'}

## Now create the dataset used for training


### X contains one row per sentence; each row is a sequence of word indices
### Y contains one row per sentence; each row is a sequence of tag indices encoded as one-hot vectors

In [68]:
from keras.preprocessing import sequence
from keras.utils import np_utils
import numpy as np
def build_tensor(filename,numrecords,word2index,maxlen,make_category=False,num_classes=0):
    """Encode a whitespace-tokenised text file as padded index sequences.

    Each line is stripped, lower-cased and split on whitespace; every token is
    replaced by its id from ``word2index`` (tokens missing from the mapping use
    the id stored under ``'UNK'``).

    Args:
        filename: path of the text file to encode, one record per line.
        numrecords: number of lines in the file; sizes the intermediate array.
        word2index: mapping from token to integer id.
        maxlen: sequence length passed to ``sequence.pad_sequences``.
        make_category: when True, each line's ids are converted with
            ``np_utils.to_categorical`` before padding.
        num_classes: number of classes passed to ``to_categorical``; only used
            when ``make_category`` is True.

    Returns:
        The result of ``sequence.pad_sequences`` applied to the encoded lines.

    Raises:
        KeyError: if a token is not in ``word2index`` and the mapping has no
            ``'UNK'`` entry.
    """
    data=np.empty((numrecords),dtype=list)
    # The context manager closes the file even if a lookup or conversion below
    # raises, which the previous manual close() did not guarantee.
    with open(filename,'r') as fin:
        for i,line in enumerate(fin):
            words=line.strip().lower().split()
            # 'UNK' is looked up lazily, so a vocabulary without that entry
            # only fails when an unknown token is actually encountered.
            wids=[word2index[word] if word in word2index else word2index['UNK'] for word in words]
            if make_category:
                data[i]=np_utils.to_categorical(wids,num_classes=num_classes)
            else:
                data[i]=wids
    pdata=sequence.pad_sequences(data,maxlen=maxlen)
    return pdata

In [69]:
# Encode every sentence as a sequence of word ids padded to max_seqlen.
X = build_tensor(os.path.join(data_dir,"treebank_sents.txt"),s_num_records,s_word2index,max_seqlen)

In [70]:
# Encode every tag sequence as one-hot vectors over t_vocabsize classes, padded to max_seqlen.
Y =build_tensor(os.path.join(data_dir,"treebank_poss.txt"),t_num_records,t_word2index,max_seqlen,True,t_vocabsize)

In [71]:
# Inspect the first encoded sentence and its one-hot tag matrix.
X[0],Y[0]

(array([   0,    0,    1, 4844,    2, 2148,   79,  317,    2,   40, 2703,
           3,  124,   23,    7, 2304,  318,  463, 2111,    4], dtype=int32),
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0],
        [0, 0, 0, 0

## Preparing the train/test split

In [72]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

## preparing our model

In [73]:
from keras.layers.core import Activation, Dense, Dropout, RepeatVector,SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential

## Training Parameters

In [74]:
# Output dimension of the Embedding layer.
EMBED_SIZE = 128
# Number of units in each GRU layer.
HIDDEN_SIZE = 128
# Batch size used for both fitting and evaluation.
BATCH_SIZE = 32
# Epochs run by each model.fit call.
NUM_EPOCHS = 5
print(max_seqlen,t_vocabsize)
# Number of fit / evaluate / sample rounds in the training loop.
NUM_OF_ITERATION=20

20 46


## Training and evaluating the model, then spot-checking its predictions

In [75]:
# Sequence-to-sequence tagger: the first GRU condenses the embedded sentence
# into a single vector, RepeatVector copies it once per time step, and the
# second GRU plus a time-distributed softmax emits one tag distribution per
# position.
model = Sequential()
model.add(Embedding(s_vocabsize, EMBED_SIZE,
input_length=max_seqlen))
model.add(SpatialDropout1D(0.2))
model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(max_seqlen))
model.add(GRU(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))
#model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam",
metrics=["accuracy"])

for i in range(NUM_OF_ITERATION):
    print("Iteration: ",i+1)
    # validation_data is passed as a tuple, the form documented by Keras.
    model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,validation_data=(X_test,Y_test))
    score, acc = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
    print("Test score: %.3f, accuracy: %.3f" % (score, acc))
    # Spot check: tag one randomly chosen *training* sentence. The index is
    # drawn from the actual size of X_train instead of re-deriving it from the
    # split ratio as a float.
    index=np.random.randint(len(X_train))
    pred_text=[s_index2word[wid] for wid in X_train[index]]
    # Add a batch axis of size 1; the length follows max_seqlen rather than a
    # hard-coded 20.
    test_pred=X_train[index].reshape(1,max_seqlen)
    pred=model.predict(test_pred,verbose=0)[0]
    # Decode each position's most probable class id back into its tag.
    labels=[t_index2word[np.argmax(tags)] for tags in pred]
    print(pred_text,'\n\n',labels)

Iteration:  1
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 2.324, accuracy: 0.189
['$', '240', 'million', 'in', 'credit', 'and', 'loan', 'guarantees', 'in', 'fiscal', '1990', 'in', 'hopes', 'of', 'UNK', 'future', 'trade', 'and', 'investment', '.'] 

 ['nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'in', 'cd', 'cd', '.', '.']
Iteration:  2
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 2.234, accuracy: 0.213
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'continued', 'export', 'demand', 'also', 'supported', 'prices', '.'] 

 ['nnp', 'nnp', 'nnp', 'nnp', 'nnp', 'nnp', 'nnp', 'nnp', 'nnp', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', '.']
Iteration:  3
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 2.18

Epoch 4/5
Epoch 5/5
Test score: 1.814, accuracy: 0.313
['PAD', 'moody', "'s", 'said', 'those', 'returns', 'compare', 'with', 'a', '3.8', '%', 'total', 'return', 'for', 'longer-term', 'treasury', 'notes', 'and', 'bonds', '.'] 

 ['nnp', 'nnp', 'nnp', 'nnp', 'vbd', 'vbd', 'dt', 'dt', 'dt', 'nn', 'nn', 'jj', 'jj', 'nns', 'nns', 'nns', 'cc', 'nns', 'nns', '.']
Iteration:  9
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 1.807, accuracy: 0.315
['canada', 'UNK', '%', ';', 'germany', '9', '%', ';', 'japan', 'UNK', '%', ';', 'switzerland', '8.50', '%', ';', 'britain', '15', '%', '.'] 

 ['nnp', 'nnp', 'nn', ':', ':', ':', ':', ':', ':', ':', ':', ':', 'nnp', 'nnp', 'nnp', ':', 'cd', 'cd', 'cd', '.']
Iteration:  10
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 1.815, accuracy: 0.314
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', '``', 'you', "'ve", 'got',

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 1.855, accuracy: 0.338
['would', 'UNK', 'dividends', 'at', 'a', '12', '%', 'rate', ',', 'but', 'would', "n't", 'be', 'paid', 'for', 'the', 'first', 'two', 'years', '.'] 

 ['md', 'vb', 'vb', 'in', 'in', 'nn', 'cc', 'cc', 'cc', 'cc', 'md', 'vb', 'vb', 'vbn', 'vbn', 'in', 'jj', 'cd', 'nns', '.']
Iteration:  17
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 1.853, accuracy: 0.338
['of', 'utilities', ',', 'many', 'in', 'the', 'west', ',', 'that', 'already', 'have', 'added', 'expensive', 'UNK', 'equipment', 'or', 'UNK', 'UNK', 'UNK', '.'] 

 [',', ',', ',', 'in', 'dt', 'dt', 'dt', ',', ',', 'vbp', 'vbp', 'vbn', 'jj', 'nn', 'nn', 'cc', 'cc', 'jj', 'nns', '.']
Iteration:  18
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 1.884, accuracy: 0.341
['working', 'days', 'to', 'UNK', 'the', 'citations', 'and', 'proposed', 'pena

## Training Parameters

In [77]:
# Output dimension of the Embedding layer.
EMBED_SIZE = 128
# Number of units in each GRU layer.
HIDDEN_SIZE = 128
# Batch size used for both fitting and evaluation (doubled from the earlier run).
BATCH_SIZE = 64
# Epochs run by each model.fit call.
NUM_EPOCHS = 5
print(max_seqlen,t_vocabsize)
# Number of fit / evaluate / sample rounds in the training loop (up from 20 in the earlier run).
NUM_OF_ITERATION=100

20 46


In [None]:
# Fresh model with the same architecture as the earlier run, trained with the
# updated parameters: the first GRU condenses the embedded sentence into a
# single vector, RepeatVector copies it once per time step, and the second GRU
# plus a time-distributed softmax emits one tag distribution per position.
model = Sequential()
model.add(Embedding(s_vocabsize, EMBED_SIZE,
input_length=max_seqlen))
model.add(SpatialDropout1D(0.2))
model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(max_seqlen))
model.add(GRU(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))
#model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam",
metrics=["accuracy"])

for i in range(NUM_OF_ITERATION):
    print("Iteration: ",i+1)
    # validation_data is passed as a tuple, the form documented by Keras.
    model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,validation_data=(X_test,Y_test))
    score, acc = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
    print("Test score: %.3f, accuracy: %.3f" % (score, acc))
    # Spot check: tag one randomly chosen *training* sentence. The index is
    # drawn from the actual size of X_train instead of re-deriving it from the
    # split ratio as a float.
    index=np.random.randint(len(X_train))
    pred_text=[s_index2word[wid] for wid in X_train[index]]
    # Add a batch axis of size 1; the length follows max_seqlen rather than a
    # hard-coded 20.
    test_pred=X_train[index].reshape(1,max_seqlen)
    pred=model.predict(test_pred,verbose=0)[0]
    # Decode each position's most probable class id back into its tag.
    labels=[t_index2word[np.argmax(tags)] for tags in pred]
    print(pred_text,'\n\n',labels)

Iteration:  1
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 2.385, accuracy: 0.187
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'continuing', 'demand', 'for', 'dollars', 'from', 'japanese', 'investors', 'boosted', 'the', 'u.s.', 'currency', '.'] 

 ['nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', '.']
Iteration:  2
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test score: 2.299, accuracy: 0.195
['to', 'position', 'himself', 'as', 'a', 'friendly', 'investor', 'who', 'could', 'help', 'ual', 'chairman', 'stephen', 'wolf', 'revive', 'a', 'failed', 'labor-management', 'bid', '.'] 

 ['nnp', 'nnp', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', 'nn', '.', '.']
Iteration:  3
Train on 3131 samples, validate on 783 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
E