## function to find words frequency, maximum line length and number of lines in a given file

In [1]:
import collections
def parse_sentences(filename):
    words_freq=collections.Counter()
    num_records,max_len=0,0
    fin=open(filename,'r')
    for line in fin:
        words=line.strip().lower().split()
        for word in words:
            words_freq[word]+=1
        if len(words)>max_len:
            max_len=len(words)
        num_records+=1
    return words_freq,max_len,num_records

## compute word frequency from treebank sentences and from treebank tags

In [2]:
import os
data_dir="/home/santhosh/keras/many-to-many(NER)/data"
s_words_freq,s_max_len,s_num_records=parse_sentences(os.path.join(data_dir,"treebank_sents.txt"))
t_words_freq,t_max_len,t_num_records=parse_sentences(os.path.join(data_dir,"treebank_poss.txt"))
print(len(s_words_freq),s_max_len,s_num_records)
print(len(t_words_freq),t_max_len,t_num_records)

10947 249 3914
45 249 3914


## from the above result, we can set max_line length=250 and set sentences vocabulary to 5000 and set tags vocabulary to 45

In [16]:
max_seqlen=250
s_max_feature=5000
t_max_feature=45

In [17]:
s_vocabsize=min(len(s_words_freq),s_max_feature)+2
t_vocabsize=t_max_feature+1

## generate word2index  and index2word for 5000 most common words in sentences

In [18]:
s_word2index={x[0]:i+2 for i,x in enumerate(s_words_freq.most_common(s_max_feature))}
s_word2index['PAD']=0
s_word2index['UNK']=1
s_index2word={v:k for k,v  in s_word2index.items()}
s_index2word

{0: 'PAD',
 1: 'UNK',
 2: ',',
 3: 'the',
 4: '.',
 5: 'of',
 6: 'to',
 7: 'a',
 8: 'in',
 9: 'and',
 10: "'s",
 11: 'for',
 12: 'that',
 13: '$',
 14: '``',
 15: "''",
 16: 'is',
 17: 'said',
 18: 'it',
 19: 'on',
 20: '%',
 21: 'by',
 22: 'at',
 23: 'as',
 24: 'with',
 25: 'from',
 26: 'million',
 27: 'mr.',
 28: 'are',
 29: 'was',
 30: 'be',
 31: 'its',
 32: 'has',
 33: 'an',
 34: 'new',
 35: "n't",
 36: 'have',
 37: 'but',
 38: 'he',
 39: 'or',
 40: 'will',
 41: 'they',
 42: 'company',
 43: '--',
 44: 'which',
 45: 'this',
 46: 'u.s.',
 47: 'says',
 48: 'year',
 49: 'about',
 50: 'would',
 51: 'more',
 52: 'were',
 53: 'market',
 54: 'their',
 55: 'than',
 56: 'stock',
 57: ';',
 58: 'trading',
 59: 'who',
 60: 'had',
 61: 'also',
 62: 'president',
 63: 'billion',
 64: 'up',
 65: 'one',
 66: 'been',
 67: 'some',
 68: ':',
 69: 'program',
 70: 'not',
 71: 'other',
 72: 'his',
 73: 'because',
 74: 'if',
 75: 'could',
 76: 'share',
 77: 'corp.',
 78: 'all',
 79: 'years',
 80: 'i',
 81

## generate word2index  and index2word for tags

In [19]:
t_word2index={x[0]:i for i,x in enumerate(t_words_freq.most_common(t_max_feature))}
t_word2index['PAD']=0
t_index2word={v:k for k,v in t_word2index.items()}
t_index2word

{0: 'PAD',
 1: 'in',
 2: 'nnp',
 3: 'dt',
 4: 'nns',
 5: 'jj',
 6: ',',
 7: '.',
 8: 'cd',
 9: 'vbd',
 10: 'rb',
 11: 'vb',
 12: 'cc',
 13: 'to',
 14: 'vbn',
 15: 'vbz',
 16: 'prp',
 17: 'vbg',
 18: 'vbp',
 19: 'md',
 20: 'pos',
 21: 'prp$',
 22: '$',
 23: '``',
 24: "''",
 25: ':',
 26: 'wdt',
 27: 'jjr',
 28: 'nnps',
 29: 'wp',
 30: 'rp',
 31: 'jjs',
 32: 'wrb',
 33: 'rbr',
 34: '-rrb-',
 35: '-lrb-',
 36: 'ex',
 37: 'rbs',
 38: 'pdt',
 39: '#',
 40: 'wp$',
 41: 'ls',
 42: 'fw',
 43: 'uh',
 44: 'sym'}

## now create a dataset to train


### X is going to contain rows of sentences and each row contains sequence of word index
### Y is going to contain rows of sentences and each row contains sequence of tags index in one hot vector

In [20]:
from keras.preprocessing import sequence
from keras.utils import np_utils
import numpy as np
def  build_tensor(filename,numrecords,word2index,maxlen,make_category=False,num_classes=0):
    data=np.empty((numrecords),dtype=list)
    fin=open(filename,'r')
    i=0
    for line in fin:
        words=line.strip().lower().split()
        wids=[]
        for word in words:
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index['UNK'])
        #print(wids)
        if make_category:
            data[i]=np_utils.to_categorical(wids,num_classes=num_classes)
        else:
            data[i]=wids
        i+=1
    fin.close()
    pdata=sequence.pad_sequences(data,maxlen=maxlen)
    return pdata

In [21]:
X = build_tensor(os.path.join(data_dir,"treebank_sents.txt"),s_num_records,s_word2index,max_seqlen)

In [22]:
Y =build_tensor(os.path.join(data_dir,"treebank_poss.txt"),t_num_records,t_word2index,max_seqlen,True,t_vocabsize)

In [23]:
X[0],Y[0]

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

## preparing test train module 

In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

## preparing our model

In [25]:
from keras.layers.core import Activation, Dense, Dropout, RepeatVector,SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential

## Training Parameters

In [26]:
EMBED_SIZE = 128
HIDDEN_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 1
print(max_seqlen,t_vocabsize)
NUM_OF_ITERATION=1

250 46


## Training and Evaluating the model and then testing it

In [27]:
model = Sequential()
model.add(Embedding(s_vocabsize, EMBED_SIZE,
input_length=max_seqlen))
model.add(SpatialDropout1D(0.2))
model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(max_seqlen))
model.add(GRU(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))
#model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam",
metrics=["accuracy"])

for i in range(NUM_OF_ITERATION):
    print("Iteration: ",i+1)
    model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,validation_data=[X_test,Y_test])
    score, acc = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE)
    print("Test score: %.3f, accuracy: %.3f" % (score, acc))
    #testing
    #generating random seed
    index=np.random.randint(s_num_records*0.8)
    pred_text=[s_index2word[i] for i in X_train[index]]
    test_pred=X_train[index].reshape(1,250)
    #print(Y_test[index].shape)
    pred=model.predict(test_pred,verbose=0)[0]
    labels=list()
    for tags in pred:
        max_index=np.argmax(tags)
        labels.append(t_index2word[max_index])
    print(pred_text,'\n\n',labels)

Iteration:  1
Train on 3131 samples, validate on 783 samples
Epoch 1/1
Test score: 0.293, accuracy: 0.909
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD