In [82]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional,LSTM,Dense,Embedding,Dropout
from time import time
from tensorflow.python.keras.callbacks import TensorBoard
from evaluation import precision_recall_f1

In [4]:
def read_data(file_path):
    tokens = []
    tags = []

    tweet_tokens = []
    tweet_tags = []
    for line in open(file_path, encoding='utf-8'):
        line = line.strip()
        if not line:
            if tweet_tokens:
                tokens.append(tweet_tokens)
                tags.append(tweet_tags)
            tweet_tokens = []
            tweet_tags = []
        else: 
            token,tag = line.split()
            if(token.startswith('http')):
                token = token.replace(token,'<URL>')
            if(token.startswith('@')):
                token = token.replace(token,'<USR>')

            tweet_tokens.append(token)
            tweet_tags.append(tag)
    return tokens,tags


In [5]:
train_token,train_tag = read_data('data/train.txt')
test_token,test_tag = read_data('data/test.txt')
val_token,val_tag = read_data('data/validation.txt')

In [69]:
for i in range(1):
    for token,tag in zip(train_token[i],train_tag[i]):
        print('%s\t%s'%(token,tag))

RT	O
<USR>	O
:	O
Online	O
ticket	O
sales	O
for	O
Ghostland	B-musicartist
Observatory	I-musicartist
extended	O
until	O
6	O
PM	O
EST	O
due	O
to	O
high	O
demand	O
.	O
Get	O
them	O
before	O
they	O
sell	O
out	O
...	O


In [7]:
tokenizer = Tokenizer(oov_token = '<UNK>')
tokenizer.fit_on_texts(train_token+val_token)

In [8]:
word_index = tokenizer.word_index

In [9]:
def token_to_seq(list_file):
    ret_val = []
    for s in list_file:
        helper = []
        for w in s:
            helper.append(w)
        ret_val.append(' '.join(helper))
    return ret_val

In [10]:
train_seq = token_to_seq(train_token)
val_seq = token_to_seq(val_token)
test_seq = token_to_seq(test_token)

In [11]:
maxlen = np.max([len(w) for w in train_seq+val_seq])

In [12]:
train_seq = tokenizer.texts_to_sequences(train_seq)
val_seq = tokenizer.texts_to_sequences(val_seq)
# train_seq = tokenizer.texts_to_sequences(train_seq)

In [13]:
train_seq = pad_sequences(train_seq,maxlen = maxlen,padding='post')
val_seq = pad_sequences(val_seq,maxlen = maxlen,padding='post')
# test_seq = pad_sequences(test_seq,maxlen = maxlen,padding='post')

In [14]:
total_words = len(word_index) + 1
####### input sequence done ##### now go for output

In [15]:
def tag_to_seq(tag_file):
    tag = []
    for s in tag_file:
        helper = []
        for w in s:
            if(w.startswith('B') or w.startswith('I')):
                w = 1
            else:
                w = 0
            helper.append(w)
        tag.append(helper)
    return tag

In [16]:
train_tag_seq = tag_to_seq(train_tag)
val_tag_seq = tag_to_seq(val_tag)
# train_tag_seq = tag_to_seq(train_tag)val_seq
# max_tag_len = np.max([len(w) for w in train_tag_seq+val_tag_seq])
train_tag_seq = pad_sequences(train_tag_seq,maxlen = maxlen,padding = 'post')
val_tag_seq = pad_sequences(val_tag_seq,maxlen = maxlen,padding = 'post')


In [37]:
def model():
    model = Sequential()
    model.add(Embedding(total_words,200,input_length = maxlen))
    model.add(Bidirectional(LSTM(200)))
    model.add(Dropout(0.5))
    model.add(Dense(maxlen,activation='softmax'))    
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
    return model

In [38]:
model = model()
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 182, 200)          3386600   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 400)               641600    
_________________________________________________________________
dropout_4 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 182)               72982     
Total params: 4,101,182
Trainable params: 4,101,182
Non-trainable params: 0
_________________________________________________________________


In [39]:
h = model.fit(train_seq,train_tag_seq,validation_data=(val_seq,val_tag_seq),batch_size=32,epochs=4,callbacks = [tensorboard])

Train on 5795 samples, validate on 724 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [20]:
test_seq = tokenizer.texts_to_sequences(test_seq)
test_seq = pad_sequences(test_seq,maxlen = maxlen,padding = 'post')
test_tag_seq = tag_to_seq(test_tag)
test_tag_seq = pad_sequences(test_tag_seq,maxlen = maxlen,padding='post')

In [21]:
test_predict = model.predict(test_seq)

In [26]:
loss,acc = model.evaluate(test_seq,test_tag_seq,verbose=1)



In [30]:
from time import time
from tensorflow.python.keras.callbacks import TensorBoard

In [70]:
my_seq = np.array(['hey satya how are you, btw sishrut told me to say hi to you'])

In [71]:
my_seq = np.array(tokenizer.texts_to_sequences(my_seq))

In [72]:
my_seq

array([[ 370,    1,  117,   59,   16, 2448,    1,  585,   40,    8,  178,
         534,    8,   16]])

In [73]:
my_seq = pad_sequences(my_seq,maxlen=maxlen,padding='post')

In [74]:
my_Seq_val = model.predict(my_seq)