In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import numpy as np

# Request the two splits by name: unpacking `.values()` of the dict returned
# by tfds.load silently depends on the iteration order of that dict.
ds_train, ds_test = tfds.load('ag_news_subset', split=['train', 'test'])

In [None]:
batch_size = 16  # minibatch size passed to Dataset.batch() when training the AG News models
embed_size = 64  # output dimension of the Embedding layers
rnn_size = 16  # number of units in each SimpleRNN / LSTM layer

In [None]:
# Upper bound on the vectorizer's vocabulary (passed as max_tokens) and the
# input dimension of the Embedding layers that follow it.
vocab_size = 20000

# Layer that converts raw strings into sequences of integer token ids.
# It is placed as the first layer of the Sequential models below.
vectorizer = keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=vocab_size,
    input_shape=(1,))

@tf.function
def extract_title(x):
    """Return the 'title' field of the dataset record `x`."""
    return x['title']

@tf.function
def extract_text(x):
    """Return the 'title' and 'description' fields of `x` joined by a single space."""
    return x['title']+' '+x['description']


@tf.function
def tupelize_title(x):
    """Return a `(title, label)` pair built from the dataset record `x`."""
    return (extract_title(x),x['label'])

@tf.function
def tupelize(x):
    """Return a `(text, label)` pair, where text is title + ' ' + description."""
    text = extract_text(x)
    return text, x['label']


print('Training vectorizer')
# Build the vocabulary from the combined title + description text of the
# first 10000 training records only.
vectorizer.adapt(ds_train.take(10000).map(extract_text))


Training vectorizer


In [None]:
# Sanity check: vectorize two sample strings and display the resulting token ids.
vectorizer(['Hello, world','I like to see this lesson'])

<tf.Tensor: shape=(2, 6), dtype=int64, numpy=
array([[   1,   60,    0,    0,    0,    0],
       [ 844, 1101,    2,  699,  199,    1]])>

In [None]:
# Simple RNN classifier that takes raw strings as input. The trailing comment
# on each layer gives its output shape (mb = minibatch size).
model = keras.models.Sequential([ # mb x 1
    vectorizer, # mb x max_len
    keras.layers.Embedding(vocab_size,embed_size, mask_zero=True), # mb x max_len x emb_size
    keras.layers.SimpleRNN(rnn_size), # mb x rnn_size
    keras.layers.Dense(4,activation='softmax') # mb x 4
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, None, 64)          1280000   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 16)                1296      
                                                                 
 dense_1 (Dense)             (None, 4)                 68        
                                                                 
Total params: 1,281,364
Trainable params: 1,281,364
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The sparse loss is used because x['label'] is fed to the model as-is
# (no one-hot encoding is applied in tupelize_title).
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['acc'], 
    optimizer='adam')
# Train on (title, label) pairs; no `epochs` argument is given, so the Keras default applies.
model.fit(
    ds_train.map(tupelize_title).batch(batch_size),
    validation_data=ds_test.map(tupelize_title).batch(batch_size))



<keras.callbacks.History at 0x7f36d076ae30>

In [None]:
# Apply only the second layer of the model (the Embedding layer) to two
# vectorized sample strings and display the shape of the result.
res = model.layers[1](vectorizer(['Hello','I like keras']))
res.shape

TensorShape([2, 3, 64])

<tf.Tensor: shape=(64,), dtype=float32, numpy=
array([-9.38544050e-03, -7.55790714e-03, -5.17520607e-02, -4.94242124e-02,
       -5.28947376e-02,  1.39291193e-02, -1.81917904e-03,  5.85682355e-02,
        2.41537392e-02,  4.82625291e-02, -2.82874666e-02,  2.81700622e-02,
        1.20397501e-01, -1.02029713e-02, -1.01893052e-01,  4.79868473e-03,
        6.83888374e-03, -1.57974325e-02, -6.52266620e-03,  8.75837170e-03,
       -1.81152765e-02,  2.22952403e-02,  1.65232457e-02,  1.64617912e-03,
       -6.42969161e-02,  4.22668420e-02,  1.15949390e-02, -2.46598874e-03,
        2.52057984e-02, -8.43471363e-02, -4.92090039e-05,  2.67776269e-02,
        1.45339258e-02,  5.54866269e-02,  1.74197480e-02, -7.66643584e-02,
       -5.05497940e-02,  3.27403583e-02,  5.21041974e-02,  3.05327419e-02,
        4.19781283e-02,  2.44655740e-02,  1.04001105e-01,  7.61643276e-02,
        4.25256416e-02,  3.19033787e-02,  2.49435883e-02,  5.13586551e-02,
        3.23457345e-02,  2.67045274e-02,  6.43391535e

## LSTM

In [None]:
# Same classifier layout as the SimpleRNN model, with the recurrent layer
# replaced by an LSTM. Trained on (title, label) pairs.
model = keras.models.Sequential([ # mb x 1
    vectorizer, # mb x max_len
    keras.layers.Embedding(vocab_size,embed_size, mask_zero=True), # mb x max_len x emb_size
    keras.layers.LSTM(rnn_size), # mb x rnn_size
    keras.layers.Dense(4,activation='softmax') # mb x 4
])
model.summary()
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['acc'], 
    optimizer='adam')
model.fit(
    ds_train.map(tupelize_title).batch(batch_size),
    validation_data=ds_test.map(tupelize_title).batch(batch_size))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_2 (Embedding)     (None, None, 64)          1280000   
                                                                 
 lstm (LSTM)                 (None, 16)                5184      
                                                                 
 dense_2 (Dense)             (None, 4)                 68        
                                                                 
Total params: 1,285,252
Trainable params: 1,285,252
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x7f36d094cfd0>

In [None]:
# Two stacked LSTM layers. The first one sets return_sequences=True so that it
# emits an output for every timestep, which the second LSTM consumes.
model = keras.models.Sequential([ # mb x 1
    vectorizer, # mb x max_len
    keras.layers.Embedding(vocab_size,embed_size, mask_zero=True), # mb x max_len x emb_size
    keras.layers.LSTM(rnn_size,return_sequences=True), # mb x max_len x rnn_size
    keras.layers.LSTM(rnn_size), # mb x rnn_size
    keras.layers.Dense(4,activation='softmax') # mb x 4
])
model.summary()
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['acc'], 
    optimizer='adam')
model.fit(
    ds_train.map(tupelize_title).batch(batch_size),
    validation_data=ds_test.map(tupelize_title).batch(batch_size))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_3 (Embedding)     (None, None, 64)          1280000   
                                                                 
 lstm_1 (LSTM)               (None, None, 16)          5184      
                                                                 
 lstm_2 (LSTM)               (None, 16)                2112      
                                                                 
 dense_3 (Dense)             (None, 4)                 68        
                                                                 
Total params: 1,287,364
Trainable params: 1,287,364
Non-trainable params: 0
____________________________________________

<keras.callbacks.History at 0x7f36d0a34f10>

In [None]:
# Single bidirectional LSTM layer; its output has rnn_size * 2 features
# (see the shape comment on the layer). Trained on (title, label) pairs.
model = keras.models.Sequential([ # mb x 1
    vectorizer, # mb x max_len
    keras.layers.Embedding(vocab_size,embed_size, mask_zero=True), # mb x max_len x emb_size
    keras.layers.Bidirectional(keras.layers.LSTM(rnn_size)), # mb x (rnn_size * 2)
    keras.layers.Dense(4,activation='softmax') # mb x 4
])
model.summary()
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['acc'], 
    optimizer='adam')
model.fit(
    ds_train.map(tupelize_title).batch(batch_size),
    validation_data=ds_test.map(tupelize_title).batch(batch_size))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_4 (Embedding)     (None, None, 64)          1280000   
                                                                 
 bidirectional (Bidirectiona  (None, 32)               10368     
 l)                                                              
                                                                 
 dense_4 (Dense)             (None, 4)                 132       
                                                                 
Total params: 1,290,500
Trainable params: 1,290,500
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x7f363ace5300>

In [None]:
# Two stacked bidirectional LSTM layers. Unlike the previous models, this one
# is trained with `tupelize`, i.e. on title + description rather than titles only.
model = keras.models.Sequential([ # mb x 1
    vectorizer, # mb x max_len
    keras.layers.Embedding(vocab_size,embed_size, mask_zero=True), # mb x max_len x emb_size
    keras.layers.Bidirectional(keras.layers.LSTM(rnn_size,return_sequences=True)), # mb x max_len x (rnn_size * 2)
    keras.layers.Bidirectional(keras.layers.LSTM(rnn_size)), # mb x (rnn_size * 2)
    keras.layers.Dense(4,activation='softmax') # mb x 4
])
model.summary()
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['acc'], 
    optimizer='adam')
model.fit(
    ds_train.map(tupelize).batch(batch_size),
    validation_data=ds_test.map(tupelize).batch(batch_size))

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, None)             0         
 ectorization)                                                   
                                                                 
 embedding_5 (Embedding)     (None, None, 64)          1280000   
                                                                 
 bidirectional_1 (Bidirectio  (None, None, 32)         10368     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 32)               6272      
 nal)                                                            
                                                                 
 dense_5 (Dense)             (None, 4)                 132       
                                                      

<keras.callbacks.History at 0x7f36407e9000>

## NER

In [None]:
import pandas as pd
import numpy as np

# Load the NER corpus from a zipped CSV in the working directory and preview its first rows.
df = pd.read_csv('ner_dataset.zip',encoding='unicode-escape',compression='zip')
df.head(15)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


In [None]:
# Distinct values of the Tag column; these are the classes the NER model predicts.
tags = df.Tag.unique()
tags

array(['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
       'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
       'I-eve', 'I-nat'], dtype=object)

In [None]:
# Two-way lookup between tag strings and integer ids; ids follow the order of `tags`.
id2tag = {idx: tag for idx, tag in enumerate(tags)}
tag2id = {tag: idx for idx, tag in id2tag.items()}

id2tag[0]

'O'

In [None]:
# Lower-cased vocabulary of the corpus. Ids 0 and 1 are reserved for the
# padding and unknown-word markers, so real words start at id 2.
vocab = set(df['Word'].str.lower())
# Enumerate the words in sorted order. Iterating a set of strings directly
# gives an order that changes from one interpreter run to the next (string
# hash randomisation), which would make the word ids irreproducible.
id2word = { i+2 : v for i,v in enumerate(sorted(vocab)) }
id2word[0] = '<PAD>'
id2word[1] = '<UNK>'
# Added after the enumeration so the markers keep their reserved ids 0 and 1.
vocab.add('<UNK>')
vocab.add('<PAD>')
word2id = { v : k for k,v in id2word.items() }
vocab_size = len(id2word)

In [None]:
# Number of word ids, including the <PAD> and <UNK> markers.
vocab_size

31819

In [None]:
# 'Sentence #' is filled in only on the first token of each sentence; carry
# that label forward to the remaining tokens. Series.ffill() is used because
# the fillna(method='ffill') spelling is deprecated in pandas.
df['Sent'] = df['Sentence #'].ffill()
df

Unnamed: 0,Sentence #,Word,POS,Tag,Sent
0,Sentence: 1,Thousands,NNS,O,Sentence: 1
1,,of,IN,O,Sentence: 1
2,,demonstrators,NNS,O,Sentence: 1
3,,have,VBP,O,Sentence: 1
4,,marched,VBN,O,Sentence: 1
...,...,...,...,...,...
1048570,,they,PRP,O,Sentence: 47959
1048571,,responded,VBD,O,Sentence: 47959
1048572,,to,TO,O,Sentence: 47959
1048573,,the,DT,O,Sentence: 47959


In [None]:
# Collect the words and the tags of each sentence into Python lists.
# Aggregating with `list` yields a list for every group, including
# one-token sentences; an identity aggregation would leave those as bare
# strings, which are then iterated character by character downstream.
ddf = df.groupby('Sent').agg({ 'Word' : list, 'Tag' : list }).reset_index()
ddf

Unnamed: 0,Sent,Word,Tag
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."
...,...,...,...
47954,Sentence: 9995,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O..."
47955,Sentence: 9996,"[On, Thursday, ,, Iranian, state, media, publi...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B..."
47956,Sentence: 9997,"[Following, Iran, 's, disputed, June, 12, elec...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ..."
47957,Sentence: 9998,"[Since, then, ,, authorities, have, held, publ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [None]:
# Map every token to its word id. A token missing from the vocabulary is
# mapped to <UNK> rather than dropped, so WID keeps one entry per token and
# stays aligned with TID.
ddf['WID'] = ddf['Word'].apply(lambda x : [ word2id.get(t.lower(), word2id['<UNK>']) for t in x ])
# Map every tag string to its integer id.
ddf['TID'] = ddf['Tag'].apply(lambda x : [ tag2id[t] for t in x])
ddf

Unnamed: 0,Sent,Word,Tag,WID,TID
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[5724, 9545, 29799, 27757, 16121, 6373, 23813,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[24629, 21485, 27165, 28505, 15745, 16842, 120...","[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[24529, 2294, 26259, 24766, 11366, 22063, 1879...","[0, 0, 7, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 5, ..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]","[28505, 15101, 4453, 1941, 11868, 22448, 6219,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[1685, 19933, 22249, 23544, 2467, 2479, 15351,...","[1, 0, 0, 3, 10, 0, 7, 0, 1, 0, 2, 0, 2, 0, 0,..."
...,...,...,...,...,...
47954,Sentence: 9995,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O...","[2840, 23526, 30261, 29175, 28496, 3784, 2479,...","[0, 0, 0, 3, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
47955,Sentence: 9996,"[On, Thursday, ,, Iranian, state, media, publi...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B...","[11942, 27326, 16870, 24629, 30462, 28476, 222...","[0, 7, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, ..."
47956,Sentence: 9997,"[Following, Iran, 's, disputed, June, 12, elec...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ...","[29379, 18742, 9974, 9266, 16835, 30618, 10751...","[0, 1, 0, 0, 7, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
47957,Sentence: 9998,"[Since, then, ,, authorities, have, held, publ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[5668, 13256, 16870, 28861, 27757, 27716, 2094...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Show the sentences whose word-id and tag-id lists differ in length.
# Lengths are compared column-wise; indexing each row as x[0] / x[1] is a
# positional lookup on a label-indexed Series, which pandas has deprecated.
ddf[ddf['WID'].apply(len) != ddf['TID'].apply(len)]

Unnamed: 0,Sent,Word,Tag,WID,TID
32131,Sentence: 38917,...,O,"[4941, 4941, 4941]",[0]
46197,Sentence: 8412,The,O,"[31184, 1198]",[0]


In [None]:
# Number of word ids per sentence.
ddf['length'] = ddf['WID'].apply(len)
# Keep only sentences with exactly one tag id per word id, shortest first.
# The comparison is column-wise and label-based (no positional x[0] / x[1]
# lookups on the row Series, which pandas has deprecated).
ddfx = ddf[ddf['length'] == ddf['TID'].apply(len)].sort_values(by='length')
ddfx

Unnamed: 0,Sent,Word,Tag,WID,TID,length
42195,Sentence: 4810,"[Janice, Karpinski]","[B-per, I-per]","[27799, 6838]","[3, 10]",2
6612,Sentence: 1595,"[John, Garang]","[B-per, I-per]","[9307, 27337]","[3, 10]",2
33612,Sentence: 40249,"[John, Garang]","[B-per, I-per]","[9307, 27337]","[3, 10]",2
33194,Sentence: 39874,"[Bermet, Akayeva]","[B-per, I-per]","[22998, 10170]","[3, 10]",2
30105,Sentence: 37093,"[Questions, ?]","[O, O]","[21096, 18141]","[0, 0]",2
...,...,...,...,...,...,...
40598,Sentence: 46537,"[Renewed, activity, in, the, mining, sector, ,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[14842, 1877, 18797, 6681, 26898, 20537, 16870...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",70
12410,Sentence: 21167,"[The, government, has, encouraged, exporters, ...","[O, O, O, O, O, O, O, O, O, O, O, O, B-geo, I-...","[6681, 31055, 3784, 29832, 3693, 16842, 13339,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, ...",72
33506,Sentence: 40153,"[According, to, the, U.S., Department, of, Com...","[O, O, O, B-org, I-org, I-org, I-org, O, B-geo...","[14290, 16842, 6681, 23323, 3275, 9545, 15390,...","[0, 0, 0, 5, 6, 6, 6, 0, 1, 0, 0, 0, 7, 12, 12...",73
26092,Sentence: 33481,"[On, his, departure, the, Farmer, grievously, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[11942, 4413, 18399, 6681, 23644, 7141, 14822,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",81


In [None]:
# Per-sentence constant tensors built from the word-id and tag-id lists.
ddfx['WIDnp'] = ddfx['WID'].apply(tf.constant)
ddfx['TIDnp'] = ddfx['TID'].apply(tf.constant)

In [None]:
# NOTE(review): maxlen is assigned here but is not passed to pad_sequences
# below, so the padded width is whatever pad_sequences picks by default —
# confirm whether maxlen=maxlen was intended.
maxlen = 104
# Pad the per-sentence word-id and tag-id lists into rectangular arrays.
X = keras.preprocessing.sequence.pad_sequences(ddfx['WID'])
Y = keras.preprocessing.sequence.pad_sequences(ddfx['TID'])

In [None]:
# Token-level tagger: every timestep gets its own softmax over the tag set.
num_classes = len(tags)
model = keras.models.Sequential([ # mb x max_len
    keras.layers.Embedding(vocab_size,300,mask_zero=True), # mb x max_len x 300
    keras.layers.Bidirectional(keras.layers.LSTM(100,return_sequences=True)), # mb x max_len x 200 (two directions of 100 units)
    keras.layers.Bidirectional(keras.layers.LSTM(100,return_sequences=True)), # mb x max_len x 200 (two directions of 100 units)
    keras.layers.TimeDistributed(keras.layers.Dense(num_classes,activation='softmax')) # mb x max_len x num_classes
])
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['acc'])
model.fit(X,Y,shuffle=True)



<keras.callbacks.History at 0x7f35b2a5a680>

In [None]:
sent = 'John Smith went to Paris to attend a conference in cancer development institute'
# The vocabulary is lower-cased, so lower-case the query before the lookup.
words = sent.lower().split()
# Words that are not in the vocabulary fall back to the <UNK> id instead of
# raising KeyError.
v = [word2id.get(x, word2id['<UNK>']) for x in words]
# Pass the single sentence as an explicit (1, len(words)) array, then take
# the predictions for that one sentence.
res = model.predict(np.array([v]))[0]



In [None]:
# Pair every input word with the tag whose predicted score is highest.
for w,c in zip(words,res.argmax(axis=1)):
  print(f"{w} -> {id2tag[c]}")

john -> B-per
smith -> I-per
went -> O
to -> O
paris -> B-geo
to -> O
attend -> O
a -> O
conference -> O
in -> O
cancer -> O
development -> O
institute -> O


## Generative RNN

In [None]:
# NOTE(review): redefines extract_text from an earlier cell, here without @tf.function.
def extract_text(x):
    """Return the 'title' and 'description' fields of `x` joined by a single space."""
    return x['title']+' '+x['description']

# NOTE(review): redefines tupelize from an earlier cell, here without @tf.function.
def tupelize(x):
    """Return a `(text, label)` pair, where text comes from `extract_text`."""
    return (extract_text(x),x['label'])

# Character-level, case-preserving tokenizer.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True,lower=False)
# Fit on the decoded titles of the whole training set (iterates ds_train once).
tokenizer.fit_on_texts([x['title'].numpy().decode('utf-8') for x in ds_train])

In [None]:
# Reserve one extra id, one past the number of entries in word_index, for the
# end-of-sequence marker and register it in the tokenizer.
eos_token = len(tokenizer.word_index)+1
tokenizer.word_index['<eos>'] = eos_token
# One more than the largest id, so one-hot vectors of this depth cover ids 0..eos_token.
vocab_size = eos_token + 1

In [None]:
# Sanity check: encode a sample string as a list of character ids.
tokenizer.texts_to_sequences(['Hello, world!'])

[[48, 2, 10, 10, 5, 44, 1, 25, 5, 8, 10, 13, 78]]

In [None]:
def title_batch(x):
    """Turn a batch of title strings into one-hot (input, target) tensors.

    Args:
        x: iterable of scalar string tensors (the titles of one batch).

    Returns:
        A pair `(inputs, targets)`, each of shape
        (batch, padded_len, vocab_size). `targets` is the id sequence shifted
        left by one position with `eos_token` appended as the last step.
    """
    x = [t.numpy().decode('utf-8') for t in x]
    z = tokenizer.texts_to_sequences(x)
    z = tf.keras.preprocessing.sequence.pad_sequences(z)
    # Id 0 is the value pad_sequences pads with. tf.one_hot(0, ...) is NOT an
    # all-zero vector, so a Masking layer (which skips timesteps whose
    # features are all zero) would never treat padding as masked. Zero the
    # padded rows explicitly so that masking actually takes effect.
    not_pad = tf.cast(z != 0, tf.float32)[..., tf.newaxis]
    inputs = tf.one_hot(z, vocab_size) * not_pad
    # Next-character targets: drop the first id and append <eos> at the end.
    shifted = tf.concat([z[:,1:], tf.constant(eos_token,shape=(len(z),1))], axis=1)
    return inputs, tf.one_hot(shifted, vocab_size)

def title_batch_fn(x):
    """Dataset.map-compatible wrapper that runs `title_batch` through tf.py_function.

    Only the 'title' field of the record `x` is used; the two float32
    tensors produced by `title_batch` are returned as a pair.
    """
    titles = x['title']
    inputs, targets = tf.py_function(
        title_batch,
        inp=[titles],
        Tout=(tf.float32, tf.float32),
    )
    return inputs, targets



In [None]:
# Character-level language model: for every timestep of a one-hot encoded
# sequence, output a softmax distribution over the vocab_size token ids.
model = keras.models.Sequential([
    keras.layers.Masking(input_shape=(None,vocab_size)),
    keras.layers.LSTM(128,return_sequences=True,input_shape=(None,vocab_size)),
    keras.layers.Dense(vocab_size,activation='softmax')
])

model.summary()
# Targets are one-hot encoded by title_batch, hence the non-sparse loss.
# No optimizer is passed, so the Keras default is used.
model.compile(loss='categorical_crossentropy')

# Train on batches of 8 titles converted by title_batch_fn.
model.fit(ds_train.batch(8).map(title_batch_fn))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, None, 84)          0         
                                                                 
 lstm_1 (LSTM)               (None, None, 128)         109056    
                                                                 
 dense_1 (Dense)             (None, None, 84)          10836     
                                                                 
Total params: 119,892
Trainable params: 119,892
Non-trainable params: 0
_________________________________________________________________


TypeError: ignored

In [None]:
# Inverse of tokenizer.word_index: token id -> token string.
reverse_map = {val:key for key, val in tokenizer.word_index.items()}

def decode(x):
    """Convert a sequence of token ids back into a string.

    Ids that have no entry in `reverse_map` are skipped instead of raising
    KeyError. `reverse_map` is built from `tokenizer.word_index`, so any id
    the model can emit that is absent from that index (presumably the
    padding id 0) would otherwise abort decoding.
    """
    return ''.join(reverse_map.get(t, '') for t in x)

def generate(model,size=100,start='Today '):
    """Greedily generate text one character at a time.

    Args:
        model: callable that maps a (1, seq_len, vocab_size) one-hot tensor
            to per-timestep scores over the token ids.
        size: maximum number of characters to generate.
        start: seed text; generated characters are appended to it.

    Returns:
        The seed text followed by the generated characters. Generation stops
        early as soon as the model picks `eos_token`.
    """
    # One list is used both as the model input and as the result. Binding two
    # names to the same list and additionally extending the input with
    # `inp + [nc]` would feed the first generated character to the model twice.
    chars = tokenizer.texts_to_sequences([start])[0]
    for _ in range(size):
        # Scores for the position after the last character of the sequence.
        out = model(tf.expand_dims(tf.one_hot(chars,vocab_size),0))[0][-1]
        nc = int(tf.argmax(out).numpy())
        if nc==eos_token:
            break
        chars.append(nc)
    return decode(chars)
    
# Show a sample generated with the default seed text and length.
generate(model)

In [None]:
# Print a freshly generated sample at the end of every training epoch.
sampling_callback = keras.callbacks.LambdaCallback(
  on_epoch_end = lambda epoch, logs: print(generate(model))
)

model.fit(
    ds_train.batch(8).map(title_batch_fn),
    callbacks=[sampling_callback],
    epochs=3)

In [None]:
def generate_soft(model,size=100,start='Today ',temperature=1.0):
    """Generate text by sampling each next character with a temperature.

    Args:
        model: callable that maps a (1, seq_len, vocab_size) one-hot tensor
            to per-timestep probabilities over the token ids.
        size: maximum number of characters to generate.
        start: seed text; generated characters are appended to it.
        temperature: divisor applied to the log-probabilities before
            sampling; the result is renormalised to sum to 1.

    Returns:
        The seed text followed by the sampled characters. Generation stops
        early as soon as `eos_token` is sampled.
    """
    # One list is used both as the model input and as the result. Binding two
    # names to the same list and additionally extending the input with
    # `inp + [nc]` would feed the first sampled character to the model twice.
    chars = tokenizer.texts_to_sequences([start])[0]
    for _ in range(size):
        # Probabilities for the position after the last character of the sequence.
        out = model(tf.expand_dims(tf.one_hot(chars,vocab_size),0))[0][-1]
        # float64 and explicit renormalisation keep the probabilities summing
        # to 1 closely enough for np.random.multinomial.
        probs = tf.exp(tf.math.log(out)/temperature).numpy().astype(np.float64)
        probs = probs/np.sum(probs)
        nc = int(np.argmax(np.random.multinomial(1,probs,1)))
        if nc==eos_token:
            break
        chars.append(nc)
    return decode(chars)

# Seed texts used for the temperature comparison.
words = ['Today ','On Sunday ','Moscow, ','President ','Little red riding hood ']

# For each temperature, generate one sample per seed text.
for i in [0.3,0.8,1.0,1.3,1.8]:
    print(f"\n--- Temperature = {i}")
    for start in words:
        print(generate_soft(model,size=300,start=start,temperature=i))