In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df=pd.read_excel('data\\news.xlsx')

In [3]:
df.head()

Unnamed: 0,Headline,Short,Source,Time,Publish Date
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...,The New Indian Express,09:25:00,2017-03-26
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...,Outlook,22:18:00,2017-03-25
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a...",Hindustan Times,23:39:00,2017-03-25
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...,Livemint,23:08:00,2017-03-25
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...,YouTube,23:24:00,2017-03-25


In [4]:
df=df[['Headline','Short']]
df.head()

Unnamed: 0,Headline,Short
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a..."
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...


In [5]:
df.shape

(55104, 2)

In [23]:
text=df['Short']
summary=df['Headline']
text

0        The CBI on Saturday booked four former officia...
1        Chief Justice JS Khehar has said the Supreme C...
2        At least three people were killed, including a...
3        Mukesh Ambani-led Reliance Industries (RIL) wa...
4        TV news anchor Arnab Goswami has said he was t...
                               ...                        
55099    Tracking weak cues from the Asian markets, the...
55100    Amid growing concerns about China&#39;s econom...
55101    Pakistani Ghazal singer Ghulam Ali will soon m...
55102    The Islamic State (IS) has acknowledged the de...
55103    UK-based oil firm Cairn Energy on Tuesday said...
Name: Short, Length: 55104, dtype: object

In [24]:
summary=summary.apply(lambda x:'<start>' + x + '<stop>')
summary.head()

0    <start>4 ex-bank officials booked for cheating...
1    <start>Supreme Court to go paperless in 6 mont...
2    <start>At least 3 killed, 30 injured in blast ...
3    <start>Why has Reliance been barred from tradi...
4    <start>Was stopped from entering my own studio...
Name: Headline, dtype: object

In [25]:
summary[0]

'<start>4 ex-bank officials booked for cheating bank of ₹209 crore<stop>'

Filtering and Tokenizing

In [32]:
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

In [33]:
doc_tokenizer=Tokenizer(oov_token='<oov>')
doc_tokenizer.fit_on_texts(text)
text_tokens=doc_tokenizer.texts_to_sequences(text)
print(text_tokens[0])
summary_tokenizer=Tokenizer(filters=filters,oov_token='<oov>')
summary_tokenizer.fit_on_texts(summary)
summary_tokens=summary_tokenizer.texts_to_sequences(summary)
print(summary_tokens[0])

[2, 1153, 9, 116, 1912, 152, 109, 171, 6, 11881, 180, 8, 209, 326, 12, 3907, 17737, 1467, 3712, 8, 1913, 28258, 55, 735, 3, 2, 61, 295, 180, 2, 236, 35, 11882, 242, 1626, 8, 1809, 20, 11881, 180, 9, 2, 1452, 6, 7463, 8, 13762, 1504, 293, 863, 39, 17738, 3738, 3, 2, 458, 914, 17, 2, 236, 1712]
[565, 199, 146, 968, 605, 6, 2518, 146, 7, 17635, 234]


In [44]:
inputs=text_tokens
targets=summary_tokens

Analysis

In [59]:
encoder_vocab_size = len(doc_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1
encoder_vocab_size, decoder_vocab_size

(76362, 44069)

In [45]:
document_lengths = pd.Series([len(x) for x in text])
summary_lengths=pd.Series([len(x) for x in summary])
print(document_lengths.max())
print(summary_lengths.max())

469
97


In [49]:
print(document_lengths.describe())
print(summary_lengths.describe())

count    55104.000000
mean       368.003049
std         26.235510
min        280.000000
25%        350.000000
50%        369.000000
75%        387.000000
max        469.000000
dtype: float64
count    55104.000000
mean        64.620282
std          7.267463
min         21.000000
25%         60.000000
50%         64.000000
75%         70.000000
max         97.000000
dtype: float64


In [53]:
encoder_maxlen=document_lengths.max()
decoder_maxlen=summary_lengths.max()
encoder_maxlen,decoder_maxlen

(469, 97)

Setting Requirements

In [55]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=encoder_maxlen, padding='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=decoder_maxlen, padding='post')

In [60]:
buffer_size=20000
batch_size=64

In [61]:
dataset=tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(buffer_size).batch(batch_size)

Positional Encoding

In [92]:
def positional_encoding(pos,i,d_model): # pos - max_sequence
    d_model=6
    pos=10
    i = tf.range(0, d_model, 2, dtype=tf.float32)
    denominator = tf.pow(10000, i / d_model)
    pos = tf.expand_dims(tf.range(pos, dtype=tf.float32), axis=1)
    even_PE = tf.sin(pos / denominator)
    # even_PE.shape
    odd_PE = tf.cos(pos / denominator)
    # odd_PE
    stacked = tf.stack([even_PE, odd_PE], axis=2)
    PE = tf.reshape(stacked, shape=(-1, d_model))
    return PE
# # Testing the function
# # result = positional_encoding(10, 0, 6)
# print(result.shape)
# print(result)
# positional_encoding(10,0,6)

Masking

In [1]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

In [2]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

Scaled Dot product

In [16]:
def scaled_dot_product(q,k,v,mask):
    matmul_qk=tf.matmul(q,k,transpose_b=True)  #perform q.kT
    dk = tf.cast(tf.shape(k)[-1], tf.float32)  #size of k in float
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) #axis=-1 for last dimension

    output = tf.matmul(attention_weights, v)
    return output, attention_weights

Multi Head Attention

In [None]:
# class MultiHeadAttention(tf.keras.layers.Layer):
#     def __init__(self,d_model,num_heads):
#         super(MultiHeadAttention).__init__() #class inherited using super function
#         self.d_model=d_model
#         self.num_heads=num_heads
#         assert d_model % num_heads
#         self.depth=d_model//num_heads
#         #create layers for query key and value
#         self.wq=tf.keras.layers.Dense(d_model) #query
#         self.wk=tf.keras.layers.Dense(d_model) #key
#         self.wv=tf.keras.layers.Dense(d_model) #value
#         self.dense=tf.keras.layers.Dense(d_model)  #output layer
#     def split_heads(self,x,batch_size):
#         self.x=tf.reshape(x,(batch_size,-1,self.num_heads,self.depth))#-1 such that size remains constant
#         return tf.transpose(x,perm=[0,2,1,3])
#     def call(self,v,k,q,mask):
#         q=self.wq(q)
#         k=self.wk(k)
#         v=self.wv(v)
#         q=self.split_heads(q,batch_size)
#         k=self.split_heads(k,batch_size)
#         v=self.split_heads(v,batch_size)
#         scaled_attention , attention_weights= scaled_dot_product(q,k,v,mask)
#         scaled_attention=tf.transpose()

In [None]:
class Encoder(tf.keras.layers.Layer):
    def __init__(x):
        super(Encoder,self).__init__()