In [None]:
!pip install pandas

import tensorflow as tf


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
import csv



In [None]:
# Strips accents: decompose each character (NFD), then drop the combining marks
def unicode_to_ascii(s):
  """Return `s` with every Unicode combining mark (category 'Mn') removed.

  The string is NFD-normalized first, so an accented letter becomes a base
  character followed by its combining marks; only the marks are discarded.
  Characters that have no such decomposition are kept as they are, so the
  result is not guaranteed to be pure ASCII.
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)


def preprocess_sentence(w, max_len=200):
  """Normalize raw text into a lowercase, space-delimited token string.

  Steps: cast to str, lowercase, strip accents, put spaces around
  ? . ! , ¿, replace every other run of non-letters with one space, keep at
  most `max_len` characters, and wrap the result in '<start>' / '<end>'.

  Args:
    w: the text to clean; non-string values go through str() first.
    max_len: maximum number of characters kept from the cleaned text (the
      start/end markers are not counted). The cut is made by character, so
      the last word may be truncated. None disables the cut.

  Returns:
    '<start> ' + cleaned text + ' <end>'.
  """
  w = str(w)

  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  # Strip again after the cut: when the cut lands right after a space the
  # text would otherwise end in whitespace, giving a double space before
  # '<end>' and an empty token for anything that splits on ' '.
  w = w.strip()[:max_len].strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  return '<start> ' + w + ' <end>'

In [None]:
sentence = "May I borrow this book?"
print(preprocess_sentence(sentence))

<start> may i borrow this book ? <end>


In [None]:
# 1. Remove the accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [Text, Title]
def create_dataset(path=None):
  """Load the news-summary CSV and return the cleaned text and title columns.

  Args:
    path: CSV file to read. Defaults to "news_summary.csv" inside the current
      working directory. The file is read as ISO-8859-1 and must provide
      `headlines` and `text` columns.

  Returns:
    The iterator produced by `zip(*pairs)`. It yields two tuples, all cleaned
    article texts first and all cleaned headlines second, so callers can write
    `text, title = create_dataset()`. For a CSV without rows it yields
    nothing and that unpacking raises ValueError.

  Side effects:
    Prints the number of rows read.
  """
  if path is None:
    path = os.path.join(os.getcwd(), "news_summary.csv")
  df = pd.read_csv(path, encoding="ISO-8859-1")

  print(len(df.headlines))

  # Each pair is [cleaned article text, cleaned headline].
  pairs = [[preprocess_sentence(text), preprocess_sentence(title)]
           for title, text in zip(df.headlines, df.text)]

  return zip(*pairs)

In [None]:
# NOTE(review): load_dataset() further down calls create_dataset() itself, so
# the CSV is read and cleaned a second time there; `text` and `title` from
# this cell are not referenced by any later code visible in this notebook.
text, title = create_dataset()

  if self.run_code(code, result):


4515


In [None]:
def tokenize(lang):
  """Fit a Keras tokenizer on `lang` and encode `lang` as a padded id matrix.

  Args:
    lang: the texts to fit on and to encode.

  Returns:
    (tensor, tokenizer): the post-padded sequences produced by
    pad_sequences, and the fitted Tokenizer (created with filters='' so no
    characters are filtered out).
  """
  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  tokenizer.fit_on_texts(lang)

  sequences = tokenizer.texts_to_sequences(lang)
  padded = tf.keras.preprocessing.sequence.pad_sequences(sequences,
                                                         padding='post')
  return padded, tokenizer

In [None]:
def load_dataset(num_examples=None):
  """Build the padded input/target tensors and their tokenizers.

  Args:
    num_examples: optional cap on how many (text, title) pairs are used,
      taken from the start of the dataset. None (the default) uses them all.

  Returns:
    (input_tensor, target_tensor, text_tokenizer, title_tokenizer), where the
    tensors and tokenizers are whatever tokenize() returns for the article
    texts and for the titles respectively.
  """
  # creating cleaned input, output pairs
  text, title = create_dataset()

  if num_examples is not None:
    text, title = text[:num_examples], title[:num_examples]

  input_tensor, text_tokenizer = tokenize(text)
  target_tensor, title_tokenizer = tokenize(title)

  return input_tensor, target_tensor, text_tokenizer, title_tokenizer

In [None]:
# Try experimenting with the size of that dataset
# num_examples = 100
# Loads the CSV, cleans it and builds the padded tensors plus both tokenizers.
input_tensor, target_tensor, text_tokenizer, title_tokenizer = load_dataset()

# Padded sequence lengths (second dimension) of the target and input tensors
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

  exec(code_obj, self.user_global_ns, self.user_ns)


4515


In [None]:
# Creating training and validation sets using an 80-20 split.
# random_state pins the shuffle: the training cell resumes from saved
# checkpoints, so an unseeded split would move validation rows into the
# training set every time the notebook is re-run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# Show length
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

3612 3612 903 903


In [None]:
def convert(lang, tensor):
  """Print an "id ----> word" line for every non-zero id in `tensor`.

  Args:
    lang: object exposing an `index_word` mapping from id to word.
    tensor: iterable of ids; entries equal to 0 are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print ("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [None]:
# Show the index -> word mapping of the first training example
# print ("index to word mapping")
convert(text_tokenizer, input_tensor_train[0])

3 ----> <start>
36 ----> new
24 ----> delhi
2 ----> ,
166 ----> jul
34 ----> pti
1 ----> the
553 ----> railway
325 ----> ministry
28 ----> today
2252 ----> cancelled
5 ----> a
9285 ----> caterers
3320 ----> contract
23 ----> after
5 ----> a
1072 ----> passenger
147 ----> found
5 ----> a
538 ----> dead
6312 ----> lizard
7 ----> in
22 ----> his
6313 ----> vegetable
6314 ----> biryani
1228 ----> served
11 ----> on
5 ----> a
24 ----> delhi
939 ----> bound
451 ----> train
27 ----> from
4722 ----> howrah
2 ----> ,
5 ----> a
325 ----> ministry
1103 ----> spoke
4 ----> <end>


### Create a tf.data dataset

In [None]:
# Shuffle buffer as large as the training set, so shuffling spans all examples
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 50
# Whole batches per epoch (the remainder is dropped by batch() below)
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
# Width of the embedding vectors and of the GRU state used by encoder/decoder
embedding_dim = 200
units = 128
# +1 leaves room for id 0, which the rest of the notebook treats as padding
vocab_inp_size = len(text_tokenizer.word_index)+1
vocab_tar_size = len(title_tokenizer.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows; the
# encoder's initial state and train_step both assume that fixed size
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Pull one batch so the shapes fed to the model can be inspected
example_input_batch, example_target_batch = next(iter(dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([50, 54]), TensorShape([50, 34]))

In [None]:
class Encoder(tf.keras.Model):
  """Embedding layer followed by one GRU that encodes a batch of id sequences."""

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    """Create the layers.

    Args:
      vocab_size: number of rows of the embedding table.
      embedding_dim: width of each embedding vector.
      enc_units: number of GRU units.
      batch_sz: batch size used by initialize_hidden_state().
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # Both the per-step outputs and the final state are requested because
    # call() returns the two of them.
    gru_options = dict(return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
    self.gru = tf.keras.layers.GRU(self.enc_units, **gru_options)

  def call(self, x, hidden):
    """Embed `x` and run the GRU from `hidden`; return (output, state)."""
    sequence_output, final_state = self.gru(self.embedding(x),
                                            initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [None]:
# The encoder instance created here is the one trained and checkpointed below
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

# sample input: run the example batch through the encoder from a zero state
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (50, 54, 128)
Encoder Hidden state shape: (batch size, units) (50, 128)


In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

  def __init__(self, units):
    """Create the projections; `units` is the width of W1 and W2."""
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context_vector, attention_weights) for `query` over `values`.

    Shapes as used in this notebook: query is (batch_size, hidden size) and
    values is (batch_size, max_len, hidden size).
    """
    # (batch_size, hidden size) -> (batch_size, 1, hidden size), so the
    # projected query broadcasts over the time axis of the projected values.
    query_with_time_axis = tf.expand_dims(query, 1)

    # (batch_size, max_length, units) before V, (batch_size, max_length, 1)
    # after it: one score per input position.
    projected = self.W1(query_with_time_axis) + self.W2(values)
    score = self.V(tf.nn.tanh(projected))

    # Normalize over the time axis; shape stays (batch_size, max_length, 1).
    attention_weights = tf.nn.softmax(score, axis=1)

    # Weighted sum of the values over time -> (batch_size, hidden_size).
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [None]:
# 24 only sizes the layer's internal W1/W2 projections; the context vector is
# a weighted sum of the encoder outputs and keeps their width.
attention_layer = BahdanauAttention(24)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (50, 128)
Attention weights shape: (batch_size, sequence_length, 1) (50, 54, 1)


In [None]:
class Decoder(tf.keras.Model):
  """One-step decoder: attention over the encoder output, a GRU, then a
  dense projection onto the target vocabulary."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    """Create the layers.

    Args:
      vocab_size: size of the target vocabulary (embedding rows and width of
        the output projection).
      embedding_dim: width of each embedding vector.
      dec_units: number of GRU units; also passed to the attention layer.
      batch_sz: stored on the instance; not used by call().
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    gru_options = dict(return_sequences=True,
                       return_state=True,
                       recurrent_initializer='glorot_uniform')
    self.gru = tf.keras.layers.GRU(self.dec_units, **gru_options)
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: ids of the current input tokens, (batch_size, 1) in this notebook.
      hidden: state used as the attention query.
      enc_output: encoder outputs, (batch_size, max_length, hidden_size).

    Returns:
      (logits, state, attention_weights), logits being (batch_size, vocab).
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # (batch_size, 1, embedding_dim)
    embedded = self.embedding(x)

    # (batch_size, 1, embedding_dim + hidden_size)
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded],
                          axis=-1)

    # NOTE(review): the GRU is called without initial_state, so `hidden`
    # reaches this step only through the attention query - confirm that this
    # is intended before changing it (saved checkpoints depend on it).
    output, state = self.gru(gru_input)

    # (batch_size * 1, hidden_size)
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    # (batch_size, vocab)
    logits = self.fc(flat_output)

    return logits, state, attention_weights

In [None]:
# The decoder instance created here is the one trained and checkpointed below
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Shape check only: the decoder input here is random values rather than real
# token ids, together with the encoder state/output from the sample batch.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (50, 8407)


In [None]:
optimizer = tf.keras.optimizers.Adam()
# from_logits=True: the decoder's final Dense layer has no activation.
# reduction='none' keeps one loss value per example so that loss_function
# can zero out the padded positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy of `pred` against `real`, with padded targets zeroed out.

  Positions where `real` equals 0 contribute a loss of 0. The result is the
  mean over all positions, the zeroed ones included.
  """
  per_example_loss = loss_object(real, pred)

  is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
  per_example_loss *= tf.cast(is_real_token, dtype=per_example_loss.dtype)

  return tf.reduce_mean(per_example_loss)

## Checkpoints (Object-based saving)

In [None]:
# Checkpoints are written under checkpoint_dir with the file prefix "ckpt"
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with both models so all three are saved and
# restored as one unit
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)



In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced training step on a batch and update the weights.

  Args:
    inp: batch of input id sequences.
    targ: batch of target id sequences; column 0 is never used as a target,
      decoding starts from the '<start>' id instead.
    enc_hidden: initial encoder state.

  Returns:
    The summed per-step loss divided by the target length (targ.shape[1]).
    Gradients are taken of the undivided sum.
  """
  target_len = targ.shape[1]
  start_id = title_tokenizer.word_index['<start>']
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    # The decoder starts from the encoder's final state and a column of
    # '<start>' ids, one per example in the batch.
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, target_len):
      step_target = targ[:, t]

      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(step_target, predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(step_target, 1)

  batch_loss = (loss / int(target_len))

  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
# Resume from the most recent checkpoint in checkpoint_dir before training
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

EPOCHS = 200
# One entry per completed epoch: the mean batch loss of that epoch
loss_arr = []

for epoch in range(EPOCHS):
  start = time.time()

  # The same zero state is passed to every batch of the epoch; train_step
  # does not hand an updated encoder state back.
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    # Progress line every 50 batches
    if batch % 50 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 50 epochs; an interrupted run loses
  # everything trained since the last multiple of 50
  if (epoch + 1) % 50 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  # checkpoint.save(file_prefix = checkpoint_prefix)

  loss_arr.append(total_loss / steps_per_epoch)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))




Epoch 1 Batch 0 Loss 1.1005
Epoch 1 Batch 50 Loss 1.0485
Epoch 1 Loss 0.9905
Time taken for 1 epoch 41.40394306182861 sec

Epoch 2 Batch 0 Loss 0.9417
Epoch 2 Batch 50 Loss 1.0179
Epoch 2 Loss 0.8744
Time taken for 1 epoch 7.045527935028076 sec

Epoch 3 Batch 0 Loss 0.8013
Epoch 3 Batch 50 Loss 0.7370
Epoch 3 Loss 0.7857
Time taken for 1 epoch 7.019021987915039 sec

Epoch 4 Batch 0 Loss 0.6994
Epoch 4 Batch 50 Loss 0.8771
Epoch 4 Loss 0.7130
Time taken for 1 epoch 7.023274183273315 sec

Epoch 5 Batch 0 Loss 0.7121
Epoch 5 Batch 50 Loss 0.8456
Epoch 5 Loss 0.6601
Time taken for 1 epoch 6.954793930053711 sec

Epoch 6 Batch 0 Loss 0.5111
Epoch 6 Batch 50 Loss 0.5822
Epoch 6 Loss 0.6112
Time taken for 1 epoch 7.003277540206909 sec

Epoch 7 Batch 0 Loss 0.4613
Epoch 7 Batch 50 Loss 0.5021
Epoch 7 Loss 0.5715
Time taken for 1 epoch 6.918026447296143 sec

Epoch 8 Batch 0 Loss 0.5399
Epoch 8 Batch 50 Loss 0.4337
Epoch 8 Loss 0.5382
Time taken for 1 epoch 6.956732273101807 sec

Epoch 9 Batch 0 

KeyboardInterrupt: ignored

In [None]:
# Mean training loss of each completed epoch, as collected in loss_arr
plt.plot(loss_arr)
plt.ylabel('loss')
plt.show()

In [None]:
# NOTE(review): prints the whole input vocabulary mapping; this looks like a
# debugging leftover - confirm whether it is still wanted.
print(text_tokenizer.word_index)

def evaluate(sentence):
  """Greedy-decode a headline for `sentence` with the encoder and decoder.

  Args:
    sentence: raw article text; it is cleaned with preprocess_sentence().

  Returns:
    (result, sentence, attention_plot):
      result: the generated words, each followed by one space; '<end>' is
        included when the decoder produced it.
      sentence: the preprocessed input text.
      attention_plot: array of shape (max_length_targ, max_length_inp) with
        one row of attention weights per decoding step; rows of steps that
        were never reached stay zero.

  Raises:
    KeyError: if the decoder predicts an id that has no entry in
      title_tokenizer.index_word.
  """
  attention_plot = np.zeros((max_length_targ, max_length_inp))

  sentence = preprocess_sentence(sentence)

  # split() without an argument drops the empty strings that consecutive
  # spaces would produce (split(' ') kept them and they were encoded as 0 in
  # the middle of the input). Words that are not in the vocabulary are still
  # encoded as 0, the padding id.
  word_to_id = text_tokenizer.word_index
  inputs = [word_to_id.get(token, 0) for token in sentence.split()]

  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                         maxlen=max_length_inp,
                                                         padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  # Batch of one, starting from an all-zero encoder state
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([title_tokenizer.word_index['<start>']], 0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)

    # storing the attention weights to plot later on
    attention_weights = tf.reshape(attention_weights, (-1, ))
    attention_plot[t] = attention_weights.numpy()

    # Greedy choice: the highest-scoring word at this step
    predicted_id = tf.argmax(predictions[0]).numpy()
    predicted_word = title_tokenizer.index_word[predicted_id]

    result += predicted_word + ' '

    if predicted_word == '<end>':
      break

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence, attention_plot



In [None]:
# function for plotting the attention weights
def plot_attention(attention, sentence, predicted_sentence):
  """Show an attention matrix as a heat map labelled with the tokens.

  Args:
    attention: 2-D array of weights; assumed to have one row per generated
      token and one column per input token (as the commented-out call in
      generate_summary slices it).
    sentence: sequence of input tokens, used as column labels.
    predicted_sentence: sequence of generated tokens, used as row labels.
  """
  fig = plt.figure(figsize=(10,10))
  ax = fig.add_subplot(1, 1, 1)
  ax.matshow(attention, cmap='viridis')

  fontdict = {'fontsize': 14}

  # Pin one tick per token before attaching the labels. Labelling first and
  # relying on a leading '' entry plus a unit locator ties the label offset
  # to how many ticks the locator happens to generate.
  ax.set_xticks(range(len(sentence)))
  ax.set_yticks(range(len(predicted_sentence)))
  ax.set_xticklabels(sentence, fontdict=fontdict, rotation=90)
  ax.set_yticklabels(predicted_sentence, fontdict=fontdict)

  plt.show()

In [None]:
def generate_summary(sentence):
  """Generate a headline for `sentence` and print it below the cleaned input.

  Prints the preprocessed input, an empty line, the 'Generated summary'
  label and the decoded text. Returns None.
  """
  result, sentence, attention_plot = evaluate(sentence)

  report_lines = ['Input: %s' % (sentence), '', 'Generated summary', result]
  for line in report_lines:
    print(line)

  # Attention plotting is currently disabled:
  # attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
  # plot_attention(attention_plot, sentence.split(' '), result.split(' '))

## Restore the latest checkpoint and test

In [None]:
# restoring the latest checkpoint in checkpoint_dir into the optimizer,
# encoder and decoder tracked by `checkpoint`
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ff8b3b9ec90>

In [None]:
generate_summary("The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7. In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,? the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ? one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ? were issued by the Daman and Diu administration a day apart. The circular was withdrawn through a one-line order issued late in the evening by the UT?s department of personnel and administrative reforms.?The circular is ridiculous. There are sensitivities involved. How can the government dictate who I should tie rakhi to? We should maintain the professionalism of a workplace? an official told Hindustan Times earlier in the day. She refused to be identified.The notice was issued on Daman and Diu administrator and former Gujarat home minister Praful Kodabhai Patel?s direction, sources said.Rakshabandhan, a celebration of the bond between brothers and sisters, is one of several Hindu festivities and rituals that are no longer confined of private, family affairs but have become tools to push politic al ideologies.In 2014, the year BJP stormed to power at the Centre, Rashtriya Swayamsevak Sangh (RSS) chief Mohan Bhagwat said the festival had ?national significance? 
and should be celebrated widely ?to protect Hindu culture and live by the values enshrined in it?. The RSS is the ideological parent of the ruling BJP.Last year, women ministers in the Modi government went to the border areas to celebrate the festival with soldiers. A year before, all cabinet ministers were asked to go to their constituencies for the festival.")
print('')
print('Actual summary:-')
# NOTE(review): this reference headline does not appear to describe the
# article passed to generate_summary() above (Daman and Diu circular on
# Rakshabandhan) - confirm which reference summary belongs here.
print('Madhesi Morcha withdraws support to Nepalese government')

Input: <start> the daman and diu administration on wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on so <end>

Generated summary
rakshabandhan compulsory for employees in daman and diu <end> 

Actual summary:-
Madhesi Morcha withdraws support to Nepalese government


In [None]:
generate_summary("India recorded their lowest ODI total in New Zealand after getting all out for 92 runs in 30.5 overs in the fourth ODI at Hamilton on Thursday. Seven of India's batsmen were dismissed for single-digit scores, while their number ten batsman Yuzvendra Chahal top-scored with 18*(37). India's previous lowest ODI total in New Zealand was 108.")
print('')
print('Actual summary:-')
print('India get all out for 92, their lowest ODI total in New Zealand')

Input: <start> india recorded their lowest odi total in new zealand after getting all out for runs in . overs in the fourth odi at hamilton on thursday . seven of india s batsmen were dismissed for single digit scor <end>

Generated summary
trump s daughter is the theatres <end> 

Actual summary:-
India get all out for 92, their lowest ODI total in New Zealand


In [None]:
generate_summary("Weeks after ex-CBI Director Alok Verma told the Department of Personnel and Training to consider him retired, the Home Ministry asked him to join work on the last day of his fixed tenure as Director on Thursday. The ministry directed him to immediately join as DG, Fire Services, the post he was transferred to after his removal as CBI chief.")

print('')
print('Actual summary:-')
# NOTE(review): this reference headline does not appear to describe the
# article passed to generate_summary() above (ex-CBI Director Alok Verma) -
# confirm which reference summary belongs here.
print('kerry to go to paris in gesture of sympathy ')

Input: <start> weeks after ex cbi director alok verma told the department of personnel and training to consider him retired , the home ministry asked him to join work on the last day of his fixed tenure as director  <end>

Generated summary
main paat shot dead at rbi <end> 

Actual summary:-
kerry to go to paris in gesture of sympathy 


In [None]:
generate_summary('kaydee king kaydeeking november , the lesson from tonight s dem losses time for democrats to start listening to the voters . stop running the same establishment candidates . people for bernie people bernie november , if dems didn t want a tight race they shouldn t have worked against bernie . walker bragman walkerbragman november , new york times columnist paul krugman , who was one of hillary clinton s most outspoken surrogates during the contentious democratic primary , blamed clinton s poor performance on green party candidate jill stein , who has so far received a negligible number of votes nationally , saying stein was the ralph nader of in preventing a clinton victory . the account berniesteachers threw krugman s analysis back in his face . your candidate was the issue . take responsibility . https t . co khyouusrfs teachers for bernie berniesteachers november , ana navarro , a republican who recently endorsed hillary clinton , summed up the preposterous nature of the presidential election in this tweet gop nominated the only damn candidate who could lose to hillary clinton . democrats nominated the only damn candidate who could lose to trump ana navarro ananavarro november , popular left wing facebook page the other , which was pro sanders during the primary , responded to trump s surge by simply posting a meme of sanders face with the text all this could ve been avoided . thanks for nothing , dnc ! the meme has been shared almost , times in less than an hour posted by the other on tuesday , november , while bernie sanders endorsed hillary clinton just before the democratic national convention in july , many of his supporters remained adamant in their refusal to support the dnc anointed candidate , pointing to wikileaks revelations that top officials at the dnc had been working behind the scenes to tip the scales in clinton s favor by coordinating with media figures to circulate anti sanders narratives . 
rather than attribute a potential trump presidency to the gop nominee s perceived popularity among voters , the closeness of this election could be credited to hillary clinton s unfavorable ratings . according to realclearpolitics , anywhere between and percent of voters had a negative opinion of the democratic nominee . as of pm eastern , florida , michigan , pennsylvania , and wisconsin remain too close to call . clinton has electoral votes to trump s . zach cartwright is an activist and author from richmond , virginia . he enjoys writing about politics , government , and the media . send him an email at email protected ')

print('')
print('Actual summary:-')
print('bernie supporters on twitter erupt in anger against the dnc we tried to warn you !')

Input: <start> kaydee king kaydeeking november , the lesson from tonight s dem losses time for democrats to start listening to the voters . stop running the same establishment candidates . people for bernie people b <end>

Generated summary
i don t mind govt at woman kills pregnant of it is a town from moving army kills pakistani actress arrested for ? crore in chennai so scared to book am a man arrested 

Actual summary:-
bernie supporters on twitter erupt in anger against the dnc we tried to warn you !


In [None]:
generate_summary('On a day when two of its non-playing members, including a coach, tested positive for Covid-19, Chennai Super Kings (CSK) informed the Board of Control for Cricket in India (BCCI) that they won’t play their next match against Rajasthan Royals in New Delhi on Wednesday. It is learnt that CSK informed the BCCI that they could step back on the field only once all players who came in contact with the two people who tested positive cleared three tests over a six day period, as per Covid-19 standard operating procedure (SOP) of the IPL. The game between Kolkata Knight Riders and Royal Challengers Bangalore in Ahmedabad was rescheduled on Monday after two KKR players, spinner Varun Chakravarty and medium pacer Sandeep Warrier, tested positive for Covid-19.')
    
    
print('')
print('Actual summary:-')
# NOTE(review): this reference headline does not appear to describe the
# article passed to generate_summary() above (Chennai Super Kings Covid-19
# cases) - confirm which reference summary belongs here.
print('the battle of new york why this primary matters')

Input: <start> on a day when two of its non playing members , including a coach , tested positive for covid , chennai super kings csk informed the board of control for cricket in india bcci that they won t play thei <end>

Generated summary
only boost it <end> 

Actual summary:-
the battle of new york why this primary matters
