In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, InputLayer, TimeDistributed
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split

import string, re

In [2]:
# Load the two news CSV exports from ./dataset; both files are decoded as
# ISO-8859-1 instead of pandas' default UTF-8.
summary = pd.read_csv("./dataset/news_summary.csv", encoding='iso-8859-1')
raw = pd.read_csv("./dataset/news_summary_more.csv", encoding='iso-8859-1')

In [3]:
# Preprocessing / model settings shared by the cells below.
config = {
    # Articles need strictly more than this many space-separated words to be kept.
    'min_text_len': 40,
    # Strict upper bound in the raw-length filter; also the inclusive cap applied
    # after cleaning and the padded length of the encoded articles.
    'max_text_len': 60,
    # Cap on the cleaned headline length in words (marker tokens are allowed on top).
    'max_summary_len': 30,
    # Not referenced in the visible cells - presumably the recurrent layer width; confirm downstream.
    'latent_dim': 300,
    # Not referenced in the visible cells - presumably the embedding width; confirm downstream.
    'embedding_dim': 200,
}

In [4]:
# Stack both CSV exports into one frame with a fresh 0..n-1 index.
df = pd.concat([raw, summary]).reset_index(drop=True)

print(f'Before filtering: {df.shape}')

# Keep rows whose space-split word count lies strictly between the configured bounds.
text_word_counts = df['text'].str.split(" ").str.len()
length_mask = text_word_counts.gt(config['min_text_len']) & text_word_counts.lt(config['max_text_len'])
df = df.loc[length_mask].reset_index(drop=True)

print(f'After filtering: {df.shape}')
df.columns

Before filtering: (102915, 6)
After filtering: (54572, 6)


Index(['headlines', 'text', 'author', 'date', 'read_more', 'ctext'], dtype='object')

In [5]:
# ran_num = np.random.randint(1, 10000)
# print(f"Text: {df['text'][ran_num]}")

# print(f"Summary: {df['headlines'][ran_num]}")
# # print()
# print(f"Text length: {len(df['text'][ran_num].split())}") 
# print(f"Summary length: {len(df['headlines'][ran_num].split())}")

In [6]:
# Ordered substitution rules used by text_strip. They run one after another on the
# lower-cased input, so each rule sees the output of the previous one; do not merge
# or reorder them (e.g. "-. x" relies on the "." rule running before the "-" rule).
_TEXT_STRIP_RULES = tuple(
  (re.compile(pattern), replacement)
  for pattern, replacement in (
    # Tabs, carriage returns and newlines become single spaces
    (r"[\t\r\n]", " "),
    # Remove - if it occurs more than one time consecutively
    (r"--+", " "),
    # Remove . if it occurs more than one time consecutively
    (r"\.\.+", " "),
    # Remove the special characters < > ( ) | & © ø [ ] ' " , ; ? ~ * !
    (r"[<>()|&©ø\[\]\'\",;?~*!]", " "),
    # Remove a literal backslash followed by x9 and a digit (escaped byte residue such as \x92)
    (r"\\x9\d", " "),
    # Replace CM# and CHG# with one placeholder; the input is already lower-case,
    # and the placeholder is emitted lower-case as well
    (r"(?:cm|chg)\d+", "cm_num"),
    # Remove . - : when they end a word, i.e. are followed by whitespace
    (r"\.\s+", " "),
    (r"-\s+", " "),
    (r":\s+", " "),
    # Collapse runs of whitespace into one space
    (r"\s+", " "),
  )
)


def text_strip(sentence):
  """Normalise one piece of text for tokenisation.

  The input is converted with ``str()`` (so non-string values such as ``None``
  or numbers are accepted), lower-cased, and passed through the ordered rules
  in ``_TEXT_STRIP_RULES``.

  Args:
    sentence: Text to clean; any object accepted by ``str()``.

  Returns:
    The cleaned, lower-case string. Leading/trailing spaces are not stripped,
    and a ``.``, ``-`` or ``:`` at the very end of the string is kept because
    those rules only fire when whitespace follows the character.
  """
  cleaned = str(sentence).lower()
  for pattern, replacement in _TEXT_STRIP_RULES:
    cleaned = pattern.sub(replacement, cleaned)
  return cleaned

In [7]:
# Preview the raw article text before cleaning. Only the last expression of a
# cell is rendered, so the earlier bare `df.columns` produced no output and was dropped.
df.text

0        Pakistani singer Rahat Fateh Ali Khan has deni...
1        India recorded their lowest ODI total in New Z...
2        Andhra Pradesh CM N Chandrababu Naidu has said...
3        Isha Ghosh, an 81-year-old member of Bharat Sc...
4        Filmmaker Karan Johar and actress Tabu turned ...
                               ...                        
54567    An investigation by India Today has unmasked a...
54568    The Kangana Ranaut, Shahid Kapoor and Saif Ali...
54569    A ticket collector on Thursday allegedly bit o...
54570    Aamir Khan, while talking about reality shows ...
54571    The Maharashtra government has initiated an in...
Name: text, Length: 54572, dtype: object

In [8]:
# Clean the articles, and clean + wrap the headlines in their start/end marker tokens.
df['cleaned_text'] = df['text'].apply(text_strip)
df['cleaned_headlines'] = df['headlines'].apply(
    lambda headline: 'sostok ' + ('_START_ ' + text_strip(headline) + ' _END_') + ' eostok'
)

# Length limits are inclusive here; the +4 leaves room for the four marker tokens.
text_fits = df['cleaned_text'].str.split().str.len().le(config['max_text_len'])
summary_fits = df['cleaned_headlines'].str.split().str.len().le(config['max_summary_len'] + 4)

# Keep the rows passing both limits, then swap the raw columns for the cleaned ones.
df = (
    df[text_fits & summary_fits]
    .copy()
    .reset_index(drop=True)
    .drop(columns=['text', 'headlines'])
    .rename(columns={'cleaned_text': 'text', 'cleaned_headlines': 'summary'})
)
                                      

In [9]:

# Inputs are the cleaned articles, targets the marker-wrapped headlines.
X, Y = df['text'], df['summary']

# Hold out 10% of the pairs; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
)



In [10]:
def get_rare_words(text_col, thresh=5):
  """Count how many vocabulary words occur fewer than ``thresh`` times.

  A fresh ``Tokenizer`` is fitted on ``text_col`` and its ``word_counts``
  mapping (word -> number of occurrences) is inspected. The share of rare
  words is printed as a percentage.

  Args:
    text_col: Iterable of text strings (e.g. a pandas Series).
    thresh: A word is rare when its count is strictly below this value.
      Defaults to 5, the value that was previously hard-coded.

  Returns:
    ``(cnt, tot_cnt)``: the number of rare words and the vocabulary size.
    For an empty vocabulary this is ``(0, 0)`` and 0.0 is printed instead
    of raising ``ZeroDivisionError``.
  """
  text_tokenizer = Tokenizer()
  text_tokenizer.fit_on_texts(list(text_col))

  word_counts = text_tokenizer.word_counts
  tot_cnt = len(word_counts)
  cnt = sum(1 for occurrences in word_counts.values() if occurrences < thresh)

  # Guard the percentage so an empty corpus does not divide by zero.
  rare_pct = (cnt / tot_cnt) * 100 if tot_cnt else 0.0
  print("% of rare words in vocabulary:", rare_pct)

  return cnt, tot_cnt

# Rare-word statistics for the training articles only; both counts are used to
# size the tokenizer vocabulary in the following cell.
x_train_cnt, x_train_tot_cnt = get_rare_words(text_col=x_train)
# Echo the (rare, total) pair as the cell output.
x_train_cnt, x_train_tot_cnt

% of rare words in vocabulary: 64.15386975269716


(37641, 58673)

In [11]:
# Limit the tokenizer to (total - rare) words, i.e. the number of training-vocabulary
# words that get_rare_words did not flag as rare.
# NOTE(review): Keras documents num_words as keeping only the num_words-1 most
# frequent words - confirm this sizing (and x_voc below) is what is intended.
x_tokenizer = Tokenizer(num_words=x_train_tot_cnt - x_train_cnt) 

# The vocabulary is fitted on the training split only.
x_tokenizer.fit_on_texts(list(x_train))

# Encode both splits with the same tokenizer; the held-out x_test split is
# carried under *_val names from here on.
x_tr_seq = x_tokenizer.texts_to_sequences(x_train) 
x_val_seq = x_tokenizer.texts_to_sequences(x_test)

# Pad at the end ('post') to config['max_text_len'] ids; the truncation side is
# left at the pad_sequences default.
x_tr = pad_sequences(x_tr_seq,  maxlen=config['max_text_len'], padding='post')
x_val = pad_sequences(x_val_seq, maxlen=config['max_text_len'], padding='post')

# num_words + 1 - presumably the input dimension of an embedding layer; confirm
# where x_voc is consumed.
x_voc = x_tokenizer.num_words + 1

# Sanity check: display one cleaned training article.
# NOTE(review): this looks like a label-based lookup on the shuffled Series, so it
# depends on index label 0 having landed in the training split - confirm.
x_train[0]

'pakistani singer rahat fateh ali khan has denied receiving any notice from the enforcement directorate over allegedly smuggling foreign currency out of india it would have been better if the authorities would have served the notice first if any and then publicised this reads a press release issued on behalf of rahat the statement further called the allegation bizarre .'