# 1. Data Preprocessing:

● Load the provided dataset and perform exploratory data analysis.

● Preprocess the text data: remove stop words, perform tokenization, stem or lemmatize the words
etc.

● Split the dataset into a training set and a testing set

In [None]:
#Downloading dependencies/packages
!pip install tensorflow -q
!pip install rouge -q
!pip install textacy -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.4/320.4 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#Importing all libraries

import numpy as np
import pandas as pd

import re
import string
import csv

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow import keras
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Dense, Input, Embedding, Concatenate, TimeDistributed, Bidirectional, GRU
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.utils import plot_model

from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu

In [None]:
# Read the train / test / validation CSV splits into DataFrames.
# Every read uses the python parsing engine and skips malformed rows.
_read_options = {'engine': 'python', 'on_bad_lines': 'skip'}
train = pd.read_csv('/content/train.csv', **_read_options)
test = pd.read_csv('/content/test.csv', **_read_options)
validation = pd.read_csv('/content/validation.csv', **_read_options)

In [None]:
# Print the test split's index range, column names, non-null counts and dtypes
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1951 entries, 0 to 1950
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   article     1951 non-null   object
 1   highlights  1951 non-null   object
dtypes: object(2)
memory usage: 30.6+ KB


In [None]:
# Keep only the leading rows of each split: 50 train, 30 test, 20 validation
train, test, validation = train.head(50), test.head(30), validation.head(20)

In [None]:
# Display the article text of the training row labelled 0
train['article'][0]

"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained 

In [None]:
# Display the reference summary (highlights) of the training row labelled 0
train['highlights'][0]

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed .'

In [None]:
# Dropping unnecessary columns
# errors='ignore' makes the drop a no-op when a split has no 'id' column
# (or when this cell is run a second time); a plain drop raises KeyError then.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')
validation = validation.drop(columns=['id'], errors='ignore')

In [None]:
# Text-normalisation pipeline assembled from textacy preprocessing steps
from textacy import preprocessing as tprep


def _newlines_to_spaces(text):
    """Return *text* with every newline character replaced by one space."""
    return text.replace("\n", " ")


# Steps run in the order listed: replacements, then removals, then normalisers.
_cleaning_steps = [
    tprep.replace.emails,
    tprep.replace.emojis,
    tprep.replace.urls,
    tprep.replace.phone_numbers,
    tprep.replace.hashtags,
    tprep.replace.currency_symbols,
    _newlines_to_spaces,
    tprep.remove.html_tags,
    tprep.remove.brackets,
    # tprep.remove.punctuation is not applied at this stage
    tprep.normalize.hyphenated_words,
    tprep.normalize.quotation_marks,
    tprep.normalize.unicode,
    tprep.normalize.bullet_points,
    tprep.normalize.whitespace,
]

process = tprep.make_pipeline(*_cleaning_steps)

# Run the textacy pipeline over the article column of every split
for _frame in (train, test, validation):
    _frame['article'] = _frame['article'].apply(process)

In [None]:
# NLTK / spaCy resources used by the article-cleaning step
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the sentence-tokenizer data from 'punkt_tab'
# instead of 'punkt'; fetch both so word_tokenize works on either version.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

import spacy
nlp = spacy.load("en_core_web_sm")

def clean_article(para):
    """Lower-case an article, keep only letters and drop English stop words.

    Args:
        para: Article text as a single string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    para = para.lower()
    # Every non-letter becomes a space so word boundaries survive.
    para = re.sub('[^a-zA-Z]', ' ', para)

    # Build the stop-word set once per call rather than once per token.
    stop_words = set(stopwords.words('english'))

    # Tokens are kept verbatim. Converting a one-word spaCy Doc back to a
    # string yields the same text, so running the pipeline per token only
    # cost time. NOTE(review): no stemming/lemmatization is applied here.
    kept_tokens = [word for word in word_tokenize(para) if word not in stop_words]
    return ' '.join(kept_tokens)

# Apply the stop-word / non-letter cleanup to the articles of every split
for _frame in (train, test, validation):
    _frame['article'] = _frame['article'].apply(clean_article)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def clean_highlights(para):
    """Lower-case a summary and keep only letters, preserving word boundaries.

    Args:
        para: Reference summary text as a single string.

    Returns:
        The summary's words in lower case, separated by single spaces.
    """
    para = para.lower()
    # Substitute a space, not the empty string: deleting the separators would
    # glue the whole summary into one giant "word" for the tokenizer.
    para = re.sub('[^a-zA-Z]', ' ', para)
    # Collapse the runs of spaces left behind by punctuation and newlines.
    return ' '.join(para.split())

# Apply the summary cleanup to the highlights of every split
for _frame in (train, test, validation):
    _frame['highlights'] = _frame['highlights'].apply(clean_highlights)

In [None]:
# Text Length Analysis: mean string length (in characters) of the training texts
import math

article_lengths = list(map(len, train['article']))
highlights_lengths = list(map(len, train['highlights']))

# Averages are rounded down to whole numbers before printing.
avg_article_length = math.floor(sum(article_lengths) / len(article_lengths))
avg_highlights_length = math.floor(sum(highlights_lengths) / len(highlights_lengths))

print(f"Average Article Length: {avg_article_length}")
print(f"Average Summary Length: {avg_highlights_length}")

Average Article Length: 2751
Average Summary Length: 306


In [None]:
# Separate each split into model inputs (articles) and targets (highlights)
X_train, y_train = train['article'], train['highlights']
X_test, y_test = test['article'], test['highlights']
X_val, y_val = validation['article'], validation['highlights']

In [None]:
# Maximum sequence lengths, measured in tokens
# X_train / y_train hold plain strings, so len(text) would count characters
# and inflate the padded length; split() counts the whitespace-separated
# words that are actually tokenized and padded.
maxlen_articles = max(len(text.split()) for text in X_train)
# Floor of 2: training feeds the highlights shifted by one step
# ([:, :-1] as decoder input, [:, 1:] as target), which needs two positions.
maxlen_highlights = max(2, max(len(text.split()) for text in y_train))

In [None]:
# Tokenize the articles and highlights
# Each tokenizer is fitted on the training split only and then reused to
# encode the test and validation splits.
tokenizer_articles = Tokenizer()
tokenizer_articles.fit_on_texts(X_train)
X_train_seq = tokenizer_articles.texts_to_sequences(X_train)
X_test_seq = tokenizer_articles.texts_to_sequences(X_test)
X_val_seq = tokenizer_articles.texts_to_sequences(X_val)

tokenizer_highlights = Tokenizer()
tokenizer_highlights.fit_on_texts(y_train)
y_train_seq = tokenizer_highlights.texts_to_sequences(y_train)
y_test_seq = tokenizer_highlights.texts_to_sequences(y_test)
# Named y_val_seq (not y_test_val): this is the name the padding step reads,
# and it mirrors X_val_seq.
y_val_seq = tokenizer_highlights.texts_to_sequences(y_val)

In [None]:
# Bring every integer sequence to a fixed length, padding at the end
def _post_pad(sequences, length):
    """Return *sequences* as an array with *length* columns, padded at the end."""
    return pad_sequences(sequences, maxlen=length, padding='post')


X_train_padded = _post_pad(X_train_seq, maxlen_articles)
X_test_padded = _post_pad(X_test_seq, maxlen_articles)
X_val_padded = _post_pad(X_val_seq, maxlen_articles)

y_train_padded = _post_pad(y_train_seq, maxlen_highlights)
y_test_padded = _post_pad(y_test_seq, maxlen_highlights)
y_val_padded = _post_pad(y_val_seq, maxlen_highlights)

In [None]:
# Data-analysis: shapes of the padded arrays that are fed to the model
# (the padded arrays are the only train/test matrices this notebook builds).
print("Training Sequence", X_train_padded.shape)
print('Target Values Shape', y_train_padded.shape)
print('Test Sequence', X_test_padded.shape)
print('Target Test Shape', y_test_padded.shape)

In [None]:
# Vocabulary sizes: one slot per word known to each tokenizer, plus one extra
# because word_index starts at 1 and id 0 is used for padding.
vocab_size_articles = 1 + len(tokenizer_articles.word_index)
vocab_size_highlights = 1 + len(tokenizer_highlights.word_index)

In [None]:
# Width of each word-embedding vector; passed to both Embedding layers
embedding_dim = 100

# 2. Model Building:

● Develop a sequence-to-sequence model for text summarization.
Explore architectures like LSTM(Long Short-Term Memory),
GRU (Gated Recurrent Units),
or even transformer-based models like
BERT (Bidirectional Encoder Representations from Transformers),
GPT (Generative Pre-trained Transformer), etc.

● Train the model on the training set and tune the hyperparameters for optimal performance.


In [None]:
# Encoder input
# Base hidden size; the LSTM layers below are built with latent_dim*2 units.
latent_dim = 128
# One integer token id per position, padded to maxlen_articles.
encoder_input = Input(shape=(maxlen_articles,))
# NOTE(review): the Embedding is created without mask_zero, so the zero
# padding appended by pad_sequences is fed through the LSTM like real tokens —
# confirm this is intended.
encoder_embedding = Embedding(vocab_size_articles, embedding_dim, trainable=True)(encoder_input)
# return_state=True exposes the final hidden and cell states alongside the output.
encoder_lstm = LSTM(latent_dim*2, return_state=True)
# state_h / state_c seed the decoder LSTM and are the outputs of the
# inference-time encoder model.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

In [None]:
# Decoder input
# The time axis is left variable (None): training feeds maxlen_highlights - 1
# steps (the target shifted by one) and inference feeds a single step, so a
# fixed length of maxlen_highlights would reject both inputs.
decoder_inputs = Input(shape=(None,))
# The embedding starts from random weights (no pretrained matrix is loaded),
# so it has to be trainable; frozen random vectors cannot be learned from.
decoder_embedding = Embedding(vocab_size_highlights, embedding_dim, trainable=True)(decoder_inputs)
# return_sequences=True yields one output per target position;
# return_state=True lets the inference decoder carry its state between steps.
decoder_lstm = LSTM(latent_dim*2, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final hidden and cell states.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])
# Per-position probability distribution over the highlights vocabulary.
decoder_dense = Dense(vocab_size_highlights, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Assemble the training model: article ids and decoder input ids go in,
# per-position vocabulary probabilities come out.
model = Model(inputs=[encoder_input, decoder_inputs], outputs=decoder_outputs)
model.summary()

In [None]:
# Configure training: Adam optimiser with sparse categorical cross-entropy,
# which takes integer token ids as targets.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
# Train the model
# Stop when the validation loss has not improved for two epochs and roll back
# to the best weights seen (the callback passed to fit must exist first).
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Teacher forcing: the decoder reads positions [0 .. T-2] of each summary and
# is trained to predict positions [1 .. T-1].
model.fit(
    [X_train_padded, y_train_padded[:, :-1]],
    y_train_padded[:, 1:],
    epochs=10,
    batch_size=128,
    validation_data=([X_val_padded, y_val_padded[:, :-1]], y_val_padded[:, 1:]),
    callbacks=[early_stop],
)

In [None]:
# Generate summaries for test data
# Inference encoder: article ids -> final LSTM hidden and cell states.
encoder_model = Model(inputs=encoder_input, outputs=[state_h, state_c])

# The inference decoder receives its previous states as explicit inputs so it
# can be stepped one token at a time.
decoder_state_input_h = Input(shape=(latent_dim*2,))
decoder_state_input_c = Input(shape=(latent_dim*2,))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Inference tensors get their own names: rebinding state_h / state_c /
# decoder_outputs here would make a second run of this cell build
# encoder_model from decoder tensors instead of the encoder states.
inference_outputs, inference_state_h, inference_state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [inference_state_h, inference_state_c]

# Reuse the trained layers (shared weights) to map LSTM outputs to token
# probabilities.
inference_outputs = decoder_dense(inference_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [inference_outputs] + decoder_states)

In [None]:
# Function to generate summaries
def generate_summary(input_seq):
    """Greedily decode a summary for one encoded article.

    Args:
        input_seq: Padded integer article sequence(s) passed straight to
            ``encoder_model.predict``; decoding assumes a batch of one.

    Returns:
        The decoded words joined by single spaces (possibly an empty string).
    """
    # list(...) so the states can be concatenated with [target_seq] below.
    states_value = list(encoder_model.predict(input_seq, verbose=0))
    # Decoding starts from token id 0.
    target_seq = np.zeros((1, 1))

    decoded_words = []

    # Hard cap on length so decoding always terminates.
    while len(decoded_words) < maxlen_highlights:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Id 0 is padding and has no entry in index_word; a direct lookup
        # raised KeyError. Treat an unknown id like the 'end' marker and stop.
        sampled_word = tokenizer_highlights.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == 'end':
            break

        decoded_words.append(sampled_word)

        # Feed the chosen token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

In [None]:
# Print the cleaned article and its generated summary for the first ten test rows
for row in range(10):
    # A one-row slice keeps the batch dimension the models expect.
    generated = generate_summary(X_test_padded[row:row + 1])
    print(f"Article: {X_test[row]}")
    print(f"Summary: {generated}\n")

# 3. Evaluation:

● Test the model on the testing set.

● Evaluate the model using appropriate metrics such as ROUGE (Recall-Oriented Understudy for
Gisting Evaluation) scores, BLEU (Bilingual Evaluation Understudy) scores etc.

● Analyze the performance and discuss any limitations and potential improvements.


In [None]:
# Calculate ROUGE scores
# Decode every test article so hypotheses and references line up one-to-one.
generated_summaries = [
    generate_summary(X_test_padded[i:i + 1]) for i in range(len(X_test_padded))
]
reference_summaries = list(y_test)

# The rouge package raises on empty strings, so only non-empty pairs are scored.
scored_pairs = [
    (hyp, ref)
    for hyp, ref in zip(generated_summaries, reference_summaries)
    if hyp.strip() and ref.strip()
]

rouge = Rouge()

print("ROUGE scores:")
if scored_pairs:
    hypotheses, references = (list(column) for column in zip(*scored_pairs))
    # avg=True returns one dict per metric, averaged over all pairs.
    scores = rouge.get_scores(hypotheses, references, avg=True)
    for metric, values in scores.items():
        print(f"{metric}: {values['f']}")
else:
    print("No non-empty summary pairs to score.")

In [None]:
# Calculate BLEU score
# corpus_bleu expects, for each hypothesis, a LIST of tokenised references;
# there is one reference summary per article, hence the extra brackets.
references_tokens = [[reference.split()] for reference in y_test]
# One tokenised generated summary per test article, in the same order.
candidates_tokens = [
    generate_summary(X_test_padded[i:i + 1]).split() for i in range(len(X_test_padded))
]
bleu_score = corpus_bleu(references_tokens, candidates_tokens)

print(f"\nBLEU score: {bleu_score}")