<img src="https://www.scienze.unimib.it/sites/sc02/files/scientifica_logo_scuola.png" align="left" width="150px" height="160px">

<h1>
Text Mining Project <br>
Task 2: <b>Text Summarization</b>
on <a href="https://data.mendeley.com/datasets/9rw3vkcfy4/6">Web of Science</a> Dataset
</h1>

<h3>Students: Arizzi Sara 845374, Bidone Federico 892054</h3>


# Setup

In [None]:
#@title Libraries
from google.colab import drive

import requests
import re
import time
import networkx as nx
import numpy as np
import pandas as pd
from zipfile import ZipFile
from io import BytesIO
import gzip
import shutil
from datetime import datetime
import pickle
import random

# Extractive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize

# BART Fine-Tuned
!pip install git+https://github.com/keras-team/keras-nlp.git -q
import os
import keras_nlp
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
os.environ["KERAS_BACKEND"] = "tensorflow"

# BART Pre-Trained
from transformers import pipeline

# Evaluation
! pip install rouge
import rouge

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
#@title Dataset Import
#@markdown Download dataset from <a href="https://data.mendeley.com/public-files/datasets/9rw3vkcfy4/files/c9ea673d-5542-44c0-ab7b-f1311f7d61df/file_downloaded">source</a>

zip_file_url = "https://data.mendeley.com/public-files/datasets/9rw3vkcfy4/files/c9ea673d-5542-44c0-ab7b-f1311f7d61df/file_downloaded"

# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get(zip_file_url, timeout=60)
if r.ok:
  # The archive is unpacked from memory into the working directory; the
  # context manager closes the ZipFile once extraction is done.
  with ZipFile(BytesIO(r.content)) as z:
    z.extractall(".")
else:
  print("Request to source has gone wrong")

In [None]:
#@title Read Data
#@markdown - Read X: train text
# One abstract per line. Files are opened with `with` so the handles are
# closed, and the content is no longer stored in a variable named `all`
# (which shadowed the builtin).
with open("WOS5736/X.txt", "r") as f:
  X = f.read().split("\n")

#@markdown - Read Y: summaries
drive.mount('/content/drive')
with open("drive/MyDrive/Progetto_Text_Mining/other/summaries5736.txt", "r") as f:
  Y = f.read().split("\n")
# NOTE(review): X and Y are paired positionally by the next cell with zip(),
# which silently truncates to the shorter list -- confirm both files have the
# same number of lines.

#@markdown - Read Glove 6B 100d file
GLOVE_FILE = "drive/MyDrive/glove.6B.100d.txt.gz"
# Decompress the GloVe archive from Drive into a local plain-text file.
with gzip.open(GLOVE_FILE, 'rb') as f_in:
  with open('glove.txt', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
#@title Shuffle and Split
# A dedicated, seeded generator makes the shuffle (and therefore the
# train/test split and every score computed on it) reproducible across runs.
SPLIT_SEED = 42
c = list(zip(X, Y))
random.Random(SPLIT_SEED).shuffle(c)
X, Y = zip(*c)
# NOTE: this cell overwrites X and Y, so re-running it without first
# re-running "Read Data" shuffles the already shuffled data again.

# Split data into Train/Test (first 80% for training, remaining 20% for test)
train_abs, test_abs = np.split(X, [int(.8*len(X))])
train_sum, test_sum = np.split(Y, [int(.8*len(Y))])

In [None]:
#@title Rouge Evaluation Function

def evaluate_summary(y_test, predicted):
  """Score a predicted summary against its reference with ROUGE.

  Args:
    y_test: reference (ground-truth) summary text.
    predicted: generated summary text.

  Returns:
    A four-element list ``[rouge_1_f, rouge_2_f, rouge_l_f, mean]`` where the
    three F-scores are rounded to 2 decimals and ``mean`` is their average
    rounded to 3 decimals.

  Any exception raised by the ROUGE library propagates to the caller; the
  evaluation loops in this notebook wrap the call in try/except.
  """
  rouge_score = rouge.Rouge()
  # Rouge.get_scores takes the hypothesis first and the reference second.
  scores = rouge_score.get_scores(predicted, y_test, avg=True)
  score_1, score_2, score_L = (
    round(scores[metric]['f'], 2) for metric in ('rouge-1', 'rouge-2', 'rouge-l')
  )

  return [
    score_1,
    score_2,
    score_L,
    round(np.mean([score_1, score_2, score_L]), 3)
  ]

# Extractive Summarization

#### Tokenization and Normalization

In [None]:
# english stopwords
# Stored as a set: normalize_document tests membership once per token, and a
# set lookup is constant-time whereas a list is scanned linearly.
stop_words = set(nltk.corpus.stopwords.words('english'))

In [None]:
def normalize_document(doc):
  """Return `doc` reduced to lower-case, stop-word-free, letters-only text.

  Every character that is not an ASCII letter or whitespace is removed, the
  text is lower-cased and stripped, tokenized with `nltk.word_tokenize`, and
  the tokens that are not in `stop_words` are joined back with single spaces.
  """
  # `flags` must be passed by keyword: the fourth positional argument of
  # re.sub is `count`, so passing re.I|re.A positionally capped the number of
  # substitutions at 258 instead of setting the flags.
  doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
  doc = doc.lower()
  doc = doc.strip()
  # tokenize document
  tokens = nltk.word_tokenize(doc)
  # filter stopwords out of document
  filtered_tokens = [token for token in tokens if token not in stop_words]
  # re-create document from filtered tokens
  doc = ' '.join(filtered_tokens)
  return doc

### GloVe



In [None]:
# Map each word to its GloVe vector. Every line of glove.txt is parsed as a
# word followed by its coefficients, separated by whitespace.
word_embeddings = {}
# The context manager closes the file even if parsing a line fails.
with open("glove.txt", encoding="utf-8") as f:
  for line in f:
    values = line.split()
    if not values:
      # Skip blank lines instead of failing on values[0].
      continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    word_embeddings[word] = coefs

In [None]:
# Extractive summarization with GloVe sentence embeddings + PageRank.
# For every test abstract: split into sentences, embed each sentence as the
# average of its word vectors, rank sentences by PageRank over the cosine
# similarity graph, keep the top-ranked ones in document order and score the
# result with ROUGE.

# Number of sentences kept in each summary.
num_sentences = 4
# Length of the GloVe vectors (the 100d file is the one loaded above).
embedding_dim = 100
# Loop-invariant, so built once instead of once per abstract.
normalize_corpus = np.vectorize(normalize_document)

rouge_glove = []
for gt, abstract in zip(test_sum, test_abs):

  try:

    #@markdown 1. Tokenization
    sentences = nltk.sent_tokenize(abstract)

    #@markdown 2. Normalization
    norm_sentences = normalize_corpus(sentences)

    #@markdown 3. Get GloVe sentence embeddings
    sentence_vectors = []
    for sentence in norm_sentences:
      tokens = sentence.split()
      if tokens:
        # Out-of-vocabulary words contribute a zero vector; the +0.001 keeps
        # the original smoothing of the average.
        v = sum([word_embeddings.get(w, np.zeros((embedding_dim,))) for w in tokens])/(len(tokens)+0.001)
      else:
        # A sentence emptied by normalization used to yield the scalar 0.0
        # (sum of an empty list), which failed on .reshape and made the whole
        # abstract fall into the except branch. Use a zero vector instead.
        v = np.zeros((embedding_dim,))
      sentence_vectors.append(v)

    #@markdown 4. Calculate similarity matrix
    # One call on the stacked vectors gives all pairwise cosine similarities;
    # the diagonal is zeroed because self-similarity is excluded from the graph.
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    np.fill_diagonal(sim_mat, 0.0)
    sim_mat = np.round(sim_mat, 3)

    #@markdown 5. PageRank score
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    #@markdown 6. Get Summary (top 4 sentences)
    ranked_sentences = sorted(((scores[i], i) for i in range(len(sentences))), reverse=True)
    # Slicing keeps abstracts shorter than num_sentences; the selection is
    # re-sorted by position so the summary follows the original order.
    arranged_sentences = sorted(ranked_sentences[:num_sentences], key=lambda x: x[1])
    pred = " ".join([sentences[x[1]] for x in arranged_sentences])

    #@markdown 7. Calculate Rouge
    r = evaluate_summary(gt, pred)
    rouge_glove.append(r)
  except Exception:
    # Best effort: an abstract that cannot be processed is recorded as an
    # empty row, which the aggregation cells filter out by length.
    rouge_glove.append([])

In [None]:
# First score of every complete result row (failed abstracts are empty rows).
avg = [row[0] for row in rouge_glove if len(row) > 3]

In [None]:
# Mean of the per-abstract scores gathered in `avg`.
np.mean(avg)

0.4567982456140351

### TF-IDF Vectorization

In [None]:
# Extractive summarization with TF-IDF sentence vectors + PageRank.
# For every test abstract: split into sentences, vectorize them with TF-IDF,
# rank sentences by PageRank over the dot-product similarity graph, keep the
# top-ranked ones in document order and score the result with ROUGE.

# Number of sentences kept in each summary.
num_sentences = 4
# Loop-invariant, so built once instead of once per abstract.
normalize_corpus = np.vectorize(normalize_document)

rouge_tfidf = []
for gt, abstract in zip(test_sum, test_abs):
  try:
    #@markdown 1. Tokenization
    sentences = nltk.sent_tokenize(abstract)

    #@markdown 2. Normalization
    norm_sentences = normalize_corpus(sentences)

    #@markdown 3. Vectorization
    tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
    dt_matrix = tv.fit_transform(norm_sentences)
    dt_matrix = dt_matrix.toarray()

    #@markdown 4. Calculate similarity matrix
    similarity_matrix = np.matmul(dt_matrix, dt_matrix.T)

    #@markdown 5. PageRank score
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(((score, index) for index, score in scores.items()), reverse=True)

    # Slice instead of indexing range(num_sentences): abstracts with fewer
    # than num_sentences sentences used to raise IndexError and were silently
    # dropped, whereas the GloVe loop keeps them.
    top_sentence_indices = sorted(index for _, index in ranked_sentences[:num_sentences])

    pred = " ".join(sentences[index] for index in top_sentence_indices)
    r = evaluate_summary(gt, pred)
    rouge_tfidf.append(r)
  except Exception:
    # Best effort: an abstract that cannot be processed is recorded as an
    # empty row, which the aggregation cells filter out by length.
    rouge_tfidf.append([])

In [None]:
# First score of every non-empty result row (failed abstracts are empty rows).
avg = [row[0] for row in rouge_tfidf if len(row) > 1]

In [None]:
# Mean of the per-abstract scores gathered in `avg`.
np.mean(avg)

0.45531611754229745

# Abstractive Summarization

### Fine-Tuning BART

In [None]:
# NOTE(review): RETRAIN is not referenced by any other cell visible in this notebook.
RETRAIN = False
# Batch size applied to the training dataset (`train_ds.batch(BATCH_SIZE)`).
BATCH_SIZE = 8
# Number of epochs passed to `bart_lm.fit`.
EPOCHS = 10
# Sequence lengths handed to the BART preprocessor for encoder and decoder inputs.
MAX_ENCODER_SEQUENCE_LENGTH = 512
MAX_DECODER_SEQUENCE_LENGTH = 128
# `max_length` passed to `generate_text` when summarizing the test abstracts.
MAX_GENERATION_LENGTH = 200

In [None]:
# Pair every training abstract with its summary as (abstract, summary) string tensors.
train_ds = tf.data.Dataset.from_tensor_slices((tf.constant(train_abs), tf.constant(train_sum)))

In [None]:
# Build the training pipeline from the raw arrays rather than from the
# previous value of `train_ds`: the cell used to be self-referential, so
# running it twice mapped and batched an already batched dataset.
train_ds = (
  tf.data.Dataset.from_tensor_slices((tf.constant(train_abs), tf.constant(train_sum)))
  .map(
    # Feature names used by the BART seq2seq preprocessor.
    lambda abstract, summary: {"encoder_text": abstract, "decoder_text": summary}
  )
  .batch(BATCH_SIZE)
  .cache()
)

In [None]:
# The preprocessor and the model are both created from the same KerasNLP preset.
bart_preset = "bart_base_en"

# Preprocessor configured with the encoder/decoder sequence lengths from the
# configuration cell.
preprocessor = keras_nlp.models.BartSeq2SeqLMPreprocessor.from_preset(
  bart_preset,
  encoder_sequence_length=MAX_ENCODER_SEQUENCE_LENGTH,
  decoder_sequence_length=MAX_DECODER_SEQUENCE_LENGTH,
)

# Seq2seq language model that uses the preprocessor above for its inputs.
bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset(bart_preset, preprocessor=preprocessor)

bart_lm.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/bart/keras/bart_base_en/2/download/tokenizer.json...
100%|██████████| 448/448 [00:00<00:00, 308kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bart/keras/bart_base_en/2/download/assets/tokenizer/merges.txt...
100%|██████████| 446k/446k [00:00<00:00, 1.86MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bart/keras/bart_base_en/2/download/assets/tokenizer/vocabulary.json...
100%|██████████| 0.99M/0.99M [00:00<00:00, 3.58MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bart/keras/bart_base_en/2/download/config.json...
100%|██████████| 483/483 [00:00<00:00, 246kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/bart/keras/bart_base_en/2/download/model.weights.h5...
100%|██████████| 532M/532M [00:13<00:00, 41.8MB/s]
  return id(getattr(self, attr)) not in self._functional_layer_ids
  return id(getattr(self, attr)) not in self._functional_layer_ids


In [None]:
# AdamW with gradient clipping by global norm.
optimizer = keras.optimizers.AdamW(
  learning_rate=5e-5,
  weight_decay=0.01,
  epsilon=1e-6,
  global_clipnorm=1.0
)

# Exclude bias and layer-norm (gamma/beta) variables from weight decay.
# All names go in one call: repeated calls to exclude_from_weight_decay
# appear to replace the previous name list rather than extend it, which
# would leave only "beta" excluded. A single call is correct either way.
optimizer.exclude_from_weight_decay(var_names=["bias", "gamma", "beta"])

# The model outputs logits, hence from_logits=True.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bart_lm.compile(
  optimizer=optimizer,
  loss=loss,
  weighted_metrics=["accuracy"],
)

In [None]:
# Fine-tune BART on the batched (abstract, summary) pairs for EPOCHS epochs.
# NOTE(review): training always runs; the RETRAIN flag is not consulted here.
bart_lm.fit(train_ds, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7bbecda953c0>

In [None]:
def generate_text(model, input_text, max_length=200, print_time_taken=False):
  """Generate text with `model` and optionally report the elapsed time.

  Args:
    model: object exposing ``generate(input_text, max_length=...)``.
    input_text: input forwarded unchanged to ``model.generate``.
    max_length: maximum generation length forwarded to ``model.generate``.
    print_time_taken: when True, print the wall-clock time of the call.

  Returns:
    Whatever ``model.generate`` returns.
  """
  start = time.perf_counter()
  output = model.generate(input_text, max_length=max_length)
  elapsed = time.perf_counter() - start
  # Honor the flag: the timing used to be printed unconditionally.
  if print_time_taken:
    print(f"Total Time Elapsed: {elapsed:.2f}s")
  return output

In [None]:
# Dataset of the held-out abstracts (inputs only) used for generation.
test_ds = tf.data.Dataset.from_tensor_slices(tf.constant(test_abs))

In [None]:
# Presumably a warm-up call, so that one-off setup cost is not part of the
# timed run below -- the result is discarded.
_ = generate_text(bart_lm, "sample text", max_length=MAX_GENERATION_LENGTH)

# Summarize every test abstract. The dataset is batched with BATCH_SIZE (the
# hard-coded 8 had the same value) and the identity map was dropped as a no-op.
generated_summaries = generate_text(
  bart_lm,
  test_ds.batch(BATCH_SIZE),
  max_length=MAX_GENERATION_LENGTH,
  print_time_taken=True,
)

Total Time Elapsed: 25.56s
Total Time Elapsed: 273.50s


In [None]:
# Score every generated summary against its reference. A pair that cannot be
# scored is stored as an empty row so positions stay aligned with the test set.
rouge_ft_bart = []
for generated, reference in zip(generated_summaries, test_sum):
  try:
    row = evaluate_summary(reference, generated)
  except Exception:
    row = []
  rouge_ft_bart.append(row)

In [None]:
# First score of every non-empty result row (failed pairs are empty rows).
avg = [row[0] for row in rouge_ft_bart if len(row) > 1]

In [None]:
# Mean of the per-abstract scores gathered in `avg`.
np.mean(avg)

0.07140243902439024

### Pre-Trained BART

In [None]:
# Hugging Face summarization pipeline backed by the distilbart-cnn-12-6 checkpoint.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
# Set EVALUATE to True to recompute the scores with the pre-trained
# summarizer; otherwise previously saved scores are loaded from Drive.
EVALUATE = False
if EVALUATE:
  rouge_pt_bart = []
  print(f"Start time --> {datetime.now().time()}")
  for idx, abstract in enumerate(test_abs):
    try:
      # Cap the summary at half the number of space-separated words.
      max_len = int(len(abstract.split(" ")) / 2)
      # Named `summary_output`, not `sum`: rebinding the builtin at notebook
      # level broke the GloVe cell (which calls sum()) when re-run afterwards.
      summary_output = summarizer(abstract, max_length=max_len, min_length=10, do_sample=False)
      pred = summary_output[0].get("summary_text")
      r = evaluate_summary(test_sum[idx], pred)
      rouge_pt_bart.append(r)
      if idx % 50 == 0:
        print(f"Time: {datetime.now().time()} - Completed {round(idx/len(test_abs)*100, 2)}%")
    except Exception:
      # Best effort: failures are stored as empty rows and filtered out later.
      rouge_pt_bart.append([])
  # NOTE(review): results are saved to the working directory but loaded from
  # Drive below -- the file must be copied there by hand.
  with open('rouge_pt_bart.pkl', 'wb') as f:
    pickle.dump(rouge_pt_bart, f)
else:
  # NOTE(review): pickle.load executes arbitrary code from the file; only load
  # a pickle you produced yourself. The cached scores also belong to the
  # train/test split that was in effect when they were computed.
  with open('drive/MyDrive/Progetto_Text_Mining/other/rouge_pt_bart.pkl', 'rb') as f:
    rouge_pt_bart = pickle.load(f)

In [None]:
# First score of every non-empty result row (failed abstracts are empty rows).
avg = [row[0] for row in rouge_pt_bart if len(row) > 1]

In [None]:
# Mean of the per-abstract scores gathered in `avg`.
np.mean(avg)

0.40572299651567945

# Performance Comparison

In [None]:
# Side-by-side report: for each experiment, average every score column over
# the result rows that contain it (failed items are stored as empty rows).
experiments = ["Ext Glove", "Ext TF-IDF", "Abs Fine-Tuned BART", "Abs Pre-Trained BART"]
results = [rouge_glove, rouge_tfidf, rouge_ft_bart, rouge_pt_bart]

# Column order matches the rows produced by evaluate_summary.
metric_labels = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "ROUGE Average"]

for experiment_name, rows in zip(experiments, results):
  print(f"{experiment_name} ------------------------------------")
  per_metric = [
    [row[position] for row in rows if len(row) > position]
    for position in range(len(metric_labels))
  ]
  for label, values in zip(metric_labels, per_metric):
    print(f"{label} --> {np.mean(values)}")

Ext Glove ------------------------------------
ROUGE-1 --> 0.4567982456140351
ROUGE-2 --> 0.24134210526315789
ROUGE-L --> 0.42937719298245614
ROUGE Average --> 0.37584298245614034
Ext TF-IDF ------------------------------------
ROUGE-1 --> 0.45531611754229745
ROUGE-2 --> 0.24707925200356187
ROUGE-L --> 0.43005342831700805
ROUGE Average --> 0.3774853072128228
Abs Fine-Tuned BART ------------------------------------
ROUGE-1 --> 0.07140243902439024
ROUGE-2 --> 0.00593205574912892
ROUGE-L --> 0.0682404181184669
ROUGE Average --> 0.048505226480836244
Abs Pre-Trained BART ------------------------------------
ROUGE-1 --> 0.40572299651567945
ROUGE-2 --> 0.20483449477351917
ROUGE-L --> 0.3793815331010453
ROUGE Average --> 0.3299825783972125
