<a href="https://colab.research.google.com/github/sarim711/BART_Kmeans_TextSummarisation/blob/main/Text_Summarisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating medical text

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
tqdm.pandas()


In [None]:
# Load only the "text" column of the dataset.
df = pd.read_csv(
    "mg_small.csv",
    usecols=["text"],
)

In [None]:
# Never truncate cell contents, but print at most 10 rows of any frame.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 10
df.head(2)

In [None]:
def clean(text):
    '''Reduce raw text to letters, digits, periods and single spaces.

    Steps, in order:
      1. drop every character that is not a letter, digit, period or whitespace;
      2. collapse each run of whitespace into one space;
      3. insert a space after any period that is directly followed by a
         non-space character, so sentences are separated by ". ";
      4. strip leading/trailing whitespace.

    Non-string input (e.g. the NaN pandas uses for an empty CSV cell, or None)
    yields an empty string instead of raising inside ``re.sub``.

    NOTE: step 3 also applies to periods inside numbers, so "3.5" becomes "3. 5".
    '''
    if not isinstance(text, str):
        return ""

    # Keep only letters, digits, periods and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\.\s]', '', text)

    # Collapse whitespace runs (including newlines and tabs) to a single space.
    text = re.sub(r'\s+', ' ', text)

    # Guarantee "period + space" between sentences.
    text = re.sub(r'\.(?=[^\s])', '. ', text)

    return text.strip()

# Preview the cleaning on the first document.
print(clean(df["text"].iloc[0]))

# Use item assignment to create the column. `df.cleaned = ...` would only set
# a plain Python attribute on this DataFrame object, which is not a real
# column and disappears as soon as the frame is copied (e.g. by df.sample).
df["cleaned"] = df["text"].apply(clean)

In [None]:
from collections import Counter
def calculate_word_frequency(document):
    '''Map every word of the document to its relative frequency.

    The document is lower-cased and tokenised on word boundaries; each word's
    count is divided by the total number of tokens. A document without any
    word yields an empty dictionary.
    '''
    tokens = re.findall(r'\b\w+\b', document.lower())
    tallies = Counter(tokens)

    # Total token count, used as the normalising denominator.
    n_tokens = sum(tallies.values())

    frequencies = {}
    for token, tally in tallies.items():
        frequencies[token] = tally / n_tokens
    return frequencies

# Word frequencies of the first cleaned document (echoed as the cell output).
word_freq = calculate_word_frequency(
    df.cleaned.iloc[0]
)
word_freq

In [None]:
def calculate_sentence_scores(document, word_freq):
    '''Score each sentence of the document by the frequency of its words.

    document  -- text to split into sentences.
    word_freq -- mapping word -> frequency; unknown words count as 0.

    Returns a list with one score per sentence, in document order. A sentence's
    score is the sum of its words' frequencies; sentences with more than five
    words are additionally divided by their word count.
    '''
    # Use exactly the same sentence-boundary pattern as get_top_k_sentences.
    # The scores returned here are zipped position-by-position with the
    # sentences produced there, so the two splits must agree (e.g. neither
    # may break after an abbreviation such as "Dr.").
    sentences = re.split(r'(?<!\w\.\w\.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', document)

    scores = []
    for sentence in sentences:
        words = re.findall(r'\b\w+\b', sentence.lower())
        sentence_score = sum(word_freq.get(word, 0) for word in words)
        # NOTE(review): sentences of five words or fewer keep their raw sum, so
        # they are not on the same scale as longer ones - confirm this is intended.
        if len(words) > 5:
            sentence_score /= len(words)  # Normalize by sentence length
        scores.append(sentence_score)

    return scores

# Score the sentences of the first cleaned document and show the first three.
sentence_scores = calculate_sentence_scores(
    df.cleaned.iloc[0],
    word_freq,
)
sentence_scores[:3]

In [None]:
def get_top_k_sentences(document, sentence_scores, k):
    '''Return the k highest-scoring sentences of the document.

    document        -- text to split into sentences.
    sentence_scores -- one score per sentence, in document order.
    k               -- number of sentences to return.

    The result is ordered by descending score; sentences with equal scores keep
    their document order. If there are fewer than k sentences, all are returned.
    '''
    # Split on whitespace that follows "." or "?", except after patterns like
    # "e.g." or a two-letter capitalised abbreviation such as "Dr.".
    sentences = re.split(r'(?<!\w\.\w\.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', document)

    # Pair each sentence with its score and rank by score, best first.
    ranked_sentences = sorted(
        zip(sentences, sentence_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return [sentence for sentence, _ in ranked_sentences[:k]]
# Three highest-scoring sentences of the first cleaned document.
top_k_sentences = get_top_k_sentences(
    df.cleaned.iloc[0],
    sentence_scores,
    3,
)
top_k_sentences

In [None]:

def create_summary(document, top_k):
    '''Build a frequency-based extractive summary of a raw document.

    The document is cleaned, its sentences are scored by word frequency, and
    the top_k best-scoring sentences are joined into one string (best first).
    '''
    cleaned = clean(document)
    word_freq = calculate_word_frequency(cleaned)
    sentence_scores = calculate_sentence_scores(cleaned, word_freq)

    # Select sentences from the same cleaned text the scores were computed on;
    # splitting the raw document instead can yield a different sentence list
    # and pair sentences with the wrong scores.
    top_k_sentences = get_top_k_sentences(cleaned, sentence_scores, top_k)

    # The selected sentences keep their own trailing period, so join with a
    # plain space (joining with ". " would produce "..").
    return " ".join(top_k_sentences)

# Build a three-sentence frequency-based summary for every raw document.
df["statistical_summary"] = df["text"].progress_apply(
    lambda raw_text: create_summary(raw_text, 3)
)

In [None]:
# Show complete summaries (no column truncation), at most 10 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 10

df["statistical_summary"]

## K-means

We now graduate to using neural networks to improve our summarizations. The way this is done is by the following procedure:

1) Embed each sentence using a biomedical BERT model (use whichever model you prefer; I enjoy PubMedBERT). Unfortunately, we cannot use the CLS token to create a general sentence embedding because that requires task-specific downstream training on a labeled dataset. Therefore, we take the average over the tokens in the sentence to create an embedding.

2) Cluster the sentence embeddings using [K-Means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html).

3) For each cluster, choose the sentence that is closest to that cluster's centroid. Order the sentences according to their in-text appearance.

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM,AutoModel
import torch

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# PubMedBERT tokenizer and encoder; the encoder is moved to the chosen device.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
)
model = AutoModel.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
).to(device)

def embed_using_bert(bert_model, tokenizer, document):
    '''Embed every sentence of the document by mean-pooling its token vectors.

    bert_model -- encoder whose output exposes ``last_hidden_state``.
    tokenizer  -- tokenizer matching ``bert_model``.
    document   -- text; sentences are obtained with ``document.split(". ")``.

    Returns ``(sentences, embeddings)`` where ``embeddings`` is a NumPy array
    with one row per sentence. Each row is the mean over all token vectors of
    the last layer (including any special tokens the tokenizer adds).
    '''
    # Take the device from the model itself instead of the notebook-level
    # `device` variable, which is reassigned further down in this notebook.
    model_device = next(bert_model.parameters()).device

    sentences = document.split(". ")
    sentence_embeddings = []

    for sentence in sentences:
        # Tokenize one sentence (capped at 512 tokens) on the model's device.
        tokens = tokenizer(
            sentence, return_tensors="pt", truncation=True, max_length=512
        ).to(model_device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = bert_model(**tokens)

        # Drop the leading batch dimension of size 1 (one sentence per call).
        token_embeddings = outputs.last_hidden_state.squeeze(0)

        # Mean over the token axis; move to CPU before converting to NumPy.
        sentence_embeddings.append(token_embeddings.mean(dim=0).cpu().numpy())

    return sentences, np.array(sentence_embeddings)


In [None]:
from sklearn.cluster import KMeans

def cluster_sentence_embeddings(sentence_embeddings, sentences, k_clusters):
    '''Pick one representative sentence per K-Means cluster and join them.

    Duplicate embedding rows are removed first, and the number of clusters is
    capped at the number of distinct rows. For every cluster the sentence
    whose embedding is nearest to the centroid is kept; the kept sentences
    are returned in document order, joined with ". ". Returns "" when there is
    nothing to cluster.
    '''
    # Distinct embedding rows plus the position of their first occurrence.
    distinct_vectors, first_positions = np.unique(
        sentence_embeddings, axis=0, return_index=True
    )
    distinct_sentences = [sentences[position] for position in first_positions]

    # Never ask for more clusters than there are distinct rows.
    k_clusters = min(k_clusters, len(distinct_vectors))
    if k_clusters == 0:
        return ""

    fitted = KMeans(n_clusters=k_clusters, random_state=42).fit(distinct_vectors)
    labels = fitted.labels_
    centroids = fitted.cluster_centers_

    picked = []
    for label in range(k_clusters):
        members = np.where(labels == label)[0]
        if len(members) == 0:
            continue  # nothing was assigned to this cluster

        # Euclidean distance of each member to its centroid.
        distances = np.linalg.norm(
            distinct_vectors[members] - centroids[label], axis=1
        )
        winner = members[np.argmin(distances)]
        picked.append((first_positions[winner], distinct_sentences[winner]))

    # Restore the order in which the sentences appear in the document.
    picked.sort(key=lambda pair: pair[0])
    return ". ".join(sentence for _, sentence in picked)


In [None]:
def summarize_document_with_kmeans(document, k):
    '''Summarise a document with up to k sentences chosen via K-Means.

    Sentences are embedded with the notebook-level model and tokenizer. When
    the document has at least k sentences they are clustered and one
    representative per cluster is returned; otherwise the document's own
    sentences are returned (the document itself if it has fewer than two).
    '''
    sentences, embeddings = embed_using_bert(model, tokenizer, document)

    # Enough sentences: cluster and keep one representative per cluster.
    if len(sentences) >= k:
        return cluster_sentence_embeddings(embeddings, sentences, k)

    # Too few sentences to cluster.
    if len(sentences) < 2:
        return document
    return ". ".join(sentences[:k])

# Register progress_apply on pandas objects.
tqdm.pandas()

# Clean every document, then build a K-Means summary with k=5 for each.
df['cleaned'] = df['text'].apply(clean)
df['neural_summary'] = df['cleaned'].progress_apply(
    lambda doc: summarize_document_with_kmeans(doc, k=5)
)

In [None]:
# Show complete summaries (no column truncation), at most 10 rows.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 10
df["neural_summary"]

In [None]:
# Keep at most 10,000 documents: GPU time on Colab is limited, and a sample of
# this size is still enough to compare summary quality while varying k.
# min(...) avoids the ValueError that df.sample raises when asked for more rows
# than the frame contains.
df = df.sample(n=min(10000, len(df)), random_state=42).reset_index(drop=True)

# Recompute the cleaned text only if the column is missing.
if 'cleaned' not in df.columns:
    df['cleaned'] = df['text'].apply(clean)



In [None]:
# Re-run the K-Means summariser with k=3 (overwrites the previous column).
df['neural_summary'] = df['cleaned'].progress_apply(
    lambda doc: summarize_document_with_kmeans(doc, k=3)
)
df['neural_summary']

In [None]:
# Re-run the K-Means summariser with k=7 (overwrites the previous column).
df['neural_summary'] = df['cleaned'].progress_apply(
    lambda doc: summarize_document_with_kmeans(doc, k=7)
)
df['neural_summary']

In [None]:
#Changing token embeddings to max
def embed_using_bert(bert_model, tokenizer, document):
    '''Embed every sentence of the document by max-pooling its token vectors.

    bert_model -- encoder whose output exposes ``last_hidden_state``.
    tokenizer  -- tokenizer matching ``bert_model``.
    document   -- text; sentences are obtained with ``document.split(". ")``.

    Returns ``(sentences, embeddings)`` where ``embeddings`` is a NumPy array
    with one row per sentence. Each row is the element-wise maximum over all
    token vectors of the last layer (including any special tokens the
    tokenizer adds).
    '''
    # Take the device from the model itself instead of the notebook-level
    # `device` variable, which is reassigned further down in this notebook.
    model_device = next(bert_model.parameters()).device

    sentences = document.split(". ")
    sentence_embeddings = []

    for sentence in sentences:
        # Tokenize one sentence (capped at 512 tokens) on the model's device.
        tokens = tokenizer(
            sentence, return_tensors="pt", truncation=True, max_length=512
        ).to(model_device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = bert_model(**tokens)

        # Drop the leading batch dimension of size 1 (one sentence per call).
        token_embeddings = outputs.last_hidden_state.squeeze(0)

        # Max over the token axis; move to CPU before converting to NumPy.
        sentence_embeddings.append(
            token_embeddings.max(dim=0).values.cpu().numpy()
        )

    return sentences, np.array(sentence_embeddings)

# Rebuild the k=5 summaries using the embedding function defined just above.
df['neural_summary'] = df['cleaned'].progress_apply(
    lambda doc: summarize_document_with_kmeans(doc, k=5)
)

df['neural_summary']

## BART

We will now use dedicated sequence-to-sequence neural networks. These models learn to map input sequences of text to output sequences of a different length. They do this with large labeled summary datasets. It would be great to train (or even fine-tune) our own biomedical summarization model, but unfortunately this requires datasets which do not exist (to my knowledge). Thus, we will use [BART](https://huggingface.co/facebook/bart-large), a general-purpose encoder-decoder model from Meta. This is the most opaque but potentially best-performing summarization model, as it deals with actual text generation. Unlike the previous models, which pick and choose pre-existing sentences, sequence-to-sequence models generate a completely new summary. Thus, there is only one step: model evaluation.

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

import torch
# Always bind a real torch.device. The previous CPU fallback of `None` is not
# a valid target for the tokenizer output's `.to(...)`, and a bare integer
# index on GPU was inconsistent with how the device is chosen earlier on.
if torch.cuda.is_available():
    print('GPU available!')
    device = torch.device('cuda')
else:
    print('GPU unavailable - CPU will be used for all calculations')
    device = torch.device('cpu')

# Tokenizer and model loading for bart-large-cnn.
# NOTE: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)


#for multi-line output, thanks to https://stackoverflow.com/questions/58890109/line-wrapping-in-collaboratory-google-results
from IPython.display import HTML, display

def set_css():
    '''Display a <style> block that makes <pre> output wrap long lines.'''
    wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
    display(wrap_rule)


# Re-apply the style before each cell is run.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Encode one cleaned document. Truncate to 1024 tokens (the same cap used by
# bart_summarize) so an over-long document cannot overflow the model's input.
inputs = tokenizer(
    [df.cleaned.iloc[10]],
    return_tensors='pt',
    truncation=True,
    max_length=1024,
).to(device)

# Generate the summary with beam search, passing the attention mask explicitly.
summary_ids = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    num_beams=4,
    max_length=150,
    early_stopping=True,
)

# Decode and print the summary, followed by the source text for comparison.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)
print("\n===\n")
print(df.cleaned.iloc[10])

In [None]:
# Keep at most 2,000 cleaned texts: summarising the entire dataset was expected
# to take about 5 hours, longer than the available GPU session.
# min(...) avoids the ValueError that df.sample raises when asked for more rows
# than the frame contains.
df = df.sample(n=min(2000, len(df)), random_state=42).reset_index(drop=True)

# Recompute the cleaned text only if the column is missing.
if 'cleaned' not in df.columns:
    df['cleaned'] = df['text'].apply(clean)


In [None]:

# Function to summarize using BART
def bart_summarize(text):
    '''Summarise one text with the notebook-level BART model.

    Returns "" for empty, whitespace-only or non-string input (for example the
    NaN pandas stores for a missing value). Otherwise the text is truncated to
    1024 tokens and summarised with 4-beam search, at most 150 output tokens.
    '''
    if not isinstance(text, str) or not text.strip():
        return ""

    # One sequence per call, so no padding is needed. Padding every input to
    # 1024 tokens made each call pay for a full-length sequence.
    inputs = tokenizer(
        [text],
        return_tensors='pt',
        truncation=True,  # Truncate if sequence is too long
        max_length=1024,  # Maximum input length in tokens
    ).to(device)

    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=150,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarise every cleaned document of the reduced dataset with BART.
df["bart_summary"] = df["cleaned"].progress_apply(bart_summarize)


In [None]:
# Echo the BART summaries as the cell output.
df["bart_summary"]

In [None]:
# Function to summarize with custom num_beams and max_length
def bart_summarize_with_params(text, num_beams=4, max_length=150):
    '''Summarise one text with BART using the given generation settings.

    text       -- document to summarise.
    num_beams  -- beam width passed to ``model.generate``.
    max_length -- maximum length of the generated summary, in tokens.

    Returns "" for empty, whitespace-only or non-string input, matching
    bart_summarize. The input is truncated to 1024 tokens.
    '''
    if not isinstance(text, str) or not text.strip():
        return ""

    inputs = tokenizer(
        [text],
        return_tensors='pt',
        truncation=True,
        max_length=1024,  # input cap; distinct from the summary's max_length
    ).to(device)

    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=num_beams,
        max_length=max_length,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Use one cleaned document as the fixed input for the comparison.
text = df.cleaned.iloc[10]

# Summarise the same text under several (num_beams, max_length) settings.
results = {
    f"num_beams={beams}, max_length={limit}": bart_summarize_with_params(
        text, num_beams=beams, max_length=limit
    )
    for beams, limit in ((2, 100), (4, 150), (4, 200), (6, 200))
}

# Print each setting next to the summary it produced.
for params, summary in results.items():
    print(f"\nParameters: {params}\nSummary: {summary}")
