In [11]:

! git clone https://github.com/huggingface/transformers.git

fatal: destination path 'transformers' already exists and is not an empty directory.


In [None]:
! cd transformers && git reset --hard 143738214

In [2]:
! python --version

Python 3.10.13


In [3]:
import os
import pandas as pd
import re

from dotenv import load_dotenv
load_dotenv()

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration


# Initialize our model
tokenizer = T5Tokenizer.from_pretrained("orzhan/t5-long-extract")
model = T5ForConditionalGeneration.from_pretrained("orzhan/t5-long-extract")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


In [14]:
def read_file(filenameWithPath:str)->str:
  '''
  This function reads the content of a file.

  Parameters:
  filenameWithPath : str
    Absolute Path of the file which is to be read.

  Returns:
  text : str
    The content of the read file. 

  '''
  file = open(filenameWithPath,"r")
  text = file.read().strip()
  text = re.sub("\n"," ",text)
  return text

In [15]:
def preprocessing(text):
  '''
  The preprocessing function which parses raw text and manages to clean the raw text. This module removes websites/URLs, Email IDs, redundant spaces, extra periods, dashes and commas. It also replace some special unicode apostrope, qoutation symbols with ' and  ". This module also ads extra space after period, question mark and exclamation mark.

  Parameters:
    text : str
      The raw text to preprocess.

  Returns:
    text : str
      The preprocessed text. 
  '''
  # text = text.lower()
  text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", " ", text)
  text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', " ",text)
  text = re.sub("\u2019", '\'', text)
  text = re.sub("\u2018", '\'', text)
  text = re.sub("\u201C", '\"', text)
  text = re.sub("\u201D", '\"', text)
  # text = re.sub("[$]", "dollar ", text)
  # text = re.sub("[£]", "pound ", text)
  # text = re.sub("[%]", " percent", text)
  # text = re.sub(r"[^a-zA-Z0-9?!.,’-]", ' ', text)
  text = re.sub(r"([?!])", r" \1 ", text)
  text = re.sub(r',+', ',', text)
  text = re.sub(r'[-]+', ' ', text)
  text = re.sub(r'([a-z])(?=[.,])', r'\1 ', text)
  text = re.sub(r'\.{2,}', '.', text)
  text = re.sub(r"\s+", " ", text)
  return text

In [16]:
def createSentences(text):
    '''
    This function uses the nltk library to sentence tokenize a given corpus or document.

    Parameters:
    text : 
        The corpus or document string to split into sentences.

    Returns: 
        A sentence tokenized copy of *text* using NLTK's default tokenizer. A list of sentences.

    '''
    return nltk.tokenize.sent_tokenize(text)

In [17]:
def createChunks(sentences):
  '''
  This module splits the entire corpus into chunks. A chunk is a list of sentences from the corpus. This module add a sentence to a chunk if the tokenized sentence doesn't exceed the models' single sentence length. Else it adds the senence to next chunk.

  Parameters:
  sentences : list
    A list of containing sentences.

  Returns:
  chunks : list
    A list of containing the chunks of the document.
  '''
  length = 0
  chunk = ""
  chunks = []
  count = -1
  for sentence in sentences:
    # print("Sent ", sentence)
    count += 1
    combined_length = len(tokenizer.tokenize(sentence)) + length # add the no. of sentence tokens to the length counter

    if combined_length  < tokenizer.max_len_single_sentence - 3 : # if it doesn't exceed
      chunk += sentence + " " # add the sentence to the chunk
      length = combined_length # update the length counter
      # print("length",length)
      # if it is the last sentence
      if count == len(sentences) - 1:
        chunks.append(chunk.strip()) # save the chunk
      # print("if", chunks)
    else:
      chunks.append(chunk.strip()) # save the chunk
      # print("else", chunks)

      # reset
      length = 0
      chunk = ""

      # take care of the overflow sentence
      chunk += sentence + " "
      length = len(tokenizer.tokenize(sentence))
  return chunks

In [18]:
def generate_chunk_summary(chunk):
    '''
    This module generates the summary for a given document. Prior to passing the document to model, it appends 'summarize: ' prefix to document to let the model know that the task is summarization. It restricts the generated summary to 100 tokens.

    Parameters:
    chunk : str
        A chunk is group of sentences such that the tokenized lenght of chunk doesn't exceed the token limit of model.

    Returns:
    summary : str
        It is the summary which model generates for the given text chunk. 
    '''
    input_ids = tokenizer.encode("summarize: " + chunk, return_tensors="pt",max_length=tokenizer.model_max_length, truncation=True)
    summary_ids = model.generate(input_ids,max_length=100)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [19]:
def generate_summary(fileChunks):
    '''
    This function returns the complete summary for a chunked document or corpus of text.

    Parameters:
    fileChunks: list
        A list of chunk where chunk is group of sentences in a corpus such that tokenized length of chunk doesn't exceed the model limit.

    Returns:
        Concatenated Summary as *str* for every chunk. 
    '''
    summaries = []

    for chunk in fileChunks:
        summ = generate_chunk_summary(chunk)
        summaries.append(summ)

    return " ".join(summaries)

In [20]:
def cap_summary(generated_summary:str)->str:
    '''
    This function caps the generated summary to 1000 words. 

    Parameters:
    generated_summary : str
        System generated summary.

    Returns:
        Returns Summary as str after capping.
    '''
    sentences = nltk.sent_tokenize(generated_summary)
    new_summary=[]
    target_word=1250
    words_added=0
    for sentence in sentences:
        words= nltk.word_tokenize(sentence)
        if words_added + len(words) <= target_word:
            words_added+=len(words)
            new_summary.append(sentence)
        else:
            break

    return "".join(new_summary)

In [19]:
def summarize():
    '''
    This function generates the summary for all the documents in *data-dir* . It generates the summary or the document and writes the summary in *generated_summary* folder. The convention for written file is *filename_sysSumm.txt*
    '''
    data_dir = os.getenv('VAL_AR')
    for file in os.listdir(data_dir):
        
        fileContent = read_file(os.path.join(data_dir,file))
        filePreprocessed = preprocessing(fileContent)
        fileSentences = createSentences(filePreprocessed)
        fileChunks = createChunks(fileSentences)
        summary = generate_summary(fileChunks)
        summary = cap_summary(summary)
        filename = file[:-4]

        with open(f"./{os.getenv('TARGET_DIR')}/{filename}_sysSumm.txt","w+") as f:
            f.write(summary)

        # break
        


summarize()

In [22]:
# Run on single file


valARDir= os.getenv('VAL_AR') # Init Validation annual report directory
valGSDir = os.getenv('VAL_GS') # init validation gold summary diredtory

testFile = os.path.join(valARDir,os.listdir(valARDir)[0]) # specifying which file to read, here 1st file
testFileName = os.path.basename(testFile)[:-4] # get just the filename

# get the gold summary for the previous file
# Of all the gold summary corressponding to  above file, we pick the first one.
gs = [file for file in os.listdir(valGSDir) if file.__contains__(testFileName)][0]
gs = os.path.join(valGSDir,gs)

fileContent = read_file(testFile) # read the annual report
gs = read_file(gs) # read the gold summary
filePreprocessed = preprocessing(fileContent)
fileSentences = createSentences(filePreprocessed)
fileChunks = createChunks(fileSentences)
summary = generate_summary(fileChunks)
summary=cap_summary(summary)
summary # generated summary for the read annual report.

'25695 19 March 2018 3:29 PM Proof 7 25695 19 March 2018 3:29 PM Proof 7 annual report 2017 Bodycote plc annual report 2017 | Stock code: BOY 25695 19 March 2018 3:29 PM Proof 7 At a glance Our structure The Group operates 187 facilities around the world which are organised into customer focused division Throughout this report you will see illustrations which link our business and strategy: Strategy & Core Values Key Performance Indicators £ Aerospace, Defence & Energy Rapid growth countries Customer service Automotive & General Industrial Technology Core values Return on capital employed Headline earnings per share Accident frequency Return on sales Headline operating cash flow Carbon footprint Headline operating profit and headline profit before taxation exclude amortisation of acquired intangibles of £4.5m (2016: £4.5m) and acquisition costs of £nil (2016: £0.6m).2.Return on sales is defined as headline operating profit as a percentage of revenue.This Strategic report has been prepa

In [26]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2' ,'rougeL'], use_stemmer=True)
scores = scorer.score(gs,summary)
# print(scores)

# Print rouge metrics
print("Rouge 1 : ")
print(f"\tPrecision : {scores['rouge1'][0]}")
print(f"\tRecall : {scores['rouge1'][1]}")
print(f"\tF1 Score : {scores['rouge1'][2]}")

print("Rouge 2 : ")
print(f"\tPrecision : {scores['rouge2'][0]}")
print(f"\tRecall : {scores['rouge2'][1]}")
print(f"\tF1 Score : {scores['rouge2'][2]}")

print("Rouge L : ")
print(f"\tPrecision : {scores['rougeL'][0]}")
print(f"\tRecall : {scores['rougeL'][1]}")
print(f"\tF1 Score : {scores['rougeL'][2]}")

{'rouge1': Score(precision=0.6546572934973638, recall=0.47634271099744246, fmeasure=0.5514433752775721), 'rouge2': Score(precision=0.31838170624450307, recall=0.23160588611644273, fmeasure=0.2681481481481481), 'rougeL': Score(precision=0.26362038664323373, recall=0.1918158567774936, fmeasure=0.2220577350111029)}
Rouge 1 : 
	Precision : 0.6546572934973638
	Recall : 0.47634271099744246
	F1 Score : 0.5514433752775721
Rouge 2 : 
	Precision : 0.31838170624450307
	Recall : 0.23160588611644273
	F1 Score : 0.2681481481481481
Rouge L : 
	Precision : 0.26362038664323373
	Recall : 0.1918158567774936
	F1 Score : 0.2220577350111029
