In [11]:

! git clone https://github.com/huggingface/transformers.git

fatal: destination path 'transformers' already exists and is not an empty directory.


In [None]:
! cd transformers && git reset --hard 143738214

In [2]:
! python --version

Python 3.10.13


In [13]:
import os
import pandas as pd
import re

from dotenv import load_dotenv
load_dotenv()

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

import tensorflow as tf
from transformers import T5Tokenizer, T5ForConditionalGeneration


# Load the tokenizer and the T5 model from the "orzhan/t5-long-extract" checkpoint.
# Both are notebook-level globals: createChunks() uses `tokenizer` to count tokens,
# and generate_chunk_summary() uses `tokenizer` and `model` to produce a summary.
tokenizer = T5Tokenizer.from_pretrained("orzhan/t5-long-extract")
model = T5ForConditionalGeneration.from_pretrained("orzhan/t5-long-extract")

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def read_file(filenameWithPath:str)->str:
  '''
  Read a text file and return its content as a single line.

  Leading and trailing whitespace is stripped and every remaining newline is
  replaced by a single space.

  Parameters:
  filenameWithPath : str
    Path of the file to read.

  Returns:
  text : str
    The content of the file with newlines flattened to spaces.

  Raises:
  OSError
    If the file cannot be opened or read.
  '''
  # The context manager guarantees the handle is closed even if read() fails.
  with open(filenameWithPath, "r") as file:
    text = file.read().strip()
  return text.replace("\n", " ")

In [None]:
def preprocessing(text):
  '''
  Clean raw text before it is split into sentences.

  The following steps are applied in order:
    1. URLs (http://, https:// or www.) are replaced by a space.
    2. E-mail addresses are replaced by a space.
    3. Typographic quotes (U+2018, U+2019, U+201C, U+201D) are replaced by the
       plain ASCII apostrophe and double quote.
    4. Question marks and exclamation marks are surrounded by spaces.
    5. Runs of commas collapse to one comma; runs of dashes become one space.
    6. A space is inserted between a lowercase letter and a directly following
       period or comma.
    7. Runs of two or more periods collapse to a single period.
    8. Runs of whitespace collapse to a single space (the result is not
       stripped, so a leading or trailing space may remain).

  Parameters:
    text : str
      The raw text to preprocess.

  Returns:
    text : str
      The preprocessed text.
  '''
  # Remove URLs.
  text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", " ", text)
  # Remove e-mail addresses. Inside a character class '|' is a literal pipe,
  # not alternation, so the top-level-domain class must be [A-Za-z]; otherwise
  # "user@site.com|next" would swallow the "|next" field as well.
  text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', " ", text)
  # Normalise typographic quotes to their ASCII equivalents.
  text = text.replace("\u2019", "'").replace("\u2018", "'")
  text = text.replace("\u201C", '"').replace("\u201D", '"')
  # Isolate sentence-ending ? and ! as separate tokens.
  text = re.sub(r"([?!])", r" \1 ", text)
  # Collapse repeated commas and turn dash runs into a single space.
  text = re.sub(r',+', ',', text)
  text = re.sub(r'[-]+', ' ', text)
  # Detach a period/comma from the lowercase letter that precedes it.
  text = re.sub(r'([a-z])(?=[.,])', r'\1 ', text)
  # Collapse ellipses and other period runs.
  text = re.sub(r'\.{2,}', '.', text)
  # Finally squeeze all whitespace runs created by the steps above.
  text = re.sub(r"\s+", " ", text)
  return text

In [None]:
def createSentences(text):
    '''
    Split a document into sentences.

    Parameters:
    text : str
        The corpus or document string to split.

    Returns:
        The value returned by nltk.tokenize.sent_tokenize for *text*,
        i.e. the sentences of the document.
    '''
    sentence_list = nltk.tokenize.sent_tokenize(text)
    return sentence_list

In [None]:
def createChunks(sentences):
  '''
  Group consecutive sentences into chunks that fit the model's token budget.

  Sentences are appended to the current chunk while the running token count
  stays strictly below ``tokenizer.max_len_single_sentence - 3``. A sentence
  that would reach or exceed that budget closes the current chunk and starts
  the next one. Every sentence ends up in exactly one chunk, including the
  final one, and no empty chunks are produced. A single sentence that is
  longer than the budget becomes a chunk on its own.

  Uses the notebook-level ``tokenizer`` to count tokens.

  Parameters:
  sentences : list
    A list of sentence strings, in document order.

  Returns:
  chunks : list
    A list of strings; each string is the space-joined sentences of one chunk.
  '''
  # NOTE(review): the "- 3" headroom is presumably for the "summarize: " prefix
  # added later - confirm against the tokenizer.
  token_budget = tokenizer.max_len_single_sentence - 3

  chunks = []
  current_sentences = []
  current_length = 0

  for sentence in sentences:
    # Tokenize once and reuse the count for both the check and the reset.
    sentence_length = len(tokenizer.tokenize(sentence))

    # Close the running chunk when this sentence would overflow it. The
    # `current_sentences` guard avoids emitting an empty chunk when the very
    # first sentence of a chunk is already over budget.
    if current_sentences and current_length + sentence_length >= token_budget:
      chunks.append(" ".join(current_sentences).strip())
      current_sentences = []
      current_length = 0

    current_sentences.append(sentence)
    current_length += sentence_length

  # Flush whatever is left; without this an overflow sentence at the very end
  # of the document would be dropped.
  if current_sentences:
    chunks.append(" ".join(current_sentences).strip())

  return chunks

In [None]:
def generate_chunk_summary(chunk):
    '''
    Produce the model summary for one chunk of text.

    The chunk is prefixed with 'summarize: ', encoded with the notebook-level
    ``tokenizer`` (truncated to ``tokenizer.model_max_length``), passed to
    ``model.generate`` with ``max_length=100``, and the first generated
    sequence is decoded back to text without special tokens.

    Parameters:
    chunk : str
        A group of sentences small enough for the model's input limit.

    Returns:
    summary : str
        The decoded summary generated for the chunk.
    '''
    prompt = "summarize: " + chunk
    encoded_prompt = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=tokenizer.model_max_length,
        truncation=True,
    )
    generated_sequences = model.generate(encoded_prompt, max_length=100)
    return tokenizer.decode(generated_sequences[0], skip_special_tokens=True)

In [None]:
def generate_summary(fileChunks):
    '''
    Summarize every chunk of a document and stitch the results together.

    Parameters:
    fileChunks: list
        The chunks of one document, in order; each chunk is a string that
        generate_chunk_summary can process.

    Returns:
        A single *str*: the per-chunk summaries, in chunk order, joined by
        one space. An empty list yields an empty string.
    '''
    return " ".join(generate_chunk_summary(piece) for piece in fileChunks)

In [None]:
def cap_summary(generated_summary:str, target_word:int=1250)->str:
    '''
    Cap the generated summary at a maximum number of tokens, cutting only at
    sentence boundaries.

    Sentences are kept in order for as long as the running total of
    nltk.word_tokenize tokens stays at or below *target_word*. The first
    sentence that would exceed the cap, and everything after it, is dropped.

    Parameters:
    generated_summary : str
        System generated summary.
    target_word : int
        Maximum number of tokens to keep. Defaults to 1250.

    Returns:
        The capped summary as str, with the kept sentences separated by a
        single space. Returns an empty string if no sentence fits.
    '''
    kept_sentences = []
    words_added = 0
    for sentence in nltk.sent_tokenize(generated_summary):
        sentence_words = len(nltk.word_tokenize(sentence))
        if words_added + sentence_words > target_word:
            break
        words_added += sentence_words
        kept_sentences.append(sentence)

    # The sentence tokenizer drops the whitespace between sentences, so they
    # must be re-joined with a space or adjacent sentences get fused together.
    return " ".join(kept_sentences)

In [19]:
def summarize():
    '''
    Generate and store a summary for every file in the input directory.

    The input directory is read from the VAL_AR environment variable and the
    output directory (relative to the current working directory) from
    TARGET_DIR. Each regular file is read, preprocessed, split into sentences,
    chunked, summarized and capped; the result is written to
    *<TARGET_DIR>/<filename>_sysSumm.txt*, where *filename* is the input file
    name without its extension. Sub-directories of the input directory are
    skipped. The output directory is created if it does not exist.

    Raises:
    RuntimeError
        If VAL_AR or TARGET_DIR is not set (or is empty).
    '''
    data_dir = os.getenv('VAL_AR')
    target_dir = os.getenv('TARGET_DIR')
    # Fail loudly: os.listdir(None) would silently list the working directory
    # and an unset TARGET_DIR would write into a folder literally named "None".
    if not data_dir or not target_dir:
        raise RuntimeError(
            "Both the VAL_AR and TARGET_DIR environment variables must be set."
        )

    output_dir = f"./{target_dir}"
    os.makedirs(output_dir, exist_ok=True)

    # Sorted for a reproducible processing order across runs and machines.
    for file in sorted(os.listdir(data_dir)):
        source_path = os.path.join(data_dir, file)
        if not os.path.isfile(source_path):
            continue

        fileContent = read_file(source_path)
        filePreprocessed = preprocessing(fileContent)
        fileSentences = createSentences(filePreprocessed)
        fileChunks = createChunks(fileSentences)
        summary = generate_summary(fileChunks)
        summary = cap_summary(summary)

        # splitext handles any extension length (or none), unlike file[:-4].
        filename = os.path.splitext(file)[0]

        with open(f"{output_dir}/{filename}_sysSumm.txt", "w") as f:
            f.write(summary)
        


# Batch run: summarize every file in the VAL_AR directory and write the results under TARGET_DIR.
summarize()

In [20]:
# Run on single file
# Smoke test: push one input file through the same pipeline steps that
# summarize() applies and display the result instead of writing it to disk.
valDir= os.getenv('VAL_AR')

# Takes the first entry returned by os.listdir; that order is arbitrary, so the
# chosen file may differ between machines.
testFile = os.path.join(valDir,os.listdir(valDir)[0])

fileContent = read_file(testFile)
filePreprocessed = preprocessing(fileContent)
fileSentences = createSentences(filePreprocessed)
fileChunks = createChunks(fileSentences)
summary = generate_summary(fileChunks)
summary=cap_summary(summary)
# Bare last expression so the notebook renders the capped summary.
summary

