In [2]:
import dataset
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint,EarlyStopping
from data_preprocessor import TextPreprocessing
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import get_linear_schedule_with_warmup, AdamW
import re

device = "cuda" if torch.cuda.is_available() else "cpu"

class TextPreprocessing():
    def __init__(self ,
                 regexList = None,
                 punct= True,
                 lowercase= True,
                 slang= False,
                 stopwordList = None,
                 stemming = False,
                 lemmatization= False ):

        self.convertToLowercase = lowercase #Done
        self.removePunctuations = punct #Done
        self.regexList = regexList  # Done
        self.removeSlang = slang #Done
        self.stopwordList = stopwordList #Done
        self.useStemming = stemming #Done
        self.useLemmatization = lemmatization #Done

    def process(self , text):
        # Make text lower case
        if self.convertToLowercase:
            text = text.lower()

        pattern = r"\s*\([a-zA-Z]\s_\)"
        text = re.sub(pattern, "", text)

        #Convert multiline with spaces
        text = text.replace("\n", " ")

        if self.removeSlang:
            text = contractions.fix(text)

        #Remove punctuations
        if self.removePunctuations:
            text = re.sub(r"[=.!,¿?.!+,;¿/:|%()<>।॰{}#_'\"@$^&*']", " ", text)
            text = re.sub(r"…", " ", text)

        # remove double quotes
        text = re.sub(r'"', " ", text)

        # remove numbers
        text = re.sub(r'[0-9]', "", text)
        # sentence = re.sub(r'#([^s]+)', r'1', sentence)

        # remove website links
        text = re.sub('((www.[^s]+)|(https?://[^s]+))', '', text)

        # remove multiple spaces
        text = re.sub(r'[" "]+', " ", text)

        # remove extra space
        text = text.strip()

        if self.regexList is not None:
            for regex in self.regexList:
                text = re.sub(regex, '', text)

        if self.stopwordList is not None:
            text_list = text.split()
            text_list = [word for word in text_list if word not in self.stopwordList]
            text = " ".join(text_list)

        #Stemming (convert the word into root word)
        if self.useStemming:
            ps = nltk.stem.porter.PorterStemmer()
            text_list = text.split()
            text_list = [ps.stem(word) for word in text_list]
            text = " ".join(text_list)

        #Lemmatization (convert the word into root word)
        if self.useLemmatization:
            lem = nltk.stem.wordnet.WordNetLemmatizer()
            text_list = text.split()
            text_list = [lem.lemmatize(word) for word in text_list]
            text = " ".join(text_list)

        return text

class TextSummaryDataset(Dataset):
    def __init__(self,
                 df,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation=True,
                 ):

        self.df = df
        self.textprocessor = textprocessor
        self.chapter = df["chapter"]
        self.summary = df["summary_text"]
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

    def __len__(self):
        return len(self.chapter)

    def __getitem__(self,idx):
        chapter = "summarize:" + str(self.textprocessor.process(self.chapter[idx]))
        summary = self.textprocessor.process(self.summary[idx])

        input_encodings = self.tokenizer(chapter, max_length=self.tokenizer_chapter_max_length,padding="max_length", truncation=self.truncation)

        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(summary, max_length=self.tokenizer_summary_max_length,padding="max_length", truncation=self.truncation)

        return {
            "input_ids": torch.tensor(input_encodings["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(input_encodings["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(target_encodings["input_ids"], dtype=torch.long),
            "summary_mask": torch.tensor(target_encodings["attention_mask"], dtype=torch.long)
        }

class TextDataModule(L.LightningDataModule):
    def __init__(self,
                 train_path,
                 test_path,
                 val_path,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation = True,
                 batch_size: int = 32):


        super().__init__()

        # Initializing Paths
        self.train_path = train_path
        self.test_path = test_path
        self.val_path = val_path

        # Initializing Dataframes
        self.train_df = None
        self.test_df = None
        self.val_df = None

        # Textprocessor setup
        self.textprocessor = textprocessor

        # Tokenizer setup
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

        # Batch size setup
        self.batch_size = batch_size

    def prepare_data(self):
         # Reading the train file
        try:
            self.train_df = pd.read_csv(self.train_path)
        except Exception as e:
            print(f"Exception raised while reading training file at path : {self.train_path} \n Exception : {e}")

        # Reading the test file
        try:
            self.test_df = pd.read_csv(self.test_path)
        except Exception as e:
            print(f"Exception raised while reading test file at path : {self.test_path} \n Exception : {e}")

        # Reading the validation file
        try:
            self.val_df = pd.read_csv(self.val_path)
        except Exception as e:
            print(f"Exception raised while reading validation file at path : {self.val_path} \n Exception : {e}")


    def setup(self, stage= None):
        self.train_dataset = TextSummaryDataset(
            df=self.train_df,
            textprocessor=self.textprocessor,
            tokenizer=self.tokenizer,
            tokenizer_chapter_max_length=self.tokenizer_chapter_max_length,
            tokenizer_summary_max_length=self.tokenizer_summary_max_length,
            truncation=self.truncation)

        self.val_dataset = TextSummaryDataset(
            df=self.val_df,
            textprocessor=self.textprocessor,
            tokenizer=self.tokenizer,
            tokenizer_chapter_max_length=self.tokenizer_chapter_max_length,
            tokenizer_summary_max_length=self.tokenizer_summary_max_length,
            truncation=self.truncation)

        self.test_dataset = TextSummaryDataset(
            df=self.test_df,
            textprocessor=self.textprocessor,
            tokenizer=self.tokenizer,
            tokenizer_chapter_max_length=self.tokenizer_chapter_max_length,
            tokenizer_summary_max_length=self.tokenizer_summary_max_length,
            truncation=self.truncation)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=0)

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=0)

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=0
        )

class ImageDataModule(L.LightningModule):
    def __init__(self, data_dir: str = "path/to/dir", batch_size: int = 32):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

    def prepare_data(self):
        pass

    def setup(self, stage: str):
        ## Image Data
        pass

    def train_dataloader(self):
        return DataLoader(self.traindataset, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.valdataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.testdataset, batch_size=self.batch_size)

    def predict_dataloader(self):
        return DataLoader(self.predict, batch_size=self.batch_size)

In [3]:
train_df = pd.read_csv("../Datasets/Training_data.csv")
test_df = pd.read_csv("../Datasets/Testing_data.csv")
val_df = pd.read_csv("../Datasets/Validation_data.csv")
total_documents = len(train_df)+len(test_df)+len(val_df)

print(total_documents)

12515


In [4]:
class TextSummaryModel(L.LightningModule):
    def __init__(self,model,
                     epochs=2):
        super(TextSummaryModel,self).__init__()
        self.model = model


    def set_model(self,model):
        self.model = model

    def forward(self, 
                input_ids, 
                attention_mask, 
                labels = None, 
                decoder_attention_mask = None):
        
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             labels=labels,
                             decoder_attention_mask=decoder_attention_mask)

        return outputs.loss, outputs.logits

    def training_step(self,batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        decoder_attention_mask = batch["summary_mask"]

        loss , output = self(input_ids = input_ids,
                            attention_mask = attention_mask,
                            labels = labels,
                            decoder_attention_mask = decoder_attention_mask)

        return loss

    def validation_step(self , batch , batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        decoder_attention_mask = batch["summary_mask"]

        loss , output = self(input_ids = input_ids,
                            attention_mask = attention_mask,
                            labels = labels,
                            decoder_attention_mask = decoder_attention_mask)

        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        decoder_attention_mask = batch["summary_mask"]
        loss, output = self(input_ids=input_ids, 
                            attention_mask=attention_mask,
                            labels = labels,
                            decoder_attention_mask = decoder_attention_mask)
        return loss


    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=0.0001)
        scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=0,
                num_training_steps=epochs*total_documents)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

In [34]:
# Model and Tokenizer Setup
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-small")

# Training files setup
train_path = "../Datasets/Training_data.csv"
test_path = "../Datasets/Testing_data.csv"
val_path = "../Datasets/Validation_data.csv"


# Text Preprocessor setup
textpreprocessor = TextPreprocessing()

textmodule = TextDataModule(train_path=train_path,
                                     val_path=val_path,
                                     test_path=test_path,
                                     textprocessor=textpreprocessor,
                                     tokenizer=tokenizer,
                                     tokenizer_chapter_max_length=1024,
                                     tokenizer_summary_max_length=64,
                                     truncation=True)
textmodule.prepare_data()
textmodule.setup()




#Setting up data
batch_size = 4
chapter_length = 512
summary_length = 64
epochs = 1

log_path = "/work/LitArt/verma/lightning_logs"

checkpoint_callback = ModelCheckpoint(
    filename="{epoch}-{val_loss:.2f}",
    monitor="val_loss",
    mode="min",
    verbose=True,
    save_top_k=1,
)

trainer = L.Trainer(
    callbacks=[
                checkpoint_callback,
            ],
    max_epochs = epochs,
    accelerator="gpu",
    # max_stepxs = 5,
    devices=1,
    default_root_dir = log_path
)

model = TextSummaryModel(model=model_t5,epochs=epochs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [35]:
trainer.fit(model, textmodule)

Missing logger folder: /work/LitArt/verma/lightning_logs/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/work/LitArt/verma/capstone/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/work/LitArt/verma/capstone/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 300: 'val_loss' reached 3.07552 (best 3.07552), saving model to '/work/LitArt/verma/lightning_logs/lightning_logs/version_0/checkpoints/epoch=0-val_loss=3.08.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=1` reached.


In [36]:
best_model_path = checkpoint_callback.best_model_path
best_model_path

'/work/LitArt/verma/lightning_logs/lightning_logs/version_0/checkpoints/epoch=0-val_loss=3.08.ckpt'

In [37]:
trainer.test(
    model=model,
    datamodule=textmodule,
    ckpt_path=best_model_path,
)

Restoring states from the checkpoint path at /work/LitArt/verma/lightning_logs/lightning_logs/version_0/checkpoints/epoch=0-val_loss=3.08.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /work/LitArt/verma/lightning_logs/lightning_logs/version_0/checkpoints/epoch=0-val_loss=3.08.ckpt
SLURM auto-requeueing enabled. Setting signal handlers.
/work/LitArt/verma/capstone/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]



[{}]

In [44]:
def summarize(text,model,tokenizer,chapter_length):
    model = model.to(device)
    inputs = tokenizer(text, 
                       max_length=chapter_length,
                       truncation=False,
                       padding="max_length",
                       add_special_tokens=True, 
                       return_tensors="pt").to(device)
    summarized_ids = model.model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"], 
        max_length= 64,        num_beams=10).to(device)

    return " ".join([tokenizer.decode(token_ids, skip_special_tokens=True)
                    for token_ids in summarized_ids])

In [45]:
text = '''
 Once upon a time, in the bustling streets of a vibrant city, there lived an elephant named Ellie. Ellie was a gentle giant most of the time, but when it came to navigating the chaotic traffic of the city, she had a bit of a temper.

One sunny afternoon, Ellie was on her way to the market to pick up some fresh fruits and vegetables. As she ambled along the crowded streets, cars honked loudly, motorbikes whizzed by, and pedestrians darted in and out of traffic. Ellie did her best to stay calm, but the constant noise and chaos were starting to get to her.

Suddenly, a small car cut her off, nearly brushing against her side. Ellie let out a loud trumpet of annoyance, her patience wearing thin. The driver of the car, a young man with a careless smirk on his face, paid no heed to Ellie's displeasure.

Feeling indignant, Ellie decided she'd had enough. With a determined glint in her eye, she carefully maneuvered herself around the car, her massive body blocking its path. The driver's smirk quickly faded as he realized he was at the mercy of an angry elephant.

"Hey, watch where you're going, you big oaf!" the driver shouted, pounding his fists against the steering wheel.

But Ellie was unfazed. She stood her ground, refusing to move until the driver offered a sincere apology. Traffic came to a standstill as bystanders watched in amazement at the spectacle unfolding before them.

After a few tense moments, the driver begrudgingly muttered an apology, his face flushed with embarrassment. Ellie nodded her head in acceptance and gracefully stepped aside, allowing the traffic to flow once again.

As Ellie continued on her way to the market, she couldn't help but feel a sense of satisfaction. Sometimes, a little road rage was necessary to remind others to show respect and courtesy, even to the largest of creatures. And from that day on, drivers in the city made sure to give Ellie plenty of space on the road, knowing that they wouldn't want to incur the wrath of an elephant again.
'''

In [46]:
summarize(text,model,tokenizer,chapter_length=chapter_length)

'that Ellie was a gentle giant most of the time, but when it came to navigating the chaotic traffic of the city, there was an elephant named Ellie. Ellie was a gentle giant most of the time, but when it came to navigating the chaotic traffic of the city,'

In [47]:
best_model_path

'/work/LitArt/verma/lightning_logs/lightning_logs/version_0/checkpoints/epoch=0-val_loss=3.08.ckpt'

In [48]:
model_t5 = model_t5.to(device)

In [49]:
news_summary_model = TextSummaryModel.load_from_checkpoint(checkpoint_path=best_model_path,model=model_t5)
news_summary_model.to(device)
news_summary_model.freeze()

In [50]:
summarize(text,news_summary_model,tokenizer,chapter_length=chapter_length)

'that Ellie was a gentle giant most of the time, but when it came to navigating the chaotic traffic of the city, there was an elephant named Ellie. Ellie was a gentle giant most of the time, but when it came to navigating the chaotic traffic of the city,'