In [1]:
import dataset
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import lightning as L
from data_preprocessor import TextPreprocessing
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import get_linear_schedule_with_warmup, AdamW
import re

# Pick the compute device name: "cuda" when PyTorch reports a usable CUDA
# runtime, otherwise fall back to "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

class TextPreprocessing():
    """Configurable text cleaner.

    Every step is applied by :meth:`process` in a fixed order; the
    constructor flags only switch individual steps on or off.

    Args:
        regexList: Optional iterable of regex patterns; every match of each
            pattern is deleted from the cleaned text.
        punct: When true, punctuation characters are replaced by spaces.
        lowercase: When true, the text is lower-cased first.
        slang: When true, contractions are expanded with
            ``contractions.fix`` (requires the ``contractions`` package).
        stopwordList: Optional collection of words to drop.
        stemming: When true, each word is stemmed with NLTK's Porter stemmer
            (requires ``nltk``).
        lemmatization: When true, each word is lemmatized with NLTK's WordNet
            lemmatizer (requires ``nltk``).
    """

    # Patterns are compiled once at class creation instead of on every call.
    # Matches a parenthesised single letter, one whitespace char and an
    # underscore, e.g. " (a _)".
    _MARKER_RE = re.compile(r"\s*\([a-zA-Z]\s_\)")
    # A URL runs until the next whitespace character.
    _URL_RE = re.compile(r"(?:www\.\S+)|(?:https?://\S+)", re.IGNORECASE)
    _PUNCT_RE = re.compile(r"[=.!,¿?.!+,;¿/:|%()<>।॰{}#_'\"@$^&*']")
    _DIGIT_RE = re.compile(r"[0-9]")
    _SPACES_RE = re.compile(r" +")

    def __init__(self ,
                 regexList = None,
                 punct= True,
                 lowercase= True,
                 slang= False,
                 stopwordList = None,
                 stemming = False,
                 lemmatization= False ):

        self.convertToLowercase = lowercase
        self.removePunctuations = punct
        self.regexList = regexList
        self.removeSlang = slang
        self.stopwordList = stopwordList
        self.useStemming = stemming
        self.useLemmatization = lemmatization

    def process(self , text):
        """Return a cleaned copy of ``text``.

        Args:
            text: The raw string to clean.

        Returns:
            The cleaned string with the enabled steps applied.
        """
        if self.convertToLowercase:
            text = text.lower()

        text = self._MARKER_RE.sub("", text)

        # Flatten multi-line text onto one line.
        text = text.replace("\n", " ")

        # Links must be removed BEFORE punctuation and digits are stripped:
        # once "." "/" ":" are gone a URL can no longer be recognised, and the
        # previous pattern (``[^s]``, i.e. "anything but the letter s") then
        # deleted ordinary text following "www" up to the next "s".
        text = self._URL_RE.sub("", text)

        if self.removeSlang:
            # Imported lazily so the optional dependency is only needed when
            # the feature is enabled (it was previously never imported).
            import contractions
            text = contractions.fix(text)

        if self.removePunctuations:
            text = self._PUNCT_RE.sub(" ", text)
            text = text.replace("…", " ")

        # Double quotes are dropped even when punctuation removal is off.
        text = text.replace('"', " ")

        # Digits are deleted outright (not replaced by a space).
        text = self._DIGIT_RE.sub("", text)

        # Collapse runs of spaces, then trim both ends.
        text = self._SPACES_RE.sub(" ", text).strip()

        if self.regexList is not None:
            for regex in self.regexList:
                text = re.sub(regex, '', text)

        if self.stopwordList is not None:
            # Set membership is O(1) per word instead of scanning a list.
            stopwords = frozenset(self.stopwordList)
            text = " ".join(word for word in text.split() if word not in stopwords)

        # Stemming (reduce each word to its stem).
        if self.useStemming:
            from nltk.stem.porter import PorterStemmer
            stemmer = PorterStemmer()
            text = " ".join(stemmer.stem(word) for word in text.split())

        # Lemmatization (reduce each word to its dictionary form).
        if self.useLemmatization:
            from nltk.stem.wordnet import WordNetLemmatizer
            lemmatizer = WordNetLemmatizer()
            text = " ".join(lemmatizer.lemmatize(word) for word in text.split())

        return text

class TextSummaryDataset(Dataset):
    """Map-style dataset that turns (chapter, summary) rows into model tensors.

    Args:
        df: DataFrame with a ``"chapter"`` and a ``"summary_text"`` column.
        textprocessor: Object exposing ``process(text) -> str``; applied to
            both the chapter and the summary.
        tokenizer: Hugging Face-style tokenizer callable that accepts
            ``text`` / ``text_target`` and returns ``input_ids`` and
            ``attention_mask``.
        tokenizer_chapter_max_length: ``max_length`` used for the chapter.
        tokenizer_summary_max_length: ``max_length`` used for the summary.
        truncation: Forwarded to the tokenizer's ``truncation`` argument.
    """

    # Label value that Hugging Face seq2seq models exclude from the loss.
    IGNORE_INDEX = -100

    def __init__(self,
                 df,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation=True,
                 ):

        self.df = df
        self.textprocessor = textprocessor
        self.chapter = df["chapter"]
        self.summary = df["summary_text"]
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

    def __len__(self):
        """Return the number of rows in the chapter column."""
        return len(self.chapter)

    def __getitem__(self,idx):
        """Build the encoded sample at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (encoded chapter),
            ``labels`` (encoded summary, padded positions set to -100) and
            ``summary_mask`` (attention mask of the summary), all ``long``
            tensors.
        """
        # ``.iloc`` is positional; plain ``series[idx]`` is a label lookup and
        # fails or returns the wrong row when the DataFrame index is not 0..n-1.
        raw_chapter = self.chapter.iloc[idx]
        raw_summary = self.summary.iloc[idx]

        # T5's summarization task prefix is "summarize: " (with the space).
        chapter = "summarize: " + str(self.textprocessor.process(raw_chapter))
        summary = self.textprocessor.process(raw_summary)

        input_encodings = self.tokenizer(
            chapter,
            max_length=self.tokenizer_chapter_max_length,
            padding="max_length",
            truncation=self.truncation,
        )

        # ``text_target=`` replaces the deprecated ``as_target_tokenizer()``
        # context manager.
        target_encodings = self.tokenizer(
            text_target=summary,
            max_length=self.tokenizer_summary_max_length,
            padding="max_length",
            truncation=self.truncation,
        )

        labels = torch.tensor(target_encodings["input_ids"], dtype=torch.long)
        summary_mask = torch.tensor(target_encodings["attention_mask"], dtype=torch.long)
        # Padded label positions must not contribute to the loss.
        labels[summary_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": torch.tensor(input_encodings["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(input_encodings["attention_mask"], dtype=torch.long),
            "labels": labels,
            "summary_mask": summary_mask,
        }

class TextDataModule(L.LightningDataModule):
    """Lightning data module serving the train / validation / test CSV splits.

    The three CSV files are read in the constructor; :meth:`setup` wraps each
    DataFrame in a ``TextSummaryDataset`` and the ``*_dataloader`` methods
    expose them as ``DataLoader`` objects.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        val_path: Path of the validation CSV.
        textprocessor: Text cleaner forwarded to every dataset.
        tokenizer: Tokenizer forwarded to every dataset.
        tokenizer_chapter_max_length: Max token length for chapters.
        tokenizer_summary_max_length: Max token length for summaries.
        truncation: Forwarded to the tokenizer's ``truncation`` argument.
        batch_size: Batch size used by all three dataloaders.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raises for an unreadable
            file; the error is printed and then re-raised.
    """

    def __init__(self,
                 train_path,
                 test_path,
                 val_path,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation = True,
                 batch_size: int = 32):

        super().__init__()

        # Read the three splits (same order as before: train, test, val).
        self.train_df = self._read_split(train_path, "training")
        self.test_df = self._read_split(test_path, "test")
        self.val_df = self._read_split(val_path, "validation")

        # Textprocessor setup
        self.textprocessor = textprocessor

        # Tokenizer setup
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

        # Batch size setup
        self.batch_size = batch_size

    @staticmethod
    def _read_split(path, split_name):
        """Read one CSV split, reporting and re-raising any failure."""
        try:
            return pd.read_csv(path)
        except Exception as e:
            print(f"Exception raised while reading {split_name} file at path : {path} \n Exception : {e}")
            # Swallowing the error left the DataFrame attribute undefined and
            # produced an unrelated AttributeError later in ``setup``.
            raise

    def _build_dataset(self, df):
        """Wrap ``df`` in a TextSummaryDataset using this module's settings."""
        return TextSummaryDataset(
            df=df,
            textprocessor=self.textprocessor,
            tokenizer=self.tokenizer,
            tokenizer_chapter_max_length=self.tokenizer_chapter_max_length,
            tokenizer_summary_max_length=self.tokenizer_summary_max_length,
            truncation=self.truncation)

    def setup(self, stage= None):
        """Create the train, validation and test datasets.

        ``stage`` is accepted for Lightning compatibility but not inspected;
        all three datasets are always built.
        """
        self.train_dataset = self._build_dataset(self.train_df)
        self.val_dataset = self._build_dataset(self.val_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Return the shuffled training DataLoader."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=0)

    def val_dataloader(self):
        """Return the validation DataLoader (not shuffled)."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=0)

    def test_dataloader(self):
        """Return the test DataLoader (not shuffled)."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=0
        )

class ImageDataModule(L.LightningDataModule):
    """Placeholder data module for image data.

    This is a data module (``prepare_data`` / ``setup`` / ``*_dataloader``
    hooks), so it derives from ``LightningDataModule`` rather than
    ``LightningModule``.

    NOTE(review): ``setup`` is still a stub, so ``traindataset``,
    ``valdataset``, ``testdataset`` and ``predict`` are never assigned here;
    the dataloader methods raise ``AttributeError`` until it is implemented.

    Args:
        data_dir: Directory holding the image data (stored, not yet read).
        batch_size: Batch size used by every dataloader.
    """

    def __init__(self, data_dir: str = "path/to/dir", batch_size: int = 32):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size

    def prepare_data(self):
        """Hook for one-off data preparation; currently does nothing."""
        pass

    def setup(self, stage: str = None):
        """Hook for building the datasets; currently does nothing."""
        ## Image Data
        pass

    def train_dataloader(self):
        """Return a DataLoader over ``self.traindataset``."""
        return DataLoader(self.traindataset, batch_size=self.batch_size)

    def val_dataloader(self):
        """Return a DataLoader over ``self.valdataset``."""
        return DataLoader(self.valdataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return a DataLoader over ``self.testdataset``."""
        return DataLoader(self.testdataset, batch_size=self.batch_size)

    def predict_dataloader(self):
        """Return a DataLoader over ``self.predict``."""
        return DataLoader(self.predict, batch_size=self.batch_size)

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Load the three splits (train, test, validation - in that order) and print
# the combined number of rows across all of them.
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Datasets/Training_data.csv",
        "../Datasets/Testing_data.csv",
        "../Datasets/Validation_data.csv",
    )
)
total_documents = sum(len(frame) for frame in (train_df, test_df, val_df))
print(total_documents)

12515


In [3]:
class TextSummaryModel(L.LightningModule):
    """Lightning wrapper that fine-tunes a seq2seq model for summarization.

    Args:
        model: The wrapped seq2seq model; it is called with ``input_ids``,
            ``attention_mask``, ``labels`` and ``decoder_attention_mask`` and
            must return an object with ``loss`` and ``logits``.
        epochs: Number of training epochs. Stored on the instance; the
            scheduler length itself is taken from the attached trainer.
        learning_rate: Learning rate handed to AdamW.
    """

    def __init__(self,model,
                     epochs=2,
                     learning_rate=0.0001):
        super().__init__()
        self.model = model
        # Previously discarded, which made configure_optimizers depend on
        # notebook-level globals.
        self.epochs = epochs
        self.learning_rate = learning_rate

    def forward(self,
                input_ids,
                attention_mask,
                labels = None,
                decoder_attention_mask = None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``loss`` is whatever the wrapped model reports; for Hugging Face
        models it is ``None`` when ``labels`` is not supplied.
        """
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             labels=labels,
                             decoder_attention_mask=decoder_attention_mask)

        return outputs.loss, outputs.logits

    def _compute_loss(self, batch):
        """Forward one batch (with labels) and return its loss."""
        loss, _ = self(input_ids=batch["input_ids"],
                       attention_mask=batch["attention_mask"],
                       labels=batch["labels"],
                       decoder_attention_mask=batch["summary_mask"])
        return loss

    def training_step(self,batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        loss = self._compute_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self , batch , batch_idx):
        """Compute, log and return the validation loss for one batch."""
        loss = self._compute_loss(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute, log and return the test loss for one batch.

        The labels are now passed to the model; without them the loss was
        ``None`` and the step reported nothing.
        """
        loss = self._compute_loss(batch)
        self.log("test_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Create AdamW plus a linear-decay schedule stepped every batch."""
        # torch's AdamW replaces the deprecated transformers.AdamW;
        # weight_decay=0.0 keeps that class's default.
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=self.learning_rate,
                                      weight_decay=0.0)
        # The schedule length is the number of optimizer steps the trainer
        # will take, not epochs * number of documents across all splits.
        scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=0,
                num_training_steps=self.trainer.estimated_stepping_batches)
        # "interval": "step" is required: Lightning otherwise steps the
        # scheduler once per epoch.
        return {'optimizer': optimizer,
                'lr_scheduler': {'scheduler': scheduler, 'interval': 'step'}}

In [4]:
# Hyperparameters. These are defined first and actually passed on below;
# previously they were assigned after the data module had been built and were
# never used, so it ran with batch_size=32 and a chapter length of 1024.
batch_size = 4
chapter_length = 512
summary_length = 64
epochs = 2

# Model and Tokenizer Setup
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-small")

# Training files setup
train_path = "../Datasets/Training_data.csv"
test_path = "../Datasets/Testing_data.csv"
val_path = "../Datasets/Validation_data.csv"

# Text Preprocessor setup
textpreprocessor = TextPreprocessing()

# Data module: reads the three CSVs and builds the datasets.
textmodule = TextDataModule(train_path=train_path,
                            val_path=val_path,
                            test_path=test_path,
                            textprocessor=textpreprocessor,
                            tokenizer=tokenizer,
                            tokenizer_chapter_max_length=chapter_length,
                            tokenizer_summary_max_length=summary_length,
                            truncation=True,
                            batch_size=batch_size)
textmodule.setup()

# Trainer: single GPU, fixed number of epochs.
trainer = L.Trainer(
    max_epochs = epochs,
    accelerator="gpu",
    devices=1
)

model = TextSummaryModel(model=model_t5,epochs=epochs)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/work/LitArt/verma/capstone/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts 

In [6]:
# Fit the model, drawing its dataloaders from the text data module.
trainer.fit(model=model, datamodule=textmodule)

RuntimeError: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.