In [None]:
import sys
import os
# Set the CUDA allocator option before `import torch` below.
# NOTE(review): presumably the allocator only reads this at initialisation,
# so the ordering matters — confirm before moving this line.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Make the project's `data_module` and `models` packages importable.
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# find the project modules on any other machine or account.
# Disabled alternative (presumably because `__file__` is unavailable in a
# notebook kernel — confirm):
# sys.path.append(os.path.dirname(os.path.dirname(__file__)))
sys.path.append('/home/verma.shi/LLM/LitArt/data_module')
sys.path.append('/home/verma.shi/LLM/LitArt/models')

import argparse
import time

import torch
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint,EarlyStopping

import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline, set_seed
from transformers import get_linear_schedule_with_warmup, AdamW

import warnings
# Silences every Python warning for the rest of the session, which also hides
# deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")

# Process-wide float32 matmul precision setting ('medium').
torch.set_float32_matmul_precision('medium')
# NOTE(review): presumably meant to release cached GPU memory left over from
# earlier runs in the same kernel.
torch.cuda.empty_cache()




In [None]:
import os
import pandas as pd
import torch
from torchvision.io import read_image
from torch.utils.data import Dataset

class TextSummaryDataset(Dataset):
    """Map-style dataset turning (chapter, summary) rows into seq2seq tensors.

    Args:
        df: pandas DataFrame with a ``chapter`` and a ``summary_text`` column.
        textprocessor: object exposing ``process(text)``; applied to both the
            chapter and the summary before tokenisation.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` and providing ``as_target_tokenizer()``.
        tokenizer_chapter_max_length: padded/truncated length of the input.
        tokenizer_summary_max_length: padded/truncated length of the target.
        truncation: forwarded to the tokenizer as its ``truncation`` argument.
    """

    # Label value skipped by the cross-entropy loss of Hugging Face seq2seq models.
    IGNORE_INDEX = -100

    def __init__(self,
                 df,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation=True,
                 ):

        self.df = df
        self.textprocessor = textprocessor
        self.chapter = df["chapter"]
        self.summary = df["summary_text"]
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

    def __len__(self):
        """Return the number of rows (chapter/summary pairs)."""
        return len(self.chapter)

    def __getitem__(self, idx):
        """Tokenise row ``idx`` (a position, not an index label).

        Returns:
            dict with 1-D ``torch.long`` tensors:
            ``input_ids`` / ``attention_mask`` for the prompted chapter,
            ``labels`` for the summary with padded positions set to -100, and
            ``summary_mask`` (the summary's attention mask).
        """
        # Positional lookup: a DataLoader sampler yields positions 0..len-1,
        # while plain `series[idx]` is label-based and breaks (KeyError or the
        # wrong row) when the frame's index is not a clean RangeIndex.
        raw_chapter = self.chapter.iloc[idx]
        raw_summary = self.summary.iloc[idx]

        chapter = "Summarize the following : \n" + str(self.textprocessor.process(raw_chapter)) + "\n\nSummary:"
        summary = self.textprocessor.process(raw_summary)

        input_encodings = self.tokenizer(
            chapter,
            max_length=self.tokenizer_chapter_max_length,
            padding="max_length",
            truncation=self.truncation,
        )

        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(
                summary,
                max_length=self.tokenizer_summary_max_length,
                padding="max_length",
                truncation=self.truncation,
            )

        summary_mask = torch.tensor(target_encodings["attention_mask"], dtype=torch.long)
        labels = torch.tensor(target_encodings["input_ids"], dtype=torch.long)
        # Padding must not contribute to the loss: mark every position the
        # tokenizer flagged as padding with the ignore index.
        labels = labels.masked_fill(summary_mask == 0, self.IGNORE_INDEX)

        return {
            "input_ids": torch.tensor(input_encodings["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(input_encodings["attention_mask"], dtype=torch.long),
            "labels": labels,
            "summary_mask": summary_mask,
        }

In [None]:
import sys
import os
# append a new directory to sys.path
# sys.path.append(os.path.dirname(os.path.dirname(__file__)))
sys.path.append('/home/verma.shi/LLM/LitArt/data_module')

import glob
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import lightning as L
from data_preprocessor import TextPreprocessing
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"

class TextDataModule(L.LightningDataModule):
    """Lightning data module serving the train/validation/test summarisation CSVs.

    Each CSV is read with ``pandas.read_csv`` and wrapped in a
    ``TextSummaryDataset``, which reads its ``chapter`` and ``summary_text``
    columns.

    Args:
        train_path: path of the training CSV.
        test_path: path of the test CSV.
        val_path: path of the validation CSV.
        textprocessor: text cleaner forwarded to every ``TextSummaryDataset``.
        tokenizer: tokenizer forwarded to every ``TextSummaryDataset``.
        tokenizer_chapter_max_length: padded/truncated input length.
        tokenizer_summary_max_length: padded/truncated target length.
        truncation: forwarded to the tokenizer by the datasets.
        batch_size: batch size used by all three dataloaders.
    """

    def __init__(self,
                 train_path,
                 test_path,
                 val_path,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation = True,
                 batch_size: int = 32):

        super().__init__()

        # Initializing Paths
        self.train_path = train_path
        self.test_path = test_path
        self.val_path = val_path

        # Initializing Dataframes (filled by prepare_data() or lazily by setup())
        self.train_df = None
        self.test_df = None
        self.val_df = None

        # Textprocessor setup
        self.textprocessor = textprocessor

        # Tokenizer setup
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

        # Batch size setup
        self.batch_size = batch_size

    def _read_split(self, path, split_name):
        """Read one CSV; on failure print which file broke and re-raise.

        Swallowing the error would only postpone the crash to the first use of
        the (then ``None``) dataframe, with a far less helpful message.
        """
        try:
            return pd.read_csv(path)
        except Exception as e:
            print(f"Exception raised while reading {split_name} file at path : {path} \n Exception : {e}")
            raise

    def _load_missing_splits(self):
        """Read every split whose dataframe has not been loaded yet."""
        if self.train_df is None:
            self.train_df = self._read_split(self.train_path, "training")
        if self.test_df is None:
            self.test_df = self._read_split(self.test_path, "test")
        if self.val_df is None:
            self.val_df = self._read_split(self.val_path, "validation")

    def _build_dataset(self, df):
        """Wrap one dataframe in a TextSummaryDataset with the shared settings."""
        return TextSummaryDataset(
            df=df,
            textprocessor=self.textprocessor,
            tokenizer=self.tokenizer,
            tokenizer_chapter_max_length=self.tokenizer_chapter_max_length,
            tokenizer_summary_max_length=self.tokenizer_summary_max_length,
            truncation=self.truncation)

    def prepare_data(self):
        """(Re)read all three CSV files.

        Raises:
            Exception: whatever ``pandas.read_csv`` raised for the first file
                that could not be read (after printing the file's path).
        """
        self.train_df = self._read_split(self.train_path, "training")
        self.test_df = self._read_split(self.test_path, "test")
        self.val_df = self._read_split(self.val_path, "validation")

    def total_documents(self):
        """Return the total row count across the train, test and validation CSVs."""
        self._load_missing_splits()
        return self.train_df.shape[0] + self.test_df.shape[0] + self.val_df.shape[0]

    def setup(self, stage= None):
        """Build the train/val/test datasets.

        Loads any split that is still missing first: Lightning does not run
        ``prepare_data`` in every process of a distributed run, so this hook
        must not depend on state assigned there.
        """
        self._load_missing_splits()
        self.train_dataset = self._build_dataset(self.train_df)
        self.val_dataset = self._build_dataset(self.val_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=0)

    def val_dataloader(self):
        """Sequential loader over the validation dataset."""
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=0)

    def test_dataloader(self):
        """Sequential loader over the test dataset."""
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=0
        )

In [None]:

class TextSummaryModel(L.LightningModule):
    """LightningModule wrapping a seq2seq model for summarisation fine-tuning.

    Args:
        model: the wrapped model; it is called with ``input_ids``,
            ``attention_mask``, ``labels`` and ``decoder_attention_mask`` and
            must return an object exposing ``.loss`` and ``.logits``.
        total_documents: document count used to size the LR schedule.
        epochs: number of training epochs used to size the LR schedule.
    """

    def __init__(self,model,
                     total_documents = 5000,
                     epochs=2):
        super().__init__()
        self.model = model
        self.epochs = int(epochs)
        self.total_documents = int(total_documents)

    def set_model(self,model):
        """Replace the wrapped model."""
        self.model = model

    def forward(self,
                input_ids,
                attention_mask,
                labels = None,
                decoder_attention_mask = None):
        """Run the wrapped model and return ``(loss, logits)``."""
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             labels=labels,
                             decoder_attention_mask=decoder_attention_mask)

        return outputs.loss, outputs.logits

    def _batch_loss(self, batch):
        """Compute the loss for one batch produced by the data module."""
        loss, _ = self(input_ids=batch["input_ids"],
                       attention_mask=batch["attention_mask"],
                       labels=batch["labels"],
                       decoder_attention_mask=batch["summary_mask"])
        return loss

    def training_step(self,batch, batch_idx):
        """Return the training loss for ``batch``."""
        return self._batch_loss(batch)

    def validation_step(self , batch , batch_idx):
        """Log and return the validation loss (``val_loss``)."""
        loss = self._batch_loss(batch)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Log and return the test loss (``test_loss``)."""
        loss = self._batch_loss(batch)
        # Without logging, a test run computes the loss but reports nothing.
        self.log('test_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Create AdamW plus a linear warmup/decay schedule stepped per batch.

        The schedule is defined in optimizer steps (500 warmup steps), so it
        has to be advanced every step. Lightning's default is once per epoch,
        which would leave the learning rate at the warmup's starting value of
        0 for the whole first epoch.
        """
        optimizer = AdamW(self.model.parameters(), lr=0.0001)
        # NOTE(review): epochs * total_documents counts documents, not
        # optimizer steps, so it overestimates the run length unless the batch
        # size is 1; kept as-is to leave the decay horizon unchanged.
        scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=500,
                num_training_steps=self.epochs*self.total_documents)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'step',
                'frequency': 1,
            },
        }

In [None]:
# Training driver: load flan-t5-base, build the data module, fit for `epochs`
# epochs and print the path of the checkpoint with the lowest val_loss.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Lightning accelerator matching `device`, so the Trainer can also be built on
# a CPU-only host (a hard-coded "gpu" fails there).
accelerator = "gpu" if device == "cuda" else "cpu"

#Loading the data
train_path = "/work/LitArt/data/generated_summaries/train_dataset_with_summaries.csv"
test_path = "/work/LitArt/data/generated_summaries/test_dataset_with_summaries.csv"
val_path = "/work/LitArt/data/generated_summaries/validation_dataset_with_summaries.csv"


#Loading the model and tokenizer
base_model_name = "google/flan-t5-base"
tokenizer_name = "google/flan-t5-base"
cache_dir = "/work/LitArt/cache"
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name,cache_dir=cache_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,cache_dir=cache_dir)

#Initializing the dataloaders
textpreprocessor = TextPreprocessing()
textmodule = TextDataModule(train_path=train_path,
                            val_path=val_path,
                            test_path=test_path,
                            textprocessor=textpreprocessor,
                            tokenizer=tokenizer,
                            tokenizer_chapter_max_length=1024,
                            tokenizer_summary_max_length=64,
                            truncation=True)

# Load the CSVs up front: the scheduler length below needs the document count
# before the Trainer runs these hooks itself.
textmodule.prepare_data()
textmodule.setup()
total_documents = textmodule.total_documents()

#Model Parameters
# NOTE(review): batch_size, chapter_length and summary_length are never passed
# to TextDataModule, which was already built above with its default
# batch_size=32 and the literal lengths 1024/64. Decide which values are
# intended and wire them through.
batch_size = 64
chapter_length = 512
summary_length = 64
epochs = 1
log_path =  "/work/LitArt/verma/lightning_logs"

# Keep only the checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    filename="{epoch}-{val_loss:.2f}",
    monitor="val_loss",
    mode="min",
    verbose=True,
    save_top_k=1,
)

trainer = L.Trainer(
    callbacks=[
                checkpoint_callback,
            ],
    max_epochs = epochs,
    accelerator=accelerator,
    devices=1,
    default_root_dir = log_path
)


#Loading the model
model = TextSummaryModel(model=base_model,epochs=epochs,total_documents=total_documents)

#Fitting the model
trainer.fit(model, textmodule)

best_model_path = checkpoint_callback.best_model_path
print(f'Best Model Path = {best_model_path}')

In [1]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [4]:
import pandas as pd

In [5]:
# Ad-hoc inspection: load the training CSV (same file as `train_path` in the
# training cell) into a DataFrame.
df = pd.read_csv("/work/LitArt/data/generated_summaries/train_dataset_with_summaries.csv")

In [6]:
# Display the loaded frame; the recorded output shows the columns chapter,
# summary_text, __index_level_0__ and generated_summary.
df

Unnamed: 0,chapter,summary_text,__index_level_0__,generated_summary
0,one evening as i was lying flat on the deck of...,from the deck of his steamboat marlow overhear...,5664,"On a steamboat, the narrator overhears a conve..."
1,the river--seemed to beckon with a dishonourin...,from the deck of his steamboat marlow overhear...,5664,"Narrator travels up a mysterious river, encoun..."
2,air for a while--and on we went again into the...,from the deck of his steamboat marlow overhear...,5664,The passage describes a journey through a dens...
3,instructed and what he knew was this--that sho...,from the deck of his steamboat marlow overhear...,5664,"A fireman, fearing an evil spirit in the boile..."
4,we were till next morning moreover he pointed ...,from the deck of his steamboat marlow overhear...,5664,"Delayed by sensible caution, the narrator navi..."
...,...,...,...,...
10663,so in this case you must have made extensive c...,isabel decides that no harm can come to her fr...,9514,Countess and Madame Merle discuss Isabel Arche...
10664,if miss archer s to become her mother it s sur...,isabel decides that no harm can come to her fr...,9514,The Countess and Madame Merle discuss potentia...
10665,actus quartus enter one of the frenchmen with ...,one of the french lords and a band of soldiers...,8794,French soldiers plan to ambush Lord E. They us...
10666,true what is not holie that we sweare not by b...,one of the french lords and a band of soldiers...,8794,"A person questions swearing by unholy things, ..."
