In [1]:
import sys
import os
# Configure PyTorch's CUDA allocator through its environment variable.
# NOTE(review): presumably placed ahead of `import torch` so the setting is
# already in the environment when CUDA is initialised — confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# append a new directory to sys.path
# sys.path.append(os.path.dirname(os.path.dirname(__file__)))
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# find the project modules on another machine without editing them.
sys.path.append('/home/verma.shi/LLM/LitArt/data_module')
sys.path.append('/home/verma.shi/LLM/LitArt/models')

import argparse
import time

import torch
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint,EarlyStopping

import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline, set_seed
from transformers import get_linear_schedule_with_warmup, AdamW
from lightning.pytorch.loggers import TensorBoardLogger

import warnings
# Suppresses every Python warning raised from this point on (no category or
# module filter is given), which includes library deprecation warnings.
warnings.filterwarnings("ignore")

# NOTE(review): per the PyTorch docs, 'medium' allows float32 matrix
# multiplications to use lower-precision internal computation in exchange for
# speed — confirm this trade-off is intended for training.
torch.set_float32_matmul_precision('medium')
# Ask PyTorch to release GPU memory it has cached but is not currently using.
torch.cuda.empty_cache()




In [2]:
import os
import pandas as pd
import torch
from torchvision.io import read_image
from torch.utils.data import Dataset

class TextSummaryDataset(Dataset):
    """Map-style dataset that turns (chapter, summary) rows into model inputs.

    Each item is built lazily in ``__getitem__``: both texts are passed through
    ``textprocessor.process``, the chapter is wrapped in a summarisation
    prompt, and both are tokenized to fixed-length sequences.

    Args:
        df: DataFrame with a ``"chapter"`` and a ``"summary_text"`` column.
            Any index is accepted; rows are addressed by position.
        textprocessor: Object exposing ``process(text)``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            and ``"attention_mask"``, and providing ``as_target_tokenizer()``.
        tokenizer_chapter_max_length: Padded/truncated length of the input.
        tokenizer_summary_max_length: Padded/truncated length of the target.
        truncation: Forwarded to the tokenizer's ``truncation`` argument.
    """

    def __init__(self,
                 df,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation=True,
                 ):

        self.df = df
        self.textprocessor = textprocessor
        self.chapter = df["chapter"]
        self.summary = df["summary_text"]
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.chapter)

    def __getitem__(self, idx):
        """Return the tokenized sample at *position* ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` (prompted chapter) and
            ``labels``, ``summary_mask`` (summary), each a 1-D ``torch.long``
            tensor of the configured maximum length.
        """
        # Positional lookup: a DataLoader hands out 0..len-1, which only match
        # the Series *labels* when the frame still has its default RangeIndex.
        # `.iloc` keeps filtered / re-indexed frames working.
        raw_chapter = self.chapter.iloc[idx]
        raw_summary = self.summary.iloc[idx]

        chapter = "Summarize the following : \n" + str(self.textprocessor.process(raw_chapter)) + "\n\nSummary:"
        # str() mirrors the chapter handling so a non-string result from the
        # text processor does not reach the tokenizer.
        summary = str(self.textprocessor.process(raw_summary))

        input_encodings = self.tokenizer(
            chapter,
            max_length=self.tokenizer_chapter_max_length,
            padding="max_length",
            truncation=self.truncation,
        )

        # Tokenize the target inside the tokenizer's target-mode context.
        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(
                summary,
                max_length=self.tokenizer_summary_max_length,
                padding="max_length",
                truncation=self.truncation,
            )

        return {
            "input_ids": torch.tensor(input_encodings["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(input_encodings["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(target_encodings["input_ids"], dtype=torch.long),
            "summary_mask": torch.tensor(target_encodings["attention_mask"], dtype=torch.long)
        }

In [3]:
import sys
import os
# append a new directory to sys.path
# sys.path.append(os.path.dirname(os.path.dirname(__file__)))
sys.path.append('/home/verma.shi/LLM/LitArt/data_module')

import glob
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler
import pandas as pd
import lightning as L
from data_preprocessor import TextPreprocessing
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"

class TextDataModule(L.LightningDataModule):
    """Lightning data module serving train/val/test summarisation datasets.

    The three CSV files are read into DataFrames and wrapped in
    ``TextSummaryDataset`` instances, which are then exposed through the
    standard Lightning dataloader hooks.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.
        val_path: Path of the validation CSV.
        textprocessor: Forwarded to every ``TextSummaryDataset``.
        tokenizer: Forwarded to every ``TextSummaryDataset``.
        tokenizer_chapter_max_length: Forwarded to every dataset.
        tokenizer_summary_max_length: Forwarded to every dataset.
        truncation: Forwarded to every dataset.
        batch_size: Batch size used by all three dataloaders.
        num_workers: Worker processes used by all three dataloaders
            (default ``0``, i.e. load in the main process).
    """

    # (DataFrame attribute, path attribute, label used in the failure message)
    _SPLITS = (
        ("train_df", "train_path", "training"),
        ("test_df", "test_path", "test"),
        ("val_df", "val_path", "validation"),
    )

    def __init__(self,
                 train_path,
                 test_path,
                 val_path,
                 textprocessor,
                 tokenizer,
                 tokenizer_chapter_max_length=1024,
                 tokenizer_summary_max_length=64,
                 truncation = True,
                 batch_size: int = 32,
                 num_workers: int = 0):

        super().__init__()

        # Initializing Paths
        self.train_path = train_path
        self.test_path = test_path
        self.val_path = val_path

        # Initializing Dataframes
        self.train_df = None
        self.test_df = None
        self.val_df = None

        # Textprocessor setup
        self.textprocessor = textprocessor

        # Tokenizer setup
        self.tokenizer = tokenizer
        self.tokenizer_chapter_max_length = tokenizer_chapter_max_length
        self.tokenizer_summary_max_length = tokenizer_summary_max_length
        self.truncation = truncation

        # Dataloader setup
        self.batch_size = batch_size
        self.num_workers = num_workers

    @staticmethod
    def _read_split(path, label):
        """Read one CSV; print the failure and return ``None`` if it cannot be read."""
        try:
            return pd.read_csv(path)
        except Exception as e:
            print(f"Exception raised while reading {label} file at path : {path} \n Exception : {e}")
            return None

    def _load_splits(self, only_missing):
        """Populate the split DataFrames, optionally skipping ones already loaded."""
        for df_attr, path_attr, label in self._SPLITS:
            if only_missing and getattr(self, df_attr) is not None:
                continue
            loaded = self._read_split(getattr(self, path_attr), label)
            # A failed read leaves any previously loaded frame untouched.
            if loaded is not None:
                setattr(self, df_attr, loaded)

    def _require_splits(self):
        """Load any split that is still missing and fail loudly if one cannot be read."""
        self._load_splits(only_missing=True)
        missing = [label for df_attr, _, label in self._SPLITS
                   if getattr(self, df_attr) is None]
        if missing:
            raise RuntimeError(
                "Could not load the following data split(s): " + ", ".join(missing)
            )

    def _build_dataset(self, df):
        """Wrap a DataFrame in a ``TextSummaryDataset`` using this module's settings."""
        return TextSummaryDataset(
            df=df,
            textprocessor=self.textprocessor,
            tokenizer=self.tokenizer,
            tokenizer_chapter_max_length=self.tokenizer_chapter_max_length,
            tokenizer_summary_max_length=self.tokenizer_summary_max_length,
            truncation=self.truncation)

    def _build_loader(self, dataset, shuffle=False):
        """Create a DataLoader with the configured batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers)

    def prepare_data(self):
        """Read the three CSV files, printing (not raising) on read failures.

        Kept for callers that invoke it directly. Lightning runs this hook on
        a single process only, so ``setup`` does not rely on it having run.
        """
        self._load_splits(only_missing=False)

    def total_documents(self):
        """Return the total number of rows across the train, test and val splits.

        Raises:
            RuntimeError: If any split cannot be loaded.
        """
        self._require_splits()
        return self.train_df.shape[0] + self.test_df.shape[0] + self.val_df.shape[0]

    def setup(self, stage= None):
        """Build the train/val/test datasets (all three, whatever ``stage`` is).

        Any DataFrame not yet loaded in this process is read here, because
        state assigned in ``prepare_data`` is not available on every process
        in multi-process training.

        Raises:
            RuntimeError: If any split cannot be loaded.
        """
        self._require_splits()
        self.train_dataset = self._build_dataset(self.train_df)
        self.val_dataset = self._build_dataset(self.val_df)
        self.test_dataset = self._build_dataset(self.test_df)

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling)."""
        return self._build_loader(self.val_dataset)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling)."""
        return self._build_loader(self.test_dataset)

In [4]:

class TextSummaryModel(L.LightningModule):
    """Lightning wrapper that fine-tunes a sequence-to-sequence model.

    Args:
        model: The wrapped model. It is called with ``input_ids``,
            ``attention_mask``, ``labels`` and ``decoder_attention_mask`` and
            must return an object exposing ``.loss`` and ``.logits``.
        total_documents: Stored as ``self.total_documents``; not used by the
            optimizer configuration in this class.
        epochs: Stored as ``self.epochs``; not used by the optimizer
            configuration in this class.
        learning_rate: Learning rate for AdamW (default ``0.0001``).
    """

    def __init__(self,model,
                     total_documents = 5000,
                     epochs=2,
                     learning_rate=0.0001):
        super().__init__()
        self.model = model
        self.epochs = int(epochs)
        self.total_documents = int(total_documents)
        self.learning_rate = float(learning_rate)


    def set_model(self,model):
        """Replace the wrapped model."""
        self.model = model

    def forward(self,
                input_ids,
                attention_mask,
                labels = None,
                decoder_attention_mask = None):
        """Run the wrapped model and return ``(loss, logits)``."""
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             labels=labels,
                             decoder_attention_mask=decoder_attention_mask)

        return outputs.loss, outputs.logits

    def _compute_loss(self, batch):
        """Unpack a batch from the data module and return the model's loss."""
        loss, _ = self(input_ids=batch["input_ids"],
                       attention_mask=batch["attention_mask"],
                       labels=batch["labels"],
                       decoder_attention_mask=batch["summary_mask"])
        return loss

    def training_step(self,batch, batch_idx):
        """Compute and log ``train_loss`` for one training batch."""
        loss = self._compute_loss(batch)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self , batch , batch_idx):
        """Compute and log ``val_loss`` for one validation batch."""
        loss = self._compute_loss(batch)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log ``test_loss`` for one test batch."""
        loss = self._compute_loss(batch)
        # Without this log call the test loop has no metric to report.
        self.log('test_loss', loss, prog_bar=True)
        return loss


    def configure_optimizers(self):
        """Return an AdamW optimizer over the wrapped model's parameters.

        No learning-rate scheduler is configured.
        """
        # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
        # weight_decay are pinned to that class's defaults so the optimizer
        # hyper-parameters stay the same.
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                      lr=self.learning_rate,
                                      eps=1e-6,
                                      weight_decay=0.0)
        return {'optimizer': optimizer}

In [None]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sat Mar  2 19:24:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:18:00.0 Off |                    0 |
| N/A   49C    P0              63W / 300W |  31741MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    