## Model Training

In [1]:
# Scratch expression; evaluates to 2.
1+1

2

In [2]:
import os

# IPython magic: display the kernel's current working directory.
%pwd

'/Users/satwiksahoo/Desktop/CodeBasics/machine learning/krish naik/NLP project/text_summarizer/research'

In [3]:
os.chdir('../')
%pwd

'/Users/satwiksahoo/Desktop/CodeBasics/machine learning/krish naik/NLP project/text_summarizer'

In [26]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model-training stage.

    Field order is significant: instances may be built positionally.
    """

    # --- locations -------------------------------------------------------
    root_dir: Path
    data_path_train: Path
    data_path_test: Path
    data_path_validation: Path
    model_ckpt: Path

    # --- training hyperparameters ----------------------------------------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    evaluation_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

        

In [27]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml , create_directories

class ConfigurationManager:
    """Loads the project's config and params files and builds stage configs.

    Both files are parsed with ``read_yaml``; the results are accessed with
    attribute syntax (``self.config.model_trainer`` etc.).
    """

    def __init__(self , config_file_path = CONFIG_FILE_PATH , params_file_path = PARAMS_FILE_PATH):
        """Parse both files and hand the artifacts root to ``create_directories``.

        Args:
            config_file_path: Location of the config file (paths, checkpoints).
            params_file_path: Location of the params file (hyperparameters).
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        create_directories([self.config.artifacts_root])

    def get_model_trainer(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Paths and the checkpoint come from the ``model_trainer`` section of
        the config file; hyperparameters come from the params file.

        Returns:
            A populated ``ModelTrainerConfig``.
        """
        config = self.config.model_trainer
        # Fix: `params` was previously referenced without ever being bound,
        # which raised NameError on the first hyperparameter lookup.
        # NOTE(review): if the params file nests these keys under a section,
        # bind that section here instead — confirm against params.yaml.
        params = self.params

        create_directories([config.root_dir])

        return ModelTrainerConfig(
            root_dir = config.root_dir,
            data_path_train = config.data_path_train,
            data_path_test = config.data_path_test,
            data_path_validation = config.data_path_validation,
            model_ckpt = config.model_ckpt,
            num_train_epochs = params.num_train_epochs,
            warmup_steps = params.warmup_steps,
            per_device_train_batch_size = params.per_device_train_batch_size,
            weight_decay = params.weight_decay,
            logging_steps = params.logging_steps,
            evaluation_strategy = params.evaluation_strategy,
            eval_steps = params.eval_steps,
            save_steps = params.save_steps,
            gradient_accumulation_steps = params.gradient_accumulation_steps,
        )
        

In [29]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from src.textSummarizer.logging import logger
import os
from datasets import load_from_disk 
import pandas as pd
import re
from datasets import Dataset

class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint on the saved train/validation datasets.

    The tokenizer and the model are both loaded from ``config.model_ckpt``.
    Datasets are read with ``load_from_disk`` and tokenized with
    ``preprocess_fun`` before being handed to the Hugging Face ``Trainer``.
    """

    # Token budgets applied when encoding the source text and the summary.
    MAX_INPUT_LENGTH = 512
    MAX_TARGET_LENGTH = 150

    def __init__(self , config : "ModelTrainerConfig"):
        """Store the config and load the tokenizer for the configured checkpoint.

        Args:
            config: Paths, checkpoint id and hyperparameters for training.
        """
        self.config = config
        # ModelTrainerConfig has no `tokenizer_name` field; the tokenizer
        # belongs to the same checkpoint as the model.
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_ckpt)

    def preprocess_fun(self, example):
        """Tokenize source text and summaries into model-ready features.

        Args:
            example: Mapping with ``'preprocess_text'`` and ``'summary'``
                entries (a single row, or a batch when used with
                ``Dataset.map(batched=True)``).

        Returns:
            The tokenizer output for the source text with an added
            ``'labels'`` entry holding the summary's ``input_ids``.
        """
        model_inputs = self.tokenizer(
            example['preprocess_text'],
            padding='max_length',
            truncation=True,
            max_length=self.MAX_INPUT_LENGTH,
        )
        targets = self.tokenizer(
            example['summary'],
            padding='max_length',
            truncation=True,
            max_length=self.MAX_TARGET_LENGTH,
        )
        # NOTE(review): labels keep their pad-token ids, so padding positions
        # contribute to the loss; consider masking them with -100.
        model_inputs['labels'] = targets['input_ids']
        return model_inputs

    def _load_tokenized(self, path):
        """Load a saved dataset from ``path`` and tokenize it for training."""
        raw = load_from_disk(path)
        # Drop the raw text columns so only tokenized features remain.
        return raw.map(self.preprocess_fun, batched=True, remove_columns=raw.column_names)

    def model_training(self):
        """Fine-tune the checkpoint and save the model and tokenizer.

        Hyperparameters are read from ``self.config``. The trained model is
        written to ``<root_dir>/bart_model`` and the tokenizer to
        ``<root_dir>/tokenizer``.
        """
        # Imported here because the surrounding cell only imports the Auto*
        # classes from transformers.
        from transformers import Trainer, TrainingArguments

        cfg = self.config
        model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_ckpt)

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=cfg.num_train_epochs,
            warmup_steps=cfg.warmup_steps,
            per_device_train_batch_size=cfg.per_device_train_batch_size,
            # The config has no dedicated eval batch size; fall back to the
            # training batch size unless one is provided.
            per_device_eval_batch_size=getattr(
                cfg, 'per_device_eval_batch_size', cfg.per_device_train_batch_size
            ),
            weight_decay=cfg.weight_decay,
            logging_steps=cfg.logging_steps,
            eval_steps=cfg.eval_steps,
            save_steps=cfg.save_steps,
            gradient_accumulation_steps=cfg.gradient_accumulation_steps,
            # The config field is named `evaluation_strategy`.
            eval_strategy=cfg.evaluation_strategy,
            # Optional; None leaves the number of kept checkpoints unlimited.
            save_total_limit=getattr(cfg, 'save_total_limit', None),
            logging_dir='./logs',
            # If MPS still crashes, training can be forced onto the CPU with
            # the corresponding TrainingArguments flag (e.g. no_cuda=True).
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            # Trainer needs tokenized Dataset objects, not pandas DataFrames.
            train_dataset=self._load_tokenized(cfg.data_path_train),
            eval_dataset=self._load_tokenized(cfg.data_path_validation),
        )

        trainer.train()

        model.save_pretrained(os.path.join(cfg.root_dir , 'bart_model'))
        self.tokenizer.save_pretrained(os.path.join(cfg.root_dir , 'tokenizer'))
        
        
        

In [28]:
from datasets import load_from_disk 
import pandas as pd
# Load the dataset saved under artifacts/data_transformation/test_dataset
# and render it as a DataFrame for a quick visual check.
dataset = load_from_disk('artifacts/data_transformation/test_dataset')
pd.DataFrame(dataset)


Unnamed: 0,id,dialogue,summary,preprocess_text
0,13820547,Olafur: are we doing anything for New Year's E...,"Nathalie, Olafur and Zoe are planning the New ...",olafur: are we doing anything for new year's e...
1,13682134,Javier: Hey do you know any tattoo parlors ove...,Javier was initially eager to have a tatoo don...,javier: hey do you know any tattoo parlors ove...
2,13611508,"Martha: Hey, can I ask you a question?\r\nOphe...",Martha likes Ophelia's lenses and wants to buy...,"martha: hey, can i ask you a question? ophelia..."
3,13829744,"Miranda: Hi S, could we cancel tomorrow's meet...",Miranda can't make her meeting with Stephanie ...,"miranda: hi s, could we cancel tomorrow's meet..."
4,13864860,Sam: Where are you?\nKate: downstairs\nSam: al...,Kate and Jeff are downstairs in a room next to...,sam: where are you? kate: downstairs sam: alre...
...,...,...,...,...
595,13681079,Dinny: can you take your dog away before i com...,Dinny's afraid of Terry's dog so he should kee...,dinny: can you take your dog away before i com...
596,13716791,"Lauren: ladies, i'm thinking of getting a tatt...",Lauren want's to have a small tattoo above her...,"lauren: ladies, i'm thinking of getting a tatt..."
597,13681192,Crystal: <file_photo>\r\nIrene: He's so big!\r...,Irene will take Crystal's son shopping for clo...,crystal: irene: he's so big! crystal: crys...
598,13682321,Kate: I've just heard you want to sell your fl...,"Rob wants to sell his flat, because it's too s...",kate: i've just heard you want to sell your fl...


In [23]:
# Display the Dataset repr (feature names and row count).
dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'preprocess_text'],
    num_rows: 600
})

In [None]:
# Build the trainer configuration and launch fine-tuning.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer()
model_trainer = ModelTrainer(config = model_trainer_config)
# ModelTrainer defines `model_training()`; it has no `train()` method, so
# the previous `model_trainer.train()` call raised AttributeError.
model_trainer.model_training()
