In [1]:
import os

In [2]:
%pwd

'e:\\Projects for portfolio\\Exoplanet Chatbot\\research'

In [3]:
# Move from the notebook folder up to the project root so that relative paths
# (config/config.yaml, params.yaml, artifacts/) resolve. The guard keeps the
# cell idempotent: re-running it will not climb another directory level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
%pwd

'e:\\Projects for portfolio\\Exoplanet Chatbot'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of every setting the model-training stage consumes.

    The fields are filled in by ``ConfigurationManager.get_model_trainer_config``
    from four sources: the ``model_trainer`` section of the config file and the
    ``TrainingArguments``, ``LoraConfig`` and ``BitsandBytesConfig`` sections of
    the params file. Dataclasses do not enforce annotations at runtime; they
    are documentation for readers and type checkers.
    """

    # --- Artifact locations and checkpoints (config file, model_trainer section) ---
    root_dir: Path              # directory created for this stage's artifacts
    data_path: Path             # CSV file read with pandas for fine-tuning
    tokenizer_ckpt: Path        # passed to AutoTokenizer.from_pretrained
    model_ckpt: Path            # passed to AutoModelForCausalLM.from_pretrained
    model_save_path: Path       # destination of the merged model
    tokenizer_save_path: Path   # destination of the tokenizer

    # --- transformers.TrainingArguments values ---
    warmup_steps: int
    per_device_train_batch_size: int
    gradient_accumulation_steps: int
    max_steps: int
    learning_rate: float        # ModelTrainer casts this with float() before use
    logging_steps: int
    output_dir: str
    optim: str
    save_strategy: str

    # --- peft.LoraConfig values ---
    r: int
    lora_alpha: int
    lora_dropout: float         # a probability, not an integer
    bias: str                   # forwarded as LoraConfig(bias=...)
    task_type: str              # forwarded as LoraConfig(task_type=...)

    # --- transformers.BitsAndBytesConfig values ---
    load_in_4bit: bool
    bnb_4bit_use_double_quant: bool
    bnb_4bit_quant_type: str

In [6]:
# Configuration manager
from exoplanet_chatbot.constants import *
from exoplanet_chatbot.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Reads the project's YAML settings and turns them into typed stage configs."""

    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: location of the config YAML (paths and checkpoints).
            params_filepath: location of the params YAML (hyperparameters).
        """
        # The objects returned by read_yaml are accessed with attribute syntax below.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble a ModelTrainerConfig from the loaded config and params.

        Creates the model-trainer root directory as a side effect.

        Returns:
            ModelTrainerConfig populated from the ``model_trainer`` section of the
            config file and the ``TrainingArguments``, ``LoraConfig`` and
            ``BitsandBytesConfig`` sections of the params file.
        """
        # Resolve all four sections first, so a missing section fails before
        # any directory is created.
        trainer_section = self.config.model_trainer
        training_section = self.params.TrainingArguments
        lora_section = self.params.LoraConfig
        quantization_section = self.params.BitsandBytesConfig

        create_directories([trainer_section.root_dir])

        # Each section is paired with the field names copied from it verbatim.
        field_sources = (
            (trainer_section, (
                "root_dir",
                "data_path",
                "tokenizer_ckpt",
                "model_ckpt",
                "model_save_path",
                "tokenizer_save_path",
            )),
            (training_section, (
                "warmup_steps",
                "per_device_train_batch_size",
                "gradient_accumulation_steps",
                "max_steps",
                "learning_rate",
                "logging_steps",
                "output_dir",
                "optim",
                "save_strategy",
            )),
            (lora_section, (
                "r",
                "lora_alpha",
                "lora_dropout",
                "bias",
                "task_type",
            )),
            (quantization_section, (
                "load_in_4bit",
                "bnb_4bit_use_double_quant",
                "bnb_4bit_quant_type",
            )),
        )

        collected = {}
        for section, field_names in field_sources:
            for field_name in field_names:
                collected[field_name] = getattr(section, field_name)

        return ModelTrainerConfig(**collected)

In [7]:
# Model Trainer
import pandas as pd
import torch
from trl import SFTTrainer
from datasets import Dataset
import bitsandbytes as bnb
from transformers import TrainingArguments, TrainerCallback, DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

class DebugCallback(TrainerCallback):
    """Trainer callback that prints the step number and latest logged training loss."""

    def on_step_end(self, args, state, control, **kwargs):
        """Print progress after each optimisation step.

        Scans ``state.log_history`` from newest to oldest for a record that has
        a ``'loss'`` key. Records without that key are skipped instead of
        raising ``KeyError``. Prints ``N/A`` when no loss has been logged yet.
        """
        latest_loss = next(
            (record["loss"] for record in reversed(state.log_history) if "loss" in record),
            "N/A",
        )
        print(f"Step: {state.global_step}, Loss: {latest_loss}")

class ModelTrainer:
    """Fine-tunes a 4-bit quantised causal LM with LoRA and saves the merged model.

    The constructor loads the quantised base model and tokenizer; ``train`` runs
    the whole flow: tokenise the CSV data, wrap the model with LoRA adapters,
    train with ``SFTTrainer``, save the adapter, then merge it into a float16
    copy of the base model and save that together with the tokenizer.
    """

    # Seed shared by the dataset shuffle and the train/test split so that the
    # same rows land in the same split on every run.
    SEED = 1234

    # Directory the LoRA adapter weights are written to before merging.
    ADAPTER_DIR = "gemma-Exochat-Instruct-Finetune-Step10"

    def __init__(self,config: ModelTrainerConfig):
        """Load the quantised base model and its tokenizer.

        Args:
            config: settings for checkpoints, paths and hyperparameters.
        """
        self.config = config
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=self.config.load_in_4bit,
            bnb_4bit_use_double_quant=self.config.bnb_4bit_use_double_quant,
            bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        # device_map={"": 0} places the whole model on device index 0.
        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.model_ckpt,
            quantization_config=self.bnb_config,
            device_map={"":0},
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_ckpt)

    # Dataset Train Test Split
    def train_test_split(self, data, test_size=0.2, seed=SEED):
        """Split a ``datasets.Dataset`` into train and test parts.

        Args:
            data: dataset exposing ``train_test_split``.
            test_size: fraction of rows placed in the test split.
            seed: seed for the split's own shuffle, so the split is reproducible.

        Returns:
            Tuple ``(train_data, test_data)``.
        """
        splits = data.train_test_split(test_size=test_size, seed=seed)
        return (splits["train"], splits["test"])

    # Dataset Creating and Tokenization for Finetuning
    def transform_and_tokenize(self):
        """Load the fine-tuning CSV, shuffle it, tokenise ``prompt`` and split it.

        Returns:
            Tuple ``(train_dataset, test_dataset)`` of tokenised datasets.
        """
        # Loading the finetuning dataset
        finetune_dataframe = pd.read_csv(self.config.data_path)

        # Converting the dataframe to dataset
        finetune_dataset = Dataset.from_pandas(finetune_dataframe)

        # Shuffling and Tokenization
        finetune_dataset = finetune_dataset.shuffle(seed=self.SEED)
        # NOTE(review): SFTTrainer is also given dataset_text_field="prompt" in
        # train(), so it may tokenise again; confirm this step is still needed.
        finetune_dataset = finetune_dataset.map(
            lambda samples: self.tokenizer(samples["prompt"]), batched=True
        )

        # Train Test Split of Dataset
        return self.train_test_split(finetune_dataset)

    # Function for preparing the linear layers for training in LoRa
    def find_all_linear_names(self,model):
        """Collect the leaf names of every 4-bit linear layer in ``model``.

        ``lm_head`` is excluded from the result.

        Returns:
            List of unique module names, in no particular order.
        """
        linear_cls = bnb.nn.Linear4bit
        # The last dotted component is the layer's own name. For an undotted
        # name it is the name itself, so no special case is needed.
        lora_module_names = {
            name.split('.')[-1]
            for name, module in model.named_modules()
            if isinstance(module, linear_cls)
        }
        lora_module_names.discard('lm_head')  # needed for 16-bit
        return list(lora_module_names)

    # LoRa Configuration for training
    def Lora_config(self):
        """Prepare ``self.model`` for k-bit training and wrap it with LoRA adapters.

        Replaces ``self.model`` with the PEFT-wrapped model.

        Returns:
            Tuple ``(model, lora_config)``.
        """
        self.model.gradient_checkpointing_enable()
        self.model = prepare_model_for_kbit_training(self.model)

        modules = self.find_all_linear_names(self.model)

        lora_config = LoraConfig(
            # NOTE(review): the earlier note said to keep r at twice lora_alpha;
            # the usual heuristic is the reverse (lora_alpha about 2 * r).
            # Confirm against the values in params.yaml.
            r=self.config.r,
            lora_alpha=self.config.lora_alpha,
            target_modules=modules,
            lora_dropout=self.config.lora_dropout,
            bias=self.config.bias,
            task_type=self.config.task_type
        )

        self.model = get_peft_model(self.model, lora_config)

        return (self.model,lora_config)

    def Model_Config(self):
        """Build the ``TrainingArguments`` from the configured hyperparameters."""
        training_args = TrainingArguments(
            warmup_steps = self.config.warmup_steps,
            per_device_train_batch_size = self.config.per_device_train_batch_size,
            gradient_accumulation_steps = self.config.gradient_accumulation_steps,
            max_steps = self.config.max_steps,
            # Explicit cast; presumably the YAML loader can deliver values such
            # as 2e-4 as strings (TODO confirm).
            learning_rate = float(self.config.learning_rate),
            logging_steps = self.config.logging_steps,
            output_dir = self.config.output_dir,
            optim = self.config.optim,
            save_strategy = self.config.save_strategy
        )

        return training_args

    def tokenizer_and_model_save(self,model):
        """Merge a saved LoRA adapter into the base model and save model and tokenizer.

        Args:
            model: identifier or path of the saved adapter, as accepted by
                ``PeftModel.from_pretrained`` (``train`` passes the adapter
                directory name).
        """
        # A fresh float16 copy of the base checkpoint is loaded for the merge.
        base_model = AutoModelForCausalLM.from_pretrained(
            self.config.model_ckpt,
            low_cpu_mem_usage=True,
            return_dict=True,
            torch_dtype=torch.float16,
            device_map={"": 0},
        )
        merged_model= PeftModel.from_pretrained(base_model, model)
        merged_model= merged_model.merge_and_unload()

        # Set the padding options BEFORE saving so they are in effect when the
        # tokenizer is written; previously they were applied after the save.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        # Save the merged model
        merged_model.save_pretrained(self.config.model_save_path,safe_serialization=True)
        self.tokenizer.save_pretrained(self.config.tokenizer_save_path)

    def train(self):
        """Run the full fine-tuning flow and save the resulting model and tokenizer."""
        self.tokenizer.pad_token = self.tokenizer.eos_token
        torch.cuda.empty_cache()

        # Loading the tokenized datasets
        train_dataset_tokenized , validation_dataset_tokenized = self.transform_and_tokenize()

        # Loading the LoRa configured model
        model,lora_config = self.Lora_config()

        # Setting the Trainer
        trainer = SFTTrainer(
            model=model,
            train_dataset=train_dataset_tokenized,
            eval_dataset=validation_dataset_tokenized,
            dataset_text_field="prompt",
            peft_config=lora_config,
            args=self.Model_Config(),
            data_collator=DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
            callbacks=[DebugCallback()]
        )

        # Training the model
        model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
        trainer.train()

        # Save the adapter first, then merge it into the base model and save
        # the merged model and the tokenizer.
        new_model = self.ADAPTER_DIR
        trainer.model.save_pretrained(new_model)
        self.tokenizer_and_model_save(new_model)


[2024-06-16 23:40:48,218: INFO: config: PyTorch version 2.2.2+cu121 available.]
[2024-06-16 23:40:48,220: INFO: config: TensorFlow version 2.16.1 available.]


In [8]:
#Pipeline
# Build the trainer configuration, construct the trainer and run fine-tuning.
# Exceptions propagate unchanged; the former `except Exception as e: raise e`
# wrapper handled nothing and only added a frame to the traceback.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config() # Storing the configuration
model_training = ModelTrainer(config=model_trainer_config) # Using the configuration saved earlier to call model_training
model_training.train()

[2024-06-16 23:40:48,782: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-16 23:40:48,788: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-16 23:40:48,789: INFO: common: created directory at: artifacts]
[2024-06-16 23:40:48,791: INFO: common: created directory at: artifacts/model_trainer]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/101484 [00:00<?, ? examples/s]



Map:   0%|          | 0/81187 [00:00<?, ? examples/s]

Map:   0%|          | 0/20297 [00:00<?, ? examples/s]



  0%|          | 0/10 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step: 1, Loss: N/A
{'loss': 3.386, 'grad_norm': 2.501392364501953, 'learning_rate': 0.0002, 'epoch': 0.0}
Step: 2, Loss: 3.386
{'loss': 3.5357, 'grad_norm': 2.5799057483673096, 'learning_rate': 0.00017777777777777779, 'epoch': 0.0}
Step: 3, Loss: 3.5357
{'loss': 2.5002, 'grad_norm': 1.47601318359375, 'learning_rate': 0.00015555555555555556, 'epoch': 0.0}
Step: 4, Loss: 2.5002
{'loss': 2.0814, 'grad_norm': 1.46877121925354, 'learning_rate': 0.00013333333333333334, 'epoch': 0.0}
Step: 5, Loss: 2.0814
{'loss': 1.7645, 'grad_norm': 1.7003042697906494, 'learning_rate': 0.00011111111111111112, 'epoch': 0.0}
Step: 6, Loss: 1.7645
{'loss': 1.5005, 'grad_norm': 1.3736467361450195, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.0}
Step: 7, Loss: 1.5005
{'loss': 1.3533, 'grad_norm': 1.3190935850143433, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.0}
Step: 8, Loss: 1.3533
{'loss': 1.1126, 'grad_norm': 1.1966962814331055, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.0}
Step: 9, Los

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]