In [1]:
import os
# Move the working directory up one level so that the relative paths used later
# in this notebook (e.g. config/config.yaml, artifacts/...) resolve.
# NOTE: the path is relative, so re-running this cell in the same kernel moves
# up one more level each time; run it exactly once per kernel session.
os.chdir("../")

In [7]:
from cvClassifier import logger
from cvClassifier.utils.common import get_size, read_yaml, create_directories 
from cvClassifier.constants import *

In [55]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """Immutable bundle of paths and hyper-parameters for the model training stage."""

    # Directory that holds the artifacts of the training stage.
    root_dir: Path
    # Location of the serialized base model that training starts from.
    updated_base_model_path: Path
    # Root folder of the training images (read with an ImageFolder-style layout).
    training_data_path: Path
    # Root folder of the validation images.
    validation_data_path: Path
    # Destination file for the trained model.
    trained_model_path: Path
    # Number of training epochs.
    params_epochs: int
    # Batch size used by both data loaders.
    params_batch_size: int
    # Whether random augmentation is applied to the training images.
    params_is_augmentation: bool
    # Sequence describing the input size; consumers slice it with [:-1] to get
    # the resize target, so it must be a sequence, not a single int
    # (presumably [height, width, channels] -- confirm against params.yaml).
    params_image_size: list
    # Learning rate handed to the optimizer.
    params_learning_rate: float


In [None]:
class ConfigurationManager:
    """Reads the project config/params YAML files and builds the model training config.

    Args:
        config_filepath: Path of the main configuration YAML (defaults to CONFIG_FILE_PATH).
        params_filepath: Path of the hyper-parameter YAML (defaults to PARAMS_FILE_PATH).
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Make sure the top-level artifacts directory is in place before any stage runs.
        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        ''' Gets the config details for the model training pipeline '''
        config = self.config.model_training
        params = self.params

        create_directories([config.root_dir])

        # The YAML values are wrapped in Path so the resulting object matches
        # the Path-typed fields declared on ModelTrainingConfig.
        model_training_config = ModelTrainingConfig(
            root_dir = Path(config.root_dir),
            updated_base_model_path = Path(config.updated_base_model_path),
            training_data_path = Path(config.training_data),
            validation_data_path = Path(config.validation_data),
            trained_model_path = Path(config.trained_model_path),
            params_epochs = params.EPOCHS,
            params_batch_size = params.BATCH_SIZE,
            params_is_augmentation = params.AUGMENTATION,
            params_image_size = params.IMAGE_SIZE,
            params_learning_rate = params.LEARNING_RATE
        )

        return model_training_config

In [50]:
import urllib.request as requests
from zipfile import ZipFile
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
import pytorch_lightning as pl


[2025-07-05 17:26:42,066: INFO: font_manager]: Failed to extract font properties from /System/Library/Fonts/Supplemental/NISC18030.ttf: In FT2Font: Could not set the fontsize (invalid pixel size; error code 0x17)
[2025-07-05 17:26:42,195: INFO: font_manager]: Failed to extract font properties from /System/Library/Fonts/LastResort.otf: tuple indices must be integers or slices, not str
[2025-07-05 17:26:42,248: INFO: font_manager]: Failed to extract font properties from /System/Library/Fonts/Apple Color Emoji.ttc: In FT2Font: Could not set the fontsize (invalid pixel size; error code 0x17)
[2025-07-05 17:26:42,257: INFO: font_manager]: generated new fontManager


In [None]:
class LightningModel(pl.LightningModule):
    """Lightning wrapper that trains and evaluates a classifier with cross-entropy loss and SGD.

    Args:
        model: Module called on the batch inputs; its output is passed to
            ``nn.CrossEntropyLoss`` and to ``argmax(dim=1)`` for accuracy.
        learning_rate: Learning rate handed to ``torch.optim.SGD``.
    """

    def __init__(self, model, learning_rate=0.01):
        super().__init__()
        self.model = model
        self.learning_rate = learning_rate
        self.criterion = nn.CrossEntropyLoss()
        # Per-batch test metrics, filled by test_step and averaged/cleared in
        # on_test_epoch_end. It must exist before the first test batch,
        # otherwise test_step fails with AttributeError.
        self.test_step_outputs = []

    def _loss_and_accuracy(self, batch):
        """Run the model on one (inputs, labels) batch and return (loss, accuracy)."""
        inputs, labels = batch
        outputs = self.model(inputs)
        loss = self.criterion(outputs, labels)
        acc = (outputs.argmax(dim=1) == labels).float().mean()
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log the training loss/accuracy of the batch and return the loss."""
        loss, acc = self._loss_and_accuracy(batch)

        self.log('train_loss', loss)
        self.log('train_acc', acc)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss/accuracy of the batch."""
        loss, acc = self._loss_and_accuracy(batch)

        self.log('val_loss', loss)
        self.log('val_acc', acc)

    def test_step(self, batch, batch_idx):
        """Log the test loss/accuracy of the batch, keep them for the epoch average and return them."""
        loss, acc = self._loss_and_accuracy(batch)

        # Store outputs for epoch-level metrics
        self.test_step_outputs.append({'test_loss': loss, 'test_acc': acc})

        self.log('test_loss', loss, on_step=True, on_epoch=True)
        self.log('test_acc', acc, on_step=True, on_epoch=True)

        return {'test_loss': loss, 'test_acc': acc}

    def configure_optimizers(self):
        """Return a plain SGD optimizer over all parameters of this module."""
        return torch.optim.SGD(self.parameters(), lr=self.learning_rate)

    def on_test_epoch_end(self):
        """Log the mean of the collected per-batch test metrics, then reset the collection."""
        # Calculate average metrics
        if self.test_step_outputs:
            avg_loss = torch.stack([x['test_loss'] for x in self.test_step_outputs]).mean()
            avg_acc = torch.stack([x['test_acc'] for x in self.test_step_outputs]).mean()

            self.log('avg_test_loss', avg_loss)
            self.log('avg_test_acc', avg_acc)

            # Clear the list for next epoch
            self.test_step_outputs.clear()

    

In [None]:
class ModelTraining:
    """Loads the prepared base model, builds the data loaders and trains with PyTorch Lightning.

    The methods are meant to be called in this order:
    ``get_base_model()`` -> ``train_valid_generator()`` -> ``train()``;
    ``train()`` reads ``self.model``, ``self.train_loader`` and
    ``self.valid_loader``, which the first two calls create.

    Args:
        config: Paths and hyper-parameters of the training stage.
    """

    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        logger.info(f"Using device: {self.device}")


    def get_base_model(self):
        """Load the serialized base model onto ``self.device`` and keep it in ``self.model``."""
        # The file holds a whole pickled module (see save_model), not a
        # state_dict, so weights_only must be False; leaving it implicit breaks
        # on PyTorch releases where the default is True.
        # SECURITY: this unpickles arbitrary objects -- only load files that
        # were produced by this project's own pipeline.
        self.model = torch.load(
            self.config.updated_base_model_path,
            map_location=self.device,
            weights_only=False,
        )
        self.model.to(self.device)

        logger.info(f"Model loaded from {self.config.updated_base_model_path}")

    def train_valid_generator(self):
        """Create ``self.train_loader`` / ``self.valid_loader`` and record the dataset sizes."""
        # Resize target: the configured image size without its last entry
        # (presumably the channel count -- confirm against params.yaml).
        target_size = self.config.params_image_size[:-1]
        # Pinned host memory only matters when batches are copied to a CUDA device.
        use_pinned_memory = self.device.type == 'cuda'

        # preparing the validation dataset
        valid_transforms = transforms.Compose([
            transforms.Resize(target_size),  # Resize to target size
            transforms.ToTensor(),  # Converts to tensor and scales to [0,1] (equivalent to rescale=1./255)
        ])

        # preparing the training dataset
        if self.config.params_is_augmentation:
            train_transforms = transforms.Compose([
                transforms.Resize(target_size),
                transforms.RandomRotation(40),  # rotation_range=40
                transforms.RandomHorizontalFlip(p=0.5),  # horizontal_flip=True
                transforms.RandomAffine(
                    degrees=0,
                    translate=(0.2, 0.2),  # width_shift_range=0.2, height_shift_range=0.2
                    scale=(0.8, 1.2),  # zoom_range=0.2
                    shear=0.2  # shear_range=0.2
                ),
                transforms.ToTensor(),
            ])
        else:
            # Without augmentation the training images get the same deterministic pipeline.
            train_transforms = valid_transforms

        # load training dataset
        train_dataset = datasets.ImageFolder(
            root=self.config.training_data_path,
            transform=train_transforms
        )
        logger.info(f"Training dataset created from {self.config.training_data_path}")

        # load validation dataset
        valid_dataset = datasets.ImageFolder(
            root=self.config.validation_data_path,
            transform=valid_transforms
        )
        logger.info(f"Validation dataset created from {self.config.validation_data_path}")

        # Only the training data is shuffled; validation order stays fixed.
        self.train_loader = DataLoader(
            train_dataset,
            batch_size=self.config.params_batch_size,
            shuffle=True,
            num_workers=0,
            pin_memory=use_pinned_memory
        )

        self.valid_loader = DataLoader(
            valid_dataset,
            batch_size=self.config.params_batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=use_pinned_memory
        )

        self.train_dataset_size = len(train_dataset)
        self.valid_dataset_size = len(valid_dataset)

        logger.info(f"Training samples: {self.train_dataset_size}")
        logger.info(f"Validation samples: {self.valid_dataset_size}")
        logger.info(f"Number of classes: {len(train_dataset.classes)}")
        logger.info(f"Classes: {train_dataset.classes}")


    @staticmethod
    def save_model(path: Path, model: nn.Module):
        """Serialize the whole ``model`` object (not just its state_dict) to ``path``."""
        torch.save(model, path)


    def train(self):
        """Fit the model with a Lightning ``Trainer``, log the final metrics and save the model.

        Returns:
            The trainer's ``callback_metrics`` mapping after fitting.
        """
        # Create Lightning model
        lightning_model = LightningModel(
            model=self.model,
            learning_rate=self.config.params_learning_rate
        )

        # Create trainer with automatic logging and progress bars
        trainer = pl.Trainer(
            max_epochs=self.config.params_epochs,
            accelerator='auto',  # Automatically use GPU if available
            devices='auto',      # Use all available devices
            logger=True,         # Enable logging
            enable_progress_bar=True,
            enable_model_summary=True,
            enable_checkpointing=True,
            log_every_n_steps=50,
        )

        logger.info("Starting training with PyTorch Lightning...")

        # Lightning runs the training and validation loops for us.
        trainer.fit(
            model=lightning_model,
            train_dataloaders=self.train_loader,
            val_dataloaders=self.valid_loader
        )

        # Get final metrics
        train_metrics = trainer.callback_metrics

        logger.info("Training completed!")
        logger.info("=" * 60)
        logger.info("FINAL TRAINING METRICS:")

        # Print final metrics; tensors are unwrapped to plain floats for readable output
        for key, value in train_metrics.items():
            if isinstance(value, torch.Tensor):
                logger.info(f"{key}: {value.item():.4f}")
            else:
                logger.info(f"{key}: {value}")

        logger.info("=" * 60)

        # Save the trained model
        self.save_model(
            path=self.config.trained_model_path,
            model=lightning_model.model  # Extract the actual model
        )

        logger.info(f"Model trained and saved to {self.config.trained_model_path}")

        # Return training history for analysis
        return trainer.callback_metrics

'''
    def train(self):

        self.steps_per_epoch = len(self.train_loader)
        self.validation_steps = len(self.valid_loader)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.config.params_learning_rate)
        
        for epoch in range(self.config.params_epochs):
            # Training phase
            self.model.train()
            for inputs, labels in self.train_loader:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                optimizer.zero_grad()
                loss = criterion(self.model(inputs), labels)
                loss.backward()
                optimizer.step()
            
            # Validation phase
            self.model.eval()
            with torch.no_grad():
                for inputs, labels in self.valid_loader:
                    inputs, labels = inputs.to(self.device), labels.to(self.device)
                    criterion(self.model(inputs), labels) 
        
        logger.info(f"Training completed for {self.config.params_epochs} epochs")
        self.save_model(
            path=self.config.trained_model_path,
            model=self.model
        )

        logger.info(f"Model trained and saved to {self.config.trained_model_path}")
    '''

'\n    def train(self):\n\n        self.steps_per_epoch = len(self.train_loader)\n        self.validation_steps = len(self.valid_loader)\n        \n        criterion = nn.CrossEntropyLoss()\n        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.config.params_learning_rate)\n        \n        for epoch in range(self.config.params_epochs):\n            # Training phase\n            self.model.train()\n            for inputs, labels in self.train_loader:\n                inputs, labels = inputs.to(self.device), labels.to(self.device)\n                optimizer.zero_grad()\n                loss = criterion(self.model(inputs), labels)\n                loss.backward()\n                optimizer.step()\n            \n            # Validation phase\n            self.model.eval()\n            with torch.no_grad():\n                for inputs, labels in self.valid_loader:\n                    inputs, labels = inputs.to(self.device), labels.to(self.device)\n                    crit

In [61]:
# Run the model training stage end to end: build the config, load the base
# model, create the data loaders, then train and save.
try:
    config = ConfigurationManager()
    training_config = config.get_model_training_config()
    training = ModelTraining(config=training_config)
    training.get_base_model()
    training.train_valid_generator()
    training.train()

except Exception:
    # Record the failure through the project logger, then re-raise with a bare
    # `raise` so the original traceback is propagated unchanged.
    logger.exception("Model training pipeline failed")
    raise

[2025-07-06 00:07:48,112: INFO: common]: yaml file successfully load from config/config.yaml
[2025-07-06 00:07:48,117: INFO: common]: yaml file successfully load from params.yaml
[2025-07-06 00:07:48,118: INFO: common]: directory created at: artifacts
[2025-07-06 00:07:48,118: INFO: common]: directory created at: artifacts/model_training
[2025-07-06 00:07:48,119: INFO: 1726308249]: Using device: cpu
[2025-07-06 00:07:48,143: INFO: 1726308249]: Model loaded from artifacts/model_preparation/updated_base_model.pth
[2025-07-06 00:07:48,147: INFO: 1726308249]: Training dataset created from artifacts/data_ingestion/Data/train
[2025-07-06 00:07:48,149: INFO: 1726308249]: Validation dataset created from artifacts/data_ingestion/Data/valid
[2025-07-06 00:07:48,149: INFO: 1726308249]: Training samples: 613
[2025-07-06 00:07:48,150: INFO: 1726308249]: Validation samples: 72
[2025-07-06 00:07:48,150: INFO: 1726308249]: Number of classes: 4
[2025-07-06 00:07:48,150: INFO: 1726308249]: Classes: ['ad

  self.model = torch.load(self.config.updated_base_model_path, map_location=self.device)


Epoch 0: 100%|██████████| 39/39 [00:18<00:00,  2.06it/s, v_num=3]          [2025-07-06 00:08:09,282: INFO: fit_loop]: `Trainer.fit` stopped: `max_epochs=1` reached.
Epoch 0: 100%|██████████| 39/39 [00:18<00:00,  2.06it/s, v_num=3]
[2025-07-06 00:08:49,330: INFO: 1726308249]: Training completed!
[2025-07-06 00:08:49,332: INFO: 1726308249]: FINAL TRAINING METRICS:
[2025-07-06 00:08:49,332: INFO: 1726308249]: train_loss: 2.6374
[2025-07-06 00:08:49,333: INFO: 1726308249]: train_acc: 0.6000
[2025-07-06 00:08:49,333: INFO: 1726308249]: val_loss: 3.4198
[2025-07-06 00:08:49,333: INFO: 1726308249]: val_acc: 0.3472
[2025-07-06 00:08:49,410: INFO: 1726308249]: Model trained and saved to artifacts/model_training/model.h5
