In [1]:
import os

In [2]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier\\research'

In [3]:
# Step up one directory: the cell above reports a cwd ending in "research" and the
# cell below reports its parent (the project folder).
# Presumably needed so project-relative paths (e.g. the config YAML) resolve — confirm.
# NOTE(review): not idempotent — every re-run of this cell climbs one more level,
# so restart the kernel before running it again.
os.chdir('../')

In [4]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings consumed by the model-training stage.

    Field order matters: it defines the positional order of the generated
    ``__init__``. Instances are frozen, so attributes cannot be reassigned
    after construction.
    """

    # --- locations / checkpoints -------------------------------------------
    root_dir: Path              # stage directory; also used as the Trainer output_dir
    data_path_train: Path       # CSV read with pandas for the training split
    data_path_validation: Path  # CSV read with pandas for the validation split
    tokenizer_ckpt: Path        # handed to AutoTokenizer.from_pretrained
    model_ckpt: Path            # handed to AutoModelForSequenceClassification.from_pretrained

    # --- values forwarded unchanged to transformers.TrainingArguments ------
    num_train_epochs: int
    warmup_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    weight_decay: float
    eval_steps: int
    save_steps: int
    save_total_limit: int

In [6]:
# Configuration manager
from ToxicCommentClassifier.constants import *
from ToxicCommentClassifier.utils.common import read_yaml,create_directories

class ConfigurationManager:
    """Reads the project's config and params YAML files and builds stage configs."""

    def __init__(
            self,
            config_filepath=CONFIG_FILE_PATH,
            params_filepath=PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: location of the config YAML (paths and checkpoints).
            params_filepath: location of the params YAML (training hyper-parameters).
        """
        # The objects returned by read_yaml are accessed with attribute syntax below.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            ModelTrainerConfig populated from the ``model_trainer`` section of
            the config file and the ``TrainingArguments`` section of the params
            file.
        """
        trainer_section = self.config.model_trainer
        hyper_params = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        # Names listed in the same order as the dataclass fields they fill.
        path_fields = (
            "root_dir",
            "data_path_train",
            "data_path_validation",
            "tokenizer_ckpt",
            "model_ckpt",
        )
        hyper_fields = (
            "num_train_epochs",
            "warmup_steps",
            "per_device_train_batch_size",
            "per_device_eval_batch_size",
            "weight_decay",
            "eval_steps",
            "save_steps",
            "save_total_limit",
        )

        settings = {name: getattr(trainer_section, name) for name in path_fields}
        settings.update({name: getattr(hyper_params, name) for name in hyper_fields})

        return ModelTrainerConfig(**settings)

In [7]:
# Multi-Label Classification Evaluation Metrics
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, hamming_loss
from transformers import EvalPrediction
import torch

class MultiLabelMetric:
    """Multi-label classification metrics computed from raw model logits."""

    def multi_labels_metrics(self, predictions, labels, threshold=0.3):
        """Compute macro F1, macro ROC AUC and Hamming loss.

        Args:
            predictions: raw logits; a sigmoid is applied here.
                Presumably shaped (n_samples, n_labels) — confirm against caller.
            labels: ground-truth indicator values with the same layout as
                ``predictions``.
            threshold: probability at or above which a label counts as
                predicted. Only affects F1 and Hamming loss.

        Returns:
            dict with keys ``"roc_auc"``, ``"hamming_loss"`` and ``"f1"``.
        """
        probs = torch.sigmoid(torch.Tensor(predictions))

        # Hard 0/1 decisions for the threshold-dependent metrics.
        y_pred = (probs >= threshold).numpy().astype(float)
        # Continuous scores for the ranking metric.
        y_score = probs.numpy()
        y_true = labels

        f1 = f1_score(y_true, y_pred, average='macro')
        # ROC AUC must be fed scores, not thresholded decisions: with 0/1 inputs
        # the curve degenerates to a single operating point and the value would
        # change with `threshold`.
        roc_auc = roc_auc_score(y_true, y_score, average='macro')
        hamming = hamming_loss(y_true, y_pred)

        return {
            "roc_auc": roc_auc,
            "hamming_loss": hamming,
            "f1": f1
        }

    def compute_metrics(self, p: EvalPrediction):
        """Adapter with the signature expected by ``Trainer(compute_metrics=...)``.

        Args:
            p: evaluation output; when ``p.predictions`` is a tuple only its
                first element is used.

        Returns:
            The metrics dict produced by ``multi_labels_metrics``.
        """
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

        return self.multi_labels_metrics(predictions=preds, labels=p.label_ids)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: indexable collection of raw texts (each item is passed through ``str``).
        labels: indexable collection of per-text label values, aligned with ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=..., padding=...,
            max_length=..., return_tensors='pt')`` whose result is indexed by
            ``'input_ids'`` and ``'attention_mask'``.
        max_len: length every encoded text is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened encoding and label tensor for item ``idx``."""
        comment = str(self.texts[idx])
        target = torch.tensor(self.labels[idx])

        encoded = self.tokenizer(
            comment,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field of a sample is 1-D.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = target
        return sample

In [11]:
#Tokenization and Model Trainer

from transformers import Trainer, TrainingArguments
import pandas as pd
import ast

class ModelTrainer:
    """Fine-tunes a sequence-classification checkpoint for multi-label prediction.

    Args:
        config: ``ModelTrainerConfig`` holding data paths, checkpoint names and
            the values forwarded to ``TrainingArguments``.
    """

    # Size of the classification head built in train().
    NUM_LABELS = 6

    def __init__(self, config: ModelTrainerConfig):
        self.config = config
        self.multi_label_metric = MultiLabelMetric()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_ckpt)

    @staticmethod
    def _parse_label(raw):
        """Turn a stringified list such as ``"[1.0, 0.0]"`` into a list of floats."""
        # Splitting on "," (float() ignores surrounding blanks) accepts both
        # "1, 0" and "1,0"; the outer strip tolerates padding around the brackets.
        return [float(value) for value in raw.strip().strip('[]').split(',')]

    def _load_split(self, csv_path):
        """Read one CSV split and wrap it in a ``CustomDataset``."""
        frame = pd.read_csv(csv_path)
        texts = frame['comment_text'].to_list()
        labels = frame['label'].apply(self._parse_label).to_list()
        return CustomDataset(texts, labels, tokenizer=self.tokenizer)

    def tokenize(self):
        """Build the training and validation datasets.

        Reads the CSV files named in the config (columns ``comment_text`` and
        ``label``). Tokenization itself happens lazily inside ``CustomDataset``.

        Returns:
            tuple ``(train_dataset, validation_dataset)``.
        """
        train_tokenized_dataset = self._load_split(self.config.data_path_train)
        validation_tokenized_dataset = self._load_split(self.config.data_path_validation)

        return train_tokenized_dataset, validation_tokenized_dataset

    def train(self):
        """Fine-tune the model and save it, with its tokenizer, under ``root_dir``.

        Calls ``tokenize()`` itself, so callers do not need to call it first.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Loading the tokenized datasets
        train_dataset_tokenized, validation_dataset_tokenized = self.tokenize()

        # Loading the model
        distilbert_model = AutoModelForSequenceClassification.from_pretrained(
            self.config.model_ckpt,
            num_labels=self.NUM_LABELS,
            problem_type="multi_label_classification",
        ).to(device)

        # Loading the training arguments
        # NOTE(review): eval_steps is passed but no evaluation strategy is set,
        # so presumably no evaluation (and no compute_metrics call) happens
        # during training — confirm against the installed transformers version.
        training_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            per_device_eval_batch_size=self.config.per_device_eval_batch_size,
            weight_decay=self.config.weight_decay,
            save_steps=self.config.save_steps,
            eval_steps=self.config.eval_steps,
            save_total_limit=self.config.save_total_limit
        )

        # Trainer
        trainer = Trainer(
            model=distilbert_model,
            args=training_args,
            train_dataset=train_dataset_tokenized,
            eval_dataset=validation_dataset_tokenized,
            compute_metrics=self.multi_label_metric.compute_metrics,
        )

        # Model training
        trainer.train()

        # Saving model and tokenizer side by side so the directory can be
        # reloaded on its own with from_pretrained().
        save_dir = os.path.join(self.config.root_dir, "distilbert_full_finetuning_toxic_comments")
        distilbert_model.save_pretrained(save_dir)
        self.tokenizer.save_pretrained(save_dir)

In [12]:
#Pipeline
# Build the stage configuration, then fine-tune.
# train() calls tokenize() internally, so a separate tokenize() call would only
# re-read the CSV files and discard the result. Errors propagate unchanged; a
# try/except that merely re-raises adds nothing.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()  # Storing the configuration
model_training = ModelTrainer(config=model_trainer_config)  # Trainer built from that configuration
model_training.train()

[2024-04-18 16:15:38,477: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-18 16:15:38,479: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-18 16:15:38,481: INFO: common: created directory at: artifacts]
[2024-04-18 16:15:38,482: INFO: common: created directory at: artifacts/model_trainer]


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
                                        
  0%|          | 0/3245 [01:46<?, ?it/s]          

{'loss': 0.3111, 'grad_norm': 1.3416707515716553, 'learning_rate': 5e-05, 'epoch': 0.15}


                                        
  0%|          | 0/3245 [02:30<?, ?it/s]           

{'loss': 0.1837, 'grad_norm': 1.2588824033737183, 'learning_rate': 4.0892531876138436e-05, 'epoch': 0.31}


                                        
  0%|          | 0/3245 [03:23<?, ?it/s]           


{'loss': 0.1691, 'grad_norm': 0.32018765807151794, 'learning_rate': 3.178506375227687e-05, 'epoch': 0.46}


                                          9.18it/s][A
  0%|          | 0/3245 [04:19<?, ?it/s]           

{'loss': 0.1606, 'grad_norm': 1.545218586921692, 'learning_rate': 2.2677595628415303e-05, 'epoch': 0.62}


                                        
  0%|          | 0/3245 [05:18<?, ?it/s]           

{'loss': 0.1525, 'grad_norm': 1.7061657905578613, 'learning_rate': 1.3570127504553735e-05, 'epoch': 0.77}


                                        
  0%|          | 0/3245 [06:14<?, ?it/s]           

{'loss': 0.1539, 'grad_norm': 2.4876508712768555, 'learning_rate': 4.4626593806921675e-06, 'epoch': 0.92}


                                        
100%|██████████| 3245/3245 [05:38<00:00,  9.59it/s]


{'train_runtime': 338.2057, 'train_samples_per_second': 76.758, 'train_steps_per_second': 9.595, 'train_loss': 0.18546095320550246, 'epoch': 1.0}
