In [1]:
import os

In [2]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier\\research'

In [6]:
os.chdir("../")

In [7]:
%pwd

'e:\\Projects for portfolio\\Toxic Comment Classifier'

In [8]:
# Entity
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig: # defined for the config components present in artifacts for data transformation
    root_dir : Path 
    data_path : Path
    tokenizer_name : Path

In [9]:
# Configuration manager
from ToxicCommentClassifier.constants import *
from ToxicCommentClassifier.utils.common import read_yaml,create_directories

class ConfigurationManager:
    def __init__(
            self,
            config_filepath = CONFIG_FILE_PATH,
            params_filepath = PARAMS_FILE_PATH):
    # Here we are reading the yaml file and we can now use the file paths present inside pararms and config.yaml        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root]) # Here we are calling the artifacts_root key values using '.' , which was the purpose of @ensure_annotations

    def get_data_transformation_config(self) -> DataTransformationConfig: # Here we are using the entity to specify the return type classes to make sure proper output is returned
        config= self.config.data_transformation # Calling the data_validation dictionary created in config.yaml file

        create_directories([config.root_dir]) # Creating a directory using the root directory

        data_transformation_config = DataTransformationConfig( # Extracting the values from the config.yaml to here inside data_ingestion_config
            root_dir=config.root_dir,
            data_path=config.data_path,
            tokenizer_name=config.tokenizer_name
        )

        return data_transformation_config

In [15]:
#Data Transformation
import pickle
import torch
import pandas as pd
from ToxicCommentClassifier.logging import logger
from transformers import AutoTokenizer

In [21]:
class DataTransformation:
    def __init__(self,config:DataTransformationConfig): # It will take the configuration from DataIngestionConfig defined earlier , which will in turn use Configuration Manager to take data from config.yaml
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def load_dataset(self): # Used for tokenization of input text field

        # Loading the dataset
        with open(self.config.data_path, "rb") as f:
            dataset = pickle.load(f)
            return dataset
        
    def transform(self):

        label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        dataset = self.load_dataset()

        # Creating seperate train and validation datasets and concatenating the label columns into a list
        dataset_train = pd.DataFrame(dataset['train'])
        dataset_validation = pd.DataFrame(dataset['validation'])
        
        dataset_train['label'] = dataset_train[label_columns].astype('float32').apply(list, axis=1)
        dataset_train = dataset_train[['comment_text','label']]
        
        dataset_validation['label'] = dataset_validation[label_columns].astype('float32').apply(list, axis=1)
        dataset_validation = dataset_validation[['comment_text','label']]

        return dataset_train,dataset_validation

    def tokenize(self,example_batch):

        encodings = self.tokenizer(example_batch['comment_text'],max_length=1024,padding=True,truncation=True)

        label_tensors = torch.tensor(example_batch['label'])

        return {'input_ids': encodings['input_ids'],
                'attention_mask': encodings['attention_mask'],
                'labels': label_tensors}

    def convert(self):
        train_dataset , validation_dataset = self.transform()

        #Tokenizing the datasets
        train_dataset_tokenized = train_dataset.map(self.tokenize)
        validation_dataset_tokenized = validation_dataset.map(self.tokenize)

        #Saving the tokenized datasets
        train_dataset_tokenized.save_to_disk(self.config.root_dir / 'train_tokenized.pt')
        validation_dataset_tokenized.save_to_disk(self.config.root_dir / 'validation_tokenized.pt')

In [22]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config() # Storing the configuration
    data_transformation = DataTransformation(config=data_transformation_config) # Using the configuration saved earlier to call model_transformation
    data_transformation.convert()
except Exception as e:
    raise e

[2024-04-16 01:22:04,971: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-16 01:22:04,974: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-16 01:22:04,974: INFO: common: created directory at: artifacts]
[2024-04-16 01:22:04,974: INFO: common: created directory at: artifacts/data_transformation]


TypeError: string indices must be integers