In [1]:
import os
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from textsummarizer.logging import logger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the kernel's current working directory before changing it.
%pwd

'e:\\PROJECTS\\ML\\Text-Summarizer\\expriments'

In [3]:
# Move the kernel up one directory level. The cells below load files through
# relative paths (e.g. the YAML config), so they presumably need to run from
# the project root rather than from the notebook's own folder -- TODO confirm.
%cd ..

e:\PROJECTS\ML\Text-Summarizer


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [4]:
# Confirm the working directory after the `%cd ..` above.
%pwd

'e:\\PROJECTS\\ML\\Text-Summarizer'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformation:
    """Tokenize a saved dataset split by split and write the results to disk.

    The dataset is read from ``data_path`` with ``load_from_disk``; every
    enabled split is tokenized with the tokenizer named ``tokenizer_name`` and
    saved under ``root_dir/<split>``.

    Attributes:
        root_dir: Directory that receives one sub-directory per tokenized split.
        data_path: Location passed to ``load_from_disk``.
        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        do_train: Process the ``"train"`` split when true.
        do_eval: Process the ``"validation"`` split when true.
        do_predict: Process the ``"test"`` split when true.
        text_column: Column holding the source text.
        summary_column: Column holding the target summary.
        max_source_length: ``max_length`` used when tokenizing source text.
        max_target_length: ``max_length`` used when tokenizing summaries.
        padding: Forwarded unchanged to the tokenizer's ``padding`` argument.
        ignore_pad_token_for_loss: When true *and* ``padding == "max_length"``,
            pad token ids in the labels are replaced by ``-100``.
    """

    root_dir: Path
    data_path: Path
    tokenizer_name: str
    do_train: bool = True
    do_eval: bool =  True
    do_predict: bool = True
    text_column: str = "dialogue"
    summary_column: str = "summary"
    max_source_length: int = 1024
    max_target_length: int = 128
    # NOTE: annotated as bool, but the value is handed straight to the
    # tokenizer and `preprocess_function` compares it against the string
    # "max_length", so a string value is expected to be valid here as well.
    padding: bool = False
    ignore_pad_token_for_loss: bool = False

    def convert(self):
        """Tokenize every enabled split and save it under ``root_dir``.

        The tokenizer is loaded once here and shared with every batch through
        ``fn_kwargs`` instead of being re-loaded for each batch.
        """
        samsum_dataset = load_from_disk(self.data_path)

        enabled_splits = [
            split
            for split, enabled in (
                ("train", self.do_train),
                ("validation", self.do_eval),
                ("test", self.do_predict),
            )
            if enabled
        ]
        if not enabled_splits:
            # Nothing to tokenize, so there is no reason to load a tokenizer.
            return

        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

        for split in enabled_splits:
            split_dataset = samsum_dataset[split]
            tokenized_split = split_dataset.map(
                self.preprocess_function,
                batched=True,
                fn_kwargs={"tokenizer": tokenizer},
                # Drop the raw columns: the mapped function may return fewer
                # rows than it received, which only works if they are removed.
                remove_columns=split_dataset.column_names,
                desc=f"Running tokenizer on {split} dataset",
            )
            tokenized_split.save_to_disk(os.path.join(self.root_dir, split))

    def preprocess_function(self, examples, tokenizer=None):
        """Tokenize one batch of text/summary pairs.

        Args:
            examples: Mapping of column name to a list of values for the batch.
            tokenizer: Optional, already-loaded tokenizer. When omitted, the
                tokenizer named ``tokenizer_name`` is loaded on the spot
                (the original behaviour, kept for direct callers).

        Returns:
            The tokenizer output for the source texts with an extra
            ``"labels"`` entry holding the token ids of the summaries.
        """
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

        # Keep only pairs where both sides are truthy; this drops None and
        # also empty strings.
        kept_pairs = [
            (text, summary)
            for text, summary in zip(examples[self.text_column], examples[self.summary_column])
            if text and summary
        ]
        inputs = [text for text, _ in kept_pairs]
        targets = [summary for _, summary in kept_pairs]

        model_inputs = tokenizer(inputs, max_length=self.max_source_length, padding=self.padding, truncation=True)

        # Tokenize targets with the `text_target` keyword argument
        labels = tokenizer(text_target=targets, max_length=self.max_target_length, padding=self.padding, truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if self.padding == "max_length" and self.ignore_pad_token_for_loss:
            pad_id = tokenizer.pad_token_id
            labels["input_ids"] = [
                [(token_id if token_id != pad_id else -100) for token_id in label]
                for label in labels["input_ids"]
            ]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
    

In [6]:
from textsummarizer.constants import *
from textsummarizer.utils.common import read_yaml, create_directories


In [7]:
class ConfigHandler:
    """Load the project's YAML settings and build stage configuration objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH):
        """
        Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Path to the configuration file (default: CONFIG_FILE_PATH)
            params_filepath: Path to the parameters file (default: PARAMS_FILE_PATH)

        Returns:
            None

        """
        self.config = self._load_config(config_filepath)
        self.params = self._load_params(params_filepath)

        # The artifacts root is created up front, before any stage is configured.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def _load_config(self, config_filepath):
        """
        Read the configuration YAML file.

        Args:
            config_filepath: Path to the configuration file

        Returns:
            Whatever `read_yaml` returns for that file; the rest of this class
            reads it through attribute access (e.g. `.artifacts_root`).

        """
        loaded_config = read_yaml(config_filepath)
        return loaded_config

    def _load_params(self, params_filepath):
        """
        Read the parameters YAML file.

        Args:
            params_filepath: Path to the parameters file

        Returns:
            Whatever `read_yaml` returns for that file; the rest of this class
            reads it through attribute access (e.g. `.CommonParams`).

        """
        loaded_params = read_yaml(params_filepath)
        return loaded_params

    
    def get_data_transformation_config(self) -> DataTransformation:
        """
        Build the `DataTransformation` settings for the data-transformation stage.

        Paths and the tokenizer name come from the `data_transformation` section
        of the configuration; every remaining field comes from `CommonParams` in
        the parameters file. The stage's root directory is created as a side effect.

        Returns:
            A populated `DataTransformation` instance

        """
        stage_section = self.config.data_transformation
        shared_params = self.params.CommonParams

        create_directories([stage_section.root_dir])

        # Fields copied one-to-one from CommonParams, in constructor order.
        shared_field_names = (
            "do_train",
            "do_eval",
            "do_predict",
            "text_column",
            "summary_column",
            "max_source_length",
            "max_target_length",
            "padding",
            "ignore_pad_token_for_loss",
        )

        return DataTransformation(
            root_dir=stage_section.root_dir,
            data_path=stage_section.data_path,
            tokenizer_name=stage_section.tokenizer_name,
            **{name: getattr(shared_params, name) for name in shared_field_names},
        )

In [8]:
# Run the data-transformation stage end to end: load the YAML settings, build
# the stage configuration, then tokenize and save every enabled split.
# No try/except wrapper: catching an exception only to `raise e` again changes
# nothing except adding a frame to the traceback, so failures simply propagate.
config = ConfigHandler()
data_transformation_config = config.get_data_transformation_config()
data_transformation_config.convert()

[2024-04-22 12:00:05,546: INFO: common: Yaml file: config\config.yaml loaded successfully]
[2024-04-22 12:00:05,566: INFO: common: Yaml file: params.yaml loaded successfully]
[2024-04-22 12:00:05,571: INFO: common: Created directory at: artifacts]
[2024-04-22 12:00:05,581: INFO: common: Created directory at: artifacts/data_transformation]


  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
Running tokenizer on train dataset: 100%|██████████| 14732/14732 [01:41<00:00, 144.53 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14731/14731 [00:00<00:00, 83542.07 examples/s]
Running tokenizer on validation dataset: 100%|██████████| 818/818 [00:05<00:00, 158.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 19025.48 examples/s]
Running tokenizer on test dataset: 100%|██████████| 819/819 [00:11<00:00, 72.43 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 32761.44 examples/s]
