In [5]:
import os 

In [6]:
os.chdir('../')

In [3]:
os.getcwd()

'c:\\Users\\Admin\\Documents\\Udemy MLops course\\Summarizer using HuggingFace'

In [18]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationConfig:
    """Data Transformation Configurations"""
    root_dir: Path 
    transformed_data_dir: Path
    tokenizer_name: str
    


In [19]:
from src.textsummarizer.constants import *
from src.textsummarizer.utils.common import create_directories, read_yaml

In [20]:
config = read_yaml(CONFIG_FILE_PATH)
config = config.data_transformation 
print(config)

2025-11-10 12:37:21,224 - INFO -  File contents : ]
2025-11-10 12:37:21,225 - INFO - YAML file: config\config.yaml loaded successfully]
{'root_dir': 'artifacts/data_transformation', 'transformed_data_dir': 'artifacts/data_transformation/transformed_data', 'tokenizer_name': 'google/pegasus-cnn_dailymail'}


In [33]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=Path(config.root_dir),
            transformed_data_dir=Path(config.transformed_data_dir),
            tokenizer_name=config.tokenizer_name)
        return data_transformation_config
    

In [34]:
from src.textsummarizer.logging import logger
from transformers import AutoTokenizer
from datasets import load_from_disk


In [37]:
class DataTrasnformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)

    def convert_examples_to_features(self,example_batch):
        input_encodings = self.tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )

        with self.tokenizer.as_target_tokenizer():
            target_encodings = self.tokenizer(example_batch['summary'], max_length = 128, truncation = True )

        return {
            'input_ids' : input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }
    
    def convert(self):
        dataset_samsum = load_from_disk(self.config.transformed_data_dir)
        logger.info(f"Dataset loaded from disk : {self.config.transformed_data_dir}")
        dataset_samsum_encoded = dataset_samsum.map(self.convert_examples_to_features, batched = True)
        dataset_samsum_encoded.save_to_disk(self.config.root_dir)
        logger.info(f"Dataset saved to disk : {self.config.transformed_data_dir}")



In [38]:
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTrasnformation(config = data_transformation_config)
data_transformation.convert()

2025-11-10 13:06:57,164 - INFO -  File contents : ]
2025-11-10 13:06:57,165 - INFO - YAML file: config\config.yaml loaded successfully]
2025-11-10 13:06:57,167 - INFO -  File contents : ]
2025-11-10 13:06:57,168 - INFO - YAML file: config\params.yaml loaded successfully]
2025-11-10 13:06:57,171 - INFO - Directory created at: artifacts]
2025-11-10 13:06:57,172 - INFO - Directory created at: artifacts/data_transformation]
2025-11-10 13:06:58,671 - INFO - Dataset loaded from disk : artifacts\data_ingestion\unzip_data\samsum_dataset]


Map: 100%|██████████| 14732/14732 [00:03<00:00, 3702.18 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 3150.99 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 3164.27 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 409563.84 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 88145.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 41249.17 examples/s]

2025-11-10 13:07:03,412 - INFO - Dataset saved to disk : artifacts\data_ingestion\unzip_data\samsum_dataset]



