config.yaml

```
data_transformation:
   root_dir: artifacts/data_transformation
   data_path: artifacts/data_ingestion/samsum-test.csv
   tokenizer_name: google/pegasus-cnn_dailymail # Pegasus checkpoint whose tokenizer is used for abstractive summarization
   ```

# ✅ Summary


This notebook implements the data transformation stage of the pipeline:

1. Loading a preprocessed text dataset (e.g., dialogues and summaries).

2. Tokenizing the input (dialogue) and output (summary) text.

3. Converting the dataset into a format suitable for training a transformer model (e.g., Pegasus, BART, T5).

4. Saving the processed dataset to disk.

In [1]:
from textSummarizer.config import *
from textSummarizer.constants import *
from textSummarizer.logging import logger
from textSummarizer.entity import DataTransformationConfig
from textSummarizer.utils.common import read_yaml,create_directories, get_size

from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk
import os

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-26 11:20:41,101: INFO: config: PyTorch version 2.7.0+cu126 available.]
[2025-04-26 11:20:41,105: INFO: config: TensorFlow version 2.19.0 available.]


In [5]:
class ConfigurationManager:
    """Read the project YAML files and build per-stage configuration objects.

    On construction the config and params files are parsed and the
    top-level artifacts directory named by ``artifacts_root`` is created.
    """

    def __init__(self, config_path = CONFIG_FILE_PATH, params_path = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_path: Path of the main configuration YAML file.
            params_path: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_path)
        # Read the caller-supplied path (not the module-level default) so
        # that the `params_path` argument actually takes effect.
        self.params = read_yaml(params_path)

        # Create the top-level artifacts directory.
        create_directories([self.config.artifacts_root])

    def get_data_transformation(self) -> DataTransformationConfig:
        """Build the configuration for the data transformation stage.

        Reads the ``data_transformation`` section of the loaded config,
        creates that stage's ``root_dir``, and copies ``root_dir``,
        ``data_path`` and ``tokenizer_name`` into the result.

        Returns:
            A ``DataTransformationConfig`` populated from the config file.
        """
        stage_config = self.config.data_transformation

        # Create the data_transformation output directory.
        create_directories([stage_config.root_dir])

        return DataTransformationConfig(
            root_dir=stage_config.root_dir,
            data_path=stage_config.data_path,
            tokenizer_name=stage_config.tokenizer_name,
        )


In [20]:
class DataTransformation:
    """Tokenize a dialogue/summary CSV dataset and save the result to disk."""

    def __init__(self, config: DataTransformationConfig):
        """Store the stage config and load the tokenizer it names.

        Args:
            config: Supplies ``tokenizer_name``, ``data_path`` and ``root_dir``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_feature(self, example_batch):
        """Tokenize one batch of examples.

        Args:
            example_batch: Mapping with a ``'dialogue'`` entry (model input
                text) and a ``'summary'`` entry (target text).

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` from the
            tokenized dialogues and ``'label'`` holding the token ids of
            the tokenized summaries.
        """
        # Dialogues are the model input, truncated to 256 tokens.
        input_encodings = self.tokenizer(example_batch['dialogue'], max_length=256, truncation=True)

        # Summaries are the targets, truncated to 128 tokens. `text_target`
        # is the supported replacement for the deprecated
        # `as_target_tokenizer()` context manager.
        target_encodings = self.tokenizer(
            text_target=example_batch['summary'], max_length=128, truncation=True
        )

        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            # NOTE(review): Hugging Face seq2seq models take targets under
            # the name `labels`; confirm the training stage handles this
            # `label` column before renaming it.
            'label': target_encodings['input_ids'],
        }

    def convert(self):
        """Load the CSV at ``data_path``, tokenize it, and save it to disk.

        The CSV is loaded as a single ``"train"`` split, and the processed
        dataset is written to ``<root_dir>/samsum_dataset``.
        """
        samsum_dataset = load_dataset("csv", data_files={"train": str(self.config.data_path)})

        samsum_dataset_pt = samsum_dataset.map(self.convert_examples_to_feature, batched=True)

        output_dir = os.path.join(self.config.root_dir, 'samsum_dataset')
        samsum_dataset_pt.save_to_disk(output_dir)
        

In [21]:
# Run the data transformation stage end to end.
config = ConfigurationManager()  # reads the YAML files and creates the artifacts root directory
data_transformation_config = config.get_data_transformation()  # also creates the stage's root_dir
print("data_transformation_config--", data_transformation_config)  # echo the resolved settings
data_transformation = DataTransformation(config=data_transformation_config)  # loads the tokenizer
data_transformation.convert()  # tokenizes the CSV and saves it under root_dir/samsum_dataset

[2025-04-26 14:14:03,195: INFO: common: created directory at: artifacts]


[2025-04-26 14:14:03,205: INFO: common: created directory at: artifacts/data_transformation]
data_transformation_config-- DataTransformationConfig(root_dir='artifacts/data_transformation', data_path='artifacts/data_ingestion/samsum-test.csv', tokenizer_name='google/pegasus-cnn_dailymail')


Map: 100%|██████████| 819/819 [00:00<00:00, 3566.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 81890.32 examples/s] 


In [13]:
# Show the working directory of the notebook.
print(os.getcwd())

d:\Datascience\DL\DL Projects\End_to_End_Text_Summarizer\research
