In [1]:
import os
# Move to the project root so that the `src.*` imports used by later cells
# resolve. The guard makes this cell idempotent: an unconditional
# os.chdir("..") climbs one more directory every time the cell is re-run
# (or when the kernel already starts at the root).
# NOTE(review): assumes the project root is the directory containing `src` — confirm.
if not os.path.isdir("src"):
    os.chdir("..")
os.getcwd()

'c:\\ML\\NLP-Text-Summarization'

In [2]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class DataTransformationEntity:
    """Settings consumed by the data-transformation stage.

    Built by ``ConfigurationManager.data_transformation_configuration`` from
    the ``data_transformation`` section of the loaded configuration.
    """

    # Directory the stage creates and saves the processed dataset under.
    root_dir: Path
    # Location of the dataset on disk that the stage loads.
    data_path: Path
    # Name passed to ``AutoTokenizer.from_pretrained``.
    tokenizer_name: str

In [3]:
from src.constants import *
from src.utils.common import CommonUtils
from box import ConfigBox
from src.custom_exception import CustomException
import sys
class ConfigurationManager:
    """Load the project configuration and build per-stage config entities.

    Both files are read through ``CommonUtils.read_yaml`` and kept as
    ``ConfigBox`` objects on ``self.config`` and ``self.params``.
    """

    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        """Read the config and params files and create the artifacts root.

        Args:
            config_path: Location of the configuration file; defaults to
                ``CONFIG_FILE_PATH``.
            params_path: Location of the parameters file; defaults to
                ``PARAMS_FILE_PATH``.
        """
        self.config: ConfigBox = CommonUtils.read_yaml(config_path)
        self.params: ConfigBox = CommonUtils.read_yaml(params_path)

        CommonUtils.create_directories([self.config.artifacts_root])

    def data_transformation_configuration(self) -> DataTransformationEntity:
        """Build the data-transformation settings from the loaded config.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationEntity`` filled from the
            ``data_transformation`` section of the configuration.

        Raises:
            CustomException: Wraps any exception raised while reading the
                section or creating the directory; the original exception is
                attached as ``__cause__``.
        """
        try:
            config = self.config.data_transformation
            CommonUtils.create_directories([config.root_dir])
            # NOTE(review): values are passed through exactly as read from the
            # config; presumably plain strings rather than Path objects — confirm.
            return DataTransformationEntity(
                root_dir=config.root_dir,
                data_path=config.data_path,
                tokenizer_name=config.tokenizer_name,
            )
        except Exception as exp:
            # Chain explicitly so the traceback reports the original error as
            # the direct cause of the wrapped one.
            raise CustomException(exp, sys) from exp

In [6]:
from transformers import AutoTokenizer
from datasets import load_from_disk

In [9]:

class DataTransformation:
    """Tokenize dialogue/summary pairs of a saved dataset and store the result."""

    # Truncation limits, in tokens, for the source dialogues and the target
    # summaries. Named here instead of being repeated as bare literals.
    MAX_INPUT_LENGTH = 1024
    MAX_TARGET_LENGTH = 128

    def __init__(self, config: DataTransformationEntity):
        """Keep the stage settings and load the configured tokenizer.

        Args:
            config: Stage settings; ``config.tokenizer_name`` selects the
                tokenizer loaded via ``AutoTokenizer.from_pretrained``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def convert_examples_to_features(self, example_batch):
        """Tokenize one batch of examples into model inputs and labels.

        Args:
            example_batch: Mapping with ``'dialogue'`` and ``'summary'``
                entries, as supplied by ``Dataset.map(..., batched=True)``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` taken from the
            tokenized dialogues and ``'labels'`` holding the ``input_ids`` of
            the tokenized summaries.

        Raises:
            CustomException: Wraps any exception raised during tokenization;
                the original exception is attached as ``__cause__``.
        """
        try:
            input_encoding = self.tokenizer(
                example_batch['dialogue'],
                max_length=self.MAX_INPUT_LENGTH,
                truncation=True,
            )

            # `text_target=` tokenizes the summaries in target mode; it is the
            # supported replacement for the deprecated
            # `tokenizer.as_target_tokenizer()` context manager.
            target_encoding = self.tokenizer(
                text_target=example_batch['summary'],
                max_length=self.MAX_TARGET_LENGTH,
                truncation=True,
            )

            return {
                'input_ids': input_encoding['input_ids'],
                'attention_mask': input_encoding['attention_mask'],
                'labels': target_encoding['input_ids'],
            }
        except Exception as exp:
            raise CustomException(exp, sys) from exp

    def convert(self):
        """Load the dataset, tokenize it, and save it under ``root_dir``.

        Reads the dataset from ``config.data_path``, maps
        ``convert_examples_to_features`` over it in batches, and writes the
        result to ``<root_dir>/samsum_dataset``.
        """
        samsum_dataset = load_from_disk(self.config.data_path)
        samsum_dataset_tokenized = samsum_dataset.map(
            self.convert_examples_to_features, batched=True
        )
        samsum_dataset_tokenized.save_to_disk(
            os.path.join(self.config.root_dir, "samsum_dataset")
        )

In [10]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage settings, then tokenize and save the dataset.
try:
    config_manager_obj = ConfigurationManager()
    data_transformation_conf: DataTransformationEntity = (
        config_manager_obj.data_transformation_configuration()
    )
    data_transformation_obj = DataTransformation(data_transformation_conf)
    data_transformation_obj.convert()
except Exception as exp:
    # Chain explicitly so the original error is reported as the direct cause.
    raise CustomException(exp, sys) from exp

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 14732/14732 [00:04<00:00, 3071.21 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 2308.45 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 2018.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 442469.95 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 116980.59 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 818/818 [00:00<00:00, 136143.04 examples/s]
