## Data Transformation

In [2]:
# Scratch cell: trivial expression, presumably a quick check that the kernel runs — safe to remove.
1+1

2

In [3]:
import os
# Show the kernel's working directory before any directory change.
%pwd

'/Users/satwiksahoo/Desktop/CodeBasics/machine learning/krish naik/NLP project/text_summarizer/research'

In [4]:
# Move up one level to the project root so the `src` package imports and the
# relative config paths used by later cells resolve. The guard makes the cell
# idempotent: an unconditional chdir would climb one more directory every
# time the cell is re-run.
if not os.path.isdir('src'):
    os.chdir('../')

In [5]:
# Confirm the working directory after the chdir above.
%pwd

'/Users/satwiksahoo/Desktop/CodeBasics/machine learning/krish naik/NLP project/text_summarizer'

In [7]:
from dataclasses import dataclass
from pathlib import Path

@dataclass
class dataTransformationConfig:
    """Settings for the data-transformation stage.

    Attributes:
        root_dir: Directory the stage writes its output datasets into.
        data_path: Location of the on-disk dataset the stage loads.
        tokenizer_name: Value handed to ``AutoTokenizer.from_pretrained``.
            NOTE(review): annotated as ``Path`` but may be a model id
            string rather than a filesystem path — confirm against the
            YAML config.
    """

    root_dir: Path
    data_path: Path
    tokenizer_name: Path
    


In [13]:
from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml , create_directories

class ConfigurationManager:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(self, config_file_path=CONFIG_FILE_PATH, params_file_path=PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_file_path: Path passed to ``read_yaml`` for the main config.
            params_file_path: Path passed to ``read_yaml`` for the parameters.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> dataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``dataTransformationConfig`` filled from the
            ``data_transformation`` section of the config file.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return dataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )
        
  

In [37]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from src.textSummarizer.logging import logger
import os
from datasets import load_from_disk 
import pandas as pd
import re
from datasets import Dataset


class DataTransformation:
    """Samples each dataset split, cleans the dialogue text and saves the result.

    The source dataset is expected to provide ``train``, ``test`` and
    ``validation`` splits, each with a ``dialogue`` column.
    """

    def __init__(self, config: "dataTransformationConfig"):
        """Store the stage config and load the configured tokenizer.

        Args:
            config: Stage settings; ``tokenizer_name``, ``data_path`` and
                ``root_dir`` are read by this class.
        """
        self.config = config
        # NOTE(review): the tokenizer is loaded here but no method of this
        # class uses it; kept because external code may read the attribute.
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name)

    def preprocess_text(self, text):
        """Normalise one dialogue string.

        Removes ``<...>`` tags, collapses every run of whitespace (including
        line breaks) into a single space, trims the ends and lower-cases.

        Args:
            text: Raw dialogue text, or ``None``.

        Returns:
            The cleaned string; ``''`` when ``text`` is ``None``.
        """
        if text is None:
            return ''

        # Strip tags before collapsing whitespace: doing it the other way
        # round leaves a run of spaces wherever a tag was removed. DOTALL
        # lets a tag that spans a line break match as well.
        text = re.sub(r'<.*?>', ' ', text, flags=re.DOTALL)
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()

    def _sample_and_clean(self, split, sample_size, seed):
        """Return a sampled DataFrame of ``split`` with a ``preprocess_text`` column."""
        frame = pd.DataFrame(split)
        # Clamp so a split smaller than the requested sample does not raise.
        sample_size = min(sample_size, len(frame))
        sampled = frame.sample(n=sample_size, random_state=seed).reset_index(drop=True)
        sampled['preprocess_text'] = sampled['dialogue'].apply(self.preprocess_text)
        return sampled

    def convert(self, train_size=6000, eval_size=600, seed=42):
        """Sample, clean and save the three dataset splits.

        Loads the dataset from ``config.data_path`` and writes
        ``train_dataset``, ``test_dataset`` and ``validation_dataset``
        under ``config.root_dir``.

        Args:
            train_size: Maximum number of rows sampled from ``train``.
            eval_size: Maximum number of rows sampled from ``test`` and
                from ``validation``.
            seed: ``random_state`` used for sampling.
        """
        dataset = load_from_disk(self.config.data_path)

        split_sizes = {
            'train': train_size,
            'test': eval_size,
            'validation': eval_size,
        }

        # Prepare every split before writing anything, so a missing split
        # or column fails before any output is saved.
        prepared = {
            split_name: self._sample_and_clean(dataset[split_name], sample_size, seed)
            for split_name, sample_size in split_sizes.items()
        }

        for split_name, frame in prepared.items():
            target_dir = os.path.join(self.config.root_dir, f'{split_name}_dataset')
            Dataset.from_pandas(frame).save_to_disk(target_dir)
        
        
        
    
    
    
    

In [38]:
# Run the data-transformation stage end to end.
# Building the manager loads the YAML files and creates the artifacts root.
config = ConfigurationManager()
# Also creates the stage's root_dir as a side effect.
data_transformation_config = config.get_data_transformation_config()
# Instantiating the stage loads the tokenizer named in the config.
data_transformation = DataTransformation(config = data_transformation_config)
# Writes train_dataset / test_dataset / validation_dataset under root_dir.
data_transformation.convert()

[2025-08-14 13:38:02,896: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-08-14 13:38:02,898: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-14 13:38:02,899: INFO: common: created directory at: artifacts]
[2025-08-14 13:38:02,899: INFO: common: created directory at: artifacts/data_transformation]


Saving the dataset (1/1 shards): 100%|██████████| 6000/6000 [00:00<00:00, 841159.97 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 600/600 [00:00<00:00, 331652.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 600/600 [00:00<00:00, 186676.24 examples/s]
