In [1]:
import os

In [2]:
%pwd

'd:\\Desktop\\Final_Projects\\TextShortify-Text_Summariser_Tool\\research'

In [3]:
# Move from the notebook folder up to the project root so that the relative
# paths used below (config file, artifacts directory) resolve.
# The guard makes the cell safe to re-run: an unconditional os.chdir("../")
# climbs one more level on every execution and would leave the project root.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'd:\\Desktop\\Final_Projects\\TextShortify-Text_Summariser_Tool'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only settings for the data-transformation stage.

    Instances are frozen, so fields cannot be reassigned after construction.
    """
    # Output directory of this stage; the datasets are saved beneath it.
    root_dir: Path
    # Directory that is joined with "train.csv" / "test.csv" when loading data.
    data_path: Path
    # Passed unchanged to AutoTokenizer.from_pretrained.
    # NOTE(review): annotated as Path, but the value comes straight from the
    # YAML config and may be a plain model-id string -- confirm the intended type.
    tokenizer_name: Path

In [6]:
from TextShortify.constants import *
from TextShortify.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML settings and hands out per-stage config objects."""

    def __init__(self,config_filepath = CONFIG_FILE_PATH,params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root named in the config is created up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        The stage's ``root_dir`` is created as a side effect before the
        settings object is returned.

        Returns:
            A DataTransformationConfig populated from the
            ``data_transformation`` section of the loaded configuration.
        """
        section = self.config.data_transformation

        stage_root = section.root_dir
        create_directories([stage_root])

        return DataTransformationConfig(
            root_dir=stage_root,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [7]:
import os
from TextShortify.logging import logger
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
from datasets import Dataset
import pandas as pd
class DataTransformation(Dataset):
    """A ``datasets.Dataset`` of judgement/summary rows with a tokenizer attached.

    The rows are stored exactly as passed in; tokenization happens only when
    ``convert_examples_to_features`` is called for a given row.
    """

    def __init__(self,data,config: DataTransformationConfig):
        """Wrap ``data`` as a dataset and load the configured tokenizer.

        Args:
            data: Arrow table forwarded unchanged to ``Dataset.__init__``.
            config: Stage settings; ``config.tokenizer_name`` selects the
                tokenizer loaded through ``AutoTokenizer.from_pretrained``.
        """
        super().__init__(data)
        self.config = config
        self.tokenizer= AutoTokenizer.from_pretrained(config.tokenizer_name)

    def __len__(self):
        """Return the number of rows in the dataset.

        Uses ``num_rows`` rather than the length of the backing table so the
        count stays right when an indices mapping (select/shuffle/filter) is
        in effect.
        """
        return self.num_rows

    def convert_examples_to_features(self, idx, max_source_length=2048, max_target_length=512):
        """Tokenize one row into model inputs and labels.

        Args:
            idx: Integer position of the row to convert.
            max_source_length: Length the judgement is padded/truncated to.
            max_target_length: Length the summary is padded/truncated to.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` for the
            judgement, ``labels`` (the summary's token ids) and the original
            ``summary`` text. The token fields are plain Python lists.
        """
        # Integer indexing on a Dataset yields the row as a column -> value
        # dict; the backing Arrow table has no pandas-style ``.iloc``.
        item = self[idx]
        judgement = item['judgement']  # source text of the row
        summary = item['summary']      # reference summary of the row

        # Encode the judgement as the model input.
        source = self.tokenizer(
            judgement,
            max_length=max_source_length,
            padding='max_length',
            truncation=True
        )

        # Encode the summary as the model target.
        target = self.tokenizer(
            summary,
            max_length=max_target_length,
            padding='max_length',
            truncation=True
        )

        # No ``return_tensors`` is requested, so the encodings are already
        # flat lists of ints; calling ``.flatten()`` on them would raise
        # AttributeError.
        return {
            'input_ids': source['input_ids'],
            'attention_mask': source['attention_mask'],
            'labels': target['input_ids'],
            'summary': summary
        }

    # def transform_and_save(self):
    # # Get a list of all CSV files in the directory
        
    #         # Transform the data using the convert_examples_to_features method
    #     transformed_dataset = data.map(self.convert_examples_to_features, batched=True)

    #         # # Save the transformed data
    #     transformed_dataset.save_to_disk(os.path.join(self.config.root_dir, 'transformed_data'))
   

In [21]:
import pyarrow as pa
# Build the stage settings; this reads the YAML files and creates the
# stage's output directory.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()

# Load both CSV splits from the configured data directory.
train_df = pd.read_csv(os.path.join(data_transformation_config.data_path, "train.csv"))
test_df = pd.read_csv(os.path.join(data_transformation_config.data_path, "test.csv"))

# Convert the pandas DataFrames to Arrow tables.
train_table = pa.Table.from_pandas(train_df)
test_table = pa.Table.from_pandas(test_df)

# Wrap the Arrow tables as DataTransformation datasets.
data_transformation_train = DataTransformation(config=data_transformation_config, data=train_table)
data_transformation_test = DataTransformation(config=data_transformation_config, data=test_table)

# Persist both splits under the stage's root directory. The rows are written
# as loaded: convert_examples_to_features is not invoked here.
# save_to_disk returns None, so its result is deliberately not bound to a name.
# Errors propagate as-is; a try/except that only re-raises adds nothing.
data_transformation_train.save_to_disk(os.path.join(data_transformation_config.root_dir, 'train_data'))
data_transformation_test.save_to_disk(os.path.join(data_transformation_config.root_dir, 'test_data'))

[2023-12-30 22:54:08,744: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-12-30 22:54:08,752: INFO: common: yaml file: params.yaml loaded successfully]
[2023-12-30 22:54:08,760: INFO: common: created directory at: artifacts]
[2023-12-30 22:54:08,760: INFO: common: created directory at: artifacts/data_transformation]


Saving the dataset (1/1 shards): 100%|██████████| 7773/7773 [00:00<00:00, 21828.91 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 200/200 [00:00<00:00, 8323.02 examples/s] 
