In [1]:
import os

In [2]:
%pwd

'/Users/shifaafreensiddiqui/Desktop/Projects/NLP/Text-Summarization/research'

In [3]:
# Move from the notebook folder up to the project root so that relative paths
# resolve. The guard makes the cell idempotent: an unconditional chdir("../")
# climbs one more level every time the cell is re-run.
# NOTE(review): the marker assumes config/config.yaml lives at the project root
# (the run log shows it being loaded relative to the working directory) - confirm.
if not os.path.exists(os.path.join("config", "config.yaml")):
    os.chdir("../")

In [4]:
%pwd

'/Users/shifaafreensiddiqui/Desktop/Projects/NLP/Text-Summarization'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from typing import Dict

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Attributes:
        root_dir: Directory that split files are written into.
        dataset_name: Dataset identifier handed to the dataset loader.
        splits: Maps a local split name to the split specifier requested
            from the loader.
        save_local: When True, each loaded split is also written to disk.
            Optional; defaults to False.
    """

    root_dir: Path
    dataset_name: str
    splits: Dict[str, str]
    save_local: bool = False


In [6]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [18]:
from pathlib import Path

class ConfigurationManager:
    """Reads the project YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath: Path = CONFIG_FILE_PATH,
        params_filepath: Path = PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The loaded config is read by attribute here and by key below, so
        # read_yaml is expected to return an object supporting both.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the `data_ingestion` section.

        Creates the stage's root directory as a side effect.

        Returns:
            DataIngestionConfig populated from the section; `save_local`
            falls back to False when the key is absent.
        """
        section = self.config.data_ingestion
        stage_root = section["root_dir"]

        create_directories([stage_root])

        return DataIngestionConfig(
            # Wrapped in Path because DataIngestion joins it with the `/` operator.
            root_dir=Path(stage_root),
            dataset_name=section["dataset_name"],
            splits=section["splits"],
            save_local=section.get("save_local", False),
        )


In [19]:
import os
from datasets import load_dataset
from pathlib import Path
from textSummarizer.logging import logger
from textSummarizer.utils.common import get_size
# from textSummarizer.utils.common import save_json   # a helper to save datasets

In [20]:
class DataIngestion:
    """Loads the configured dataset splits and optionally writes them to disk."""

    def __init__(self, config: DataIngestionConfig):
        """Keep the ingestion settings for later use.

        Args:
            config: Settings supplying `dataset_name`, `splits`, `root_dir`
                and `save_local`.
        """
        self.config = config

    def fetch_and_optionally_save(self) -> dict:
        """Load every configured split and, if enabled, save each one to disk.

        For each `(split_name, hf_split)` pair in `config.splits` the split is
        loaded with `load_dataset`. When `config.save_local` is true it is
        written to `<root_dir>/<split_name>.json` with `orient="records"` and
        `lines=True` (one JSON record per line).

        Returns:
            dict: The loaded splits keyed by local split name. Earlier
            versions returned None, which discarded the loaded data whenever
            `save_local` was False; callers that ignore the return value are
            unaffected.
        """
        logger.info(f"Loading dataset: {self.config.dataset_name}")

        if self.config.save_local:
            # Do not rely on the caller having created the target directory.
            self.config.root_dir.mkdir(parents=True, exist_ok=True)

        loaded_splits = {}
        for split_name, hf_split in self.config.splits.items():
            logger.info(f"Loading {split_name} split...")
            dataset_split = load_dataset(self.config.dataset_name, split=hf_split)
            loaded_splits[split_name] = dataset_split

            if self.config.save_local:
                save_path = self.config.root_dir / f"{split_name}.json"
                logger.info(f"Saving {split_name} to {save_path}")
                dataset_split.to_json(str(save_path), orient="records", lines=True)

        logger.info("Data ingestion complete.")
        return loaded_splits


In [21]:
# Run the data-ingestion stage end to end. Exceptions propagate unchanged: the
# former `except Exception as e: raise e` wrapper handled nothing and only
# added an extra frame to the traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
# Trailing semicolon keeps the notebook from echoing the call's return value.
data_ingestion.fetch_and_optionally_save();


[2025-06-04 00:41:03,432: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-06-04 00:41:03,434: INFO: common: yaml file: params.yaml loaded successfully]
[2025-06-04 00:41:03,436: INFO: common: created directory at: artifacts]
[2025-06-04 00:41:03,436: INFO: common: created directory at: artifacts/data_ingestion]
[2025-06-04 00:41:03,437: INFO: 3364594946: Loading dataset: knkarthick/samsum]
[2025-06-04 00:41:03,438: INFO: 3364594946: Loading train split...]
[2025-06-04 00:41:06,189: INFO: 3364594946: Saving train to artifacts/data_ingestion/train.json]


Creating json from Arrow format: 100%|██████████| 15/15 [00:00<00:00, 228.75ba/s]

[2025-06-04 00:41:06,264: INFO: 3364594946: Loading validation split...]





[2025-06-04 00:41:08,845: INFO: 3364594946: Saving validation to artifacts/data_ingestion/validation.json]


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 183.31ba/s]

[2025-06-04 00:41:08,853: INFO: 3364594946: Loading test split...]





[2025-06-04 00:41:10,666: INFO: 3364594946: Saving test to artifacts/data_ingestion/test.json]


Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 144.99ba/s]

[2025-06-04 00:41:10,677: INFO: 3364594946: Data ingestion complete.]



