In [255]:
import sys
import os

# Make the project's "src" directory importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
src_dir = os.path.join(os.getcwd(), "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)


In [256]:
%pwd 

'd:\\'

In [257]:
# Step up one level from the current working directory.
# The target is relative, so every execution of this cell moves up again.
os.chdir(os.pardir)

In [258]:
%pwd

'd:\\'

In [259]:
from dataclasses import dataclass
from pathlib import Path 

@dataclass(frozen=True)
class DataIngestionConfig:
    """Read-only record of the settings for the data-ingestion step.

    The dataclass is frozen, so fields cannot be reassigned once an
    instance has been created.
    """

    root_dir: Path          # directory created for the data-ingestion step
    source_URL: str         # URL for the data source
    local_data_file: Path   # path for the local data file
    unzip_dir: Path         # directory to unzip files into

In [260]:

from src.textSummarizer.constants import *
from src.textSummarizer.utils.common import read_yaml, create_directories


In [261]:
from src.textSummarizer.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from src.textSummarizer.utils.common import read_yaml, create_directories
from pathlib import Path
from dataclasses import dataclass

from pathlib import Path

# Locate the project checkout.
# The TEXT_SUMMARIZER_ROOT environment variable overrides the location; when it
# is unset (or empty) the original absolute path is used, so an existing setup
# keeps resolving exactly the same two files.
import os

PROJECT_ROOT = Path(
    os.environ.get("TEXT_SUMMARIZER_ROOT")
    or "D:/Text Summarizer Project/Text-Summarizer-Project-"
)
CONFIG_FILE_PATH = PROJECT_ROOT / "config" / "config.yaml"
PARAMS_FILE_PATH = PROJECT_ROOT / "params.yaml"


# NOTE: this repeats the DataIngestionConfig definition from an earlier cell;
# when the cells run in order, this later definition is the one left bound.
@dataclass(frozen=True)
class DataIngestionConfig:
    """Frozen value object holding the data-ingestion settings.

    ConfigurationManager.get_data_ingestion_config builds instances of this
    class from the ``data_ingestion`` section of the configuration file.
    """

    # Directory created for the data-ingestion step.
    root_dir: Path
    # URL for the data source.
    source_URL: str
    # Path for the local data file.
    local_data_file: Path
    # Path for the directory to unzip files into.
    unzip_dir: Path

class ConfigurationManager:
    """Load the project's YAML settings and build per-stage config objects.

    Attributes:
        config: Parsed configuration file, as returned by ``read_yaml``.
        params: Parsed parameters file, as returned by ``read_yaml``.
    """

    def __init__(
            self,
            config_filepath: Path = CONFIG_FILE_PATH,
            params_filepath: Path = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the configuration YAML file.
            params_filepath: Location of the parameters YAML file.
        """
        # Reading the YAML configuration files
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Creating the directories mentioned in the config file
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the data-ingestion settings from the ``data_ingestion`` section.

        Creates the section's ``root_dir`` as a side effect.

        Returns:
            A DataIngestionConfig whose path fields are wrapped in ``Path``.

        Raises:
            Whatever the parsed config object raises for a missing attribute
            when neither ``source_url`` nor ``source_URL`` is present.
        """
        config = self.config.data_ingestion

        # Create directories for data ingestion
        create_directories([config.root_dir])

        # The configuration file spells this key "source_url" (lowercase), as
        # the logged data_ingestion section in this notebook shows, so reading
        # only "source_URL" fails on it. Prefer the lowercase key and fall back
        # to the legacy spelling so older config files keep working.
        # Assumes a missing attribute on the read_yaml result raises
        # AttributeError (or a subclass) -- TODO confirm for read_yaml's type.
        source_url = getattr(config, "source_url", None)
        if source_url is None:
            source_url = config.source_URL

        return DataIngestionConfig(
            root_dir=Path(config.root_dir),
            source_URL=source_url,
            local_data_file=Path(config.local_data_file),
            unzip_dir=Path(config.unzip_dir),
        )


In [262]:
import os
import urllib.request as request
import zipfile
from src.textSummarizer.utils.common import get_size

In [263]:
import os
import requests
import zipfile
import logging
import yaml

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class DataIngestion:
    """Download an archive and unpack it, driven by a settings mapping.

    ``config`` is indexed with the keys ``root_dir``, ``source_url``,
    ``local_data_file`` and ``unzip_dir``. Failures are logged rather than
    raised; ``download_file`` and ``extract_zip_file`` additionally report
    success through their return value.
    """

    def __init__(self, config):
        """Store ``config`` and create the directories it names."""
        self.config = config
        # Create the necessary directories
        self.create_directories()

    def create_directories(self):
        """Create necessary directories for data ingestion.

        Creates ``root_dir``, the parent directory of ``local_data_file`` and
        ``unzip_dir``. Each step is attempted independently; an error in one
        is logged and does not prevent the others.
        """
        # (label used on success, label used on error, config key, use parent dir?)
        steps = (
            ("Root directory", "root directory", 'root_dir', False),
            ("Directory for local data file", "directory for local data file", 'local_data_file', True),
            ("Unzip directory", "unzip directory", 'unzip_dir', False),
        )
        for created_label, error_label, key, use_parent in steps:
            try:
                target = self.config[key]
                if use_parent:
                    target = os.path.dirname(target)
                    if not target:
                        # A bare file name has no directory component and
                        # os.makedirs('') would fail; the file simply lands in
                        # the current working directory.
                        logger.info("Local data file has no directory component; nothing to create.")
                        continue
                os.makedirs(target, exist_ok=True)
                logger.info(f"{created_label} created at: {target}")
            except Exception as e:
                logger.error(f"Error creating {error_label}: {e}")

    def download_file(self, timeout=30.0):
        """Download file from the source URL.

        Args:
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot block the call indefinitely.

        Returns:
            True when the response status was 200 and the body was written to
            ``local_data_file``; False otherwise (the reason is logged).
        """
        try:
            logger.info(f"Downloading file from {self.config['source_url']}...")
            response = requests.get(self.config['source_url'], timeout=timeout)
            if response.status_code == 200:
                with open(self.config['local_data_file'], 'wb') as file:
                    file.write(response.content)
                logger.info("File downloaded successfully.")
                return True
            logger.error(f"Failed to download file: {response.status_code}")
        except Exception as e:
            logger.error(f"Error downloading file: {e}")
        return False

    def extract_zip_file(self):
        """Extract the downloaded zip file.

        Returns:
            True when ``local_data_file`` was unpacked into ``unzip_dir``;
            False when it is not a valid zip file or another error occurred
            (the reason is logged).
        """
        try:
            logger.info(f"Extracting {self.config['local_data_file']} to {self.config['unzip_dir']}...")
            with zipfile.ZipFile(self.config['local_data_file'], 'r') as zip_ref:
                zip_ref.extractall(self.config['unzip_dir'])
            logger.info("Files extracted successfully.")
            return True
        except zipfile.BadZipFile:
            logger.error("Error: The downloaded file is not a valid zip file.")
        except Exception as e:
            logger.error(f"An error occurred while extracting the zip file: {e}")
        return False

# Load configuration from the YAML file.
# CONFIG_FILE_PATH is the constant assigned in the configuration cell above, so
# the file location is spelled out in one place only. The encoding is explicit
# so the platform's default codec is never used to decode the YAML.
try:
    with open(CONFIG_FILE_PATH, 'r', encoding='utf-8') as config_file:
        config = yaml.safe_load(config_file)
        logger.info("Configuration loaded successfully.")
except FileNotFoundError as e:
    logger.error(f"Configuration file not found: {e}")
    raise

# Usage: the ingestion step works on the plain "data_ingestion" mapping.
data_ingestion_config = config['data_ingestion']
data_ingestion = DataIngestion(data_ingestion_config)

# Run the download and extraction processes
data_ingestion.download_file()
data_ingestion.extract_zip_file()


[2024-10-02 00:24:52,360: INFO: 2073371145: Configuration loaded successfully.]
[2024-10-02 00:24:52,360: INFO: 2073371145: Root directory created at: artifacts/data_ingestion]
[2024-10-02 00:24:52,360: INFO: 2073371145: Directory for local data file created at: artifacts/data_ingestion]
[2024-10-02 00:24:52,360: INFO: 2073371145: Unzip directory created at: artifacts/data_ingestion]
[2024-10-02 00:24:52,360: INFO: 2073371145: Downloading file from https://raw.githubusercontent.com/ragztigadi/data_files/c9830f66604257b903098b53d1755e4fbc71657d/summarizer-data.zip...]
[2024-10-02 00:24:54,260: INFO: 2073371145: File downloaded successfully.]
[2024-10-02 00:24:54,275: INFO: 2073371145: Extracting artifacts/data_ingestion/data.zip to artifacts/data_ingestion...]
[2024-10-02 00:24:54,513: INFO: 2073371145: Files extracted successfully.]


In [264]:
import os
import logging
import zipfile
import yaml

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load configuration from the project's config.yaml.
# CONFIG_FILE_PATH is the constant assigned in the configuration cell, so the
# file location is not repeated here. The encoding is explicit so the
# platform's default codec is never used to decode the YAML.
try:
    with open(CONFIG_FILE_PATH, 'r', encoding='utf-8') as config_file:
        config = yaml.safe_load(config_file)
        logger.info("Configuration loaded successfully.")
except FileNotFoundError as e:
    logger.error(f"Configuration file not found: {e}")
    raise  # re-raise after logging so the cell visibly fails

data_ingestion_config = config['data_ingestion']
logger.info(f"Data ingestion configuration: {data_ingestion_config}")

# Check for specific config items
if 'source_url' in data_ingestion_config:
    logger.info(f"Source URL: {data_ingestion_config['source_url']}")
else:
    logger.warning("Source URL not found in the configuration.")


[2024-10-02 00:24:54,521: INFO: 3593489851: Configuration loaded successfully.]
[2024-10-02 00:24:54,526: INFO: 3593489851: Data ingestion configuration: {'root_dir': 'artifacts/data_ingestion', 'source_url': 'https://raw.githubusercontent.com/ragztigadi/data_files/c9830f66604257b903098b53d1755e4fbc71657d/summarizer-data.zip', 'local_data_file': 'artifacts/data_ingestion/data.zip', 'unzip_dir': 'artifacts/data_ingestion'}]
[2024-10-02 00:24:54,527: INFO: 3593489851: Source URL: https://raw.githubusercontent.com/ragztigadi/data_files/c9830f66604257b903098b53d1755e4fbc71657d/summarizer-data.zip]
