Changing the current directory to the project root.

In [1]:
import os

In [2]:
# Show the kernel's current working directory before it is changed below.
%pwd

'e:\\code\\project\\Text-Summarizer\\research'

In [3]:
os.chdir('../')
%pwd

'e:\\code\\project\\Text-Summarizer'

3. Updating Entity

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)  # frozen=True makes instances immutable (read-only after construction)
class DataIngestionConfig:
    """Settings for the data-ingestion step.

    This is the return type of ``ConfigurationManager.get_data_ingestion_config``
    and the object ``DataIngestion`` reads its paths and URL from.
    """
    # Working directory for this step; ConfigurationManager creates it before returning the config.
    root_dir: Path
    # Address the archive is downloaded from (passed to urlretrieve as ``url``).
    source_URL: str
    # Local file the download is written to and later opened as a zip archive.
    local_data_file: Path
    # Directory the archive is extracted into.
    unzip_dir: Path

4. Updating the Configuration Manager in src config

In [5]:
from textSummarizer.utils import read_yaml,create_directories 
from textSummarizer import constants

class ConfigurationManager:
    """Read the project's config and params YAML files and build step configs.

    The parsed config is accessed with dot notation throughout this class
    (e.g. ``self.config.artifacts_root``), so ``read_yaml`` is expected to
    return an attribute-accessible mapping.
    """

    def __init__(
            self,
            config_file_path = constants.CONFIG_FILE_PATH,
            params_file_path = constants.PARAMS_FILE_PATH
            ):
        """Load both YAML files and ensure the artifacts root directory exists.

        Args:
            config_file_path: Path of the main config YAML; defaults to
                ``constants.CONFIG_FILE_PATH``.
            params_file_path: Path of the params YAML; defaults to
                ``constants.PARAMS_FILE_PATH``.
        """
        self.config = read_yaml(config_file_path)
        self.params = read_yaml(params_file_path)

        # artifacts_root comes from the config YAML; create_directories is the
        # project helper used here to make sure that directory is present.
        create_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the DataIngestionConfig from the ``data_ingestion`` config section.

        The step's ``root_dir`` is passed to ``create_directories`` before the
        config object is returned.

        Returns:
            DataIngestionConfig: ``root_dir``, ``local_data_file`` and
            ``unzip_dir`` are converted to ``Path`` so the values match the
            dataclass's declared field types; ``source_URL`` is passed through
            unchanged.
        """
        section = self.config.data_ingestion

        create_directories([section.root_dir])

        # Wrap the raw config values in Path: the dataclass annotates these
        # fields as Path, and consumers may rely on Path methods.
        return DataIngestionConfig(
            root_dir = Path(section.root_dir),
            source_URL = section.source_URL,
            local_data_file = Path(section.local_data_file),
            unzip_dir = Path(section.unzip_dir)
        )

5. Updating Components

In [6]:
import urllib.request as request
import zipfile
from textSummarizer.logging import logger
from textSummarizer.utils import get_size

class DataIngestion:
    """Download the source archive and unpack it, driven by a DataIngestionConfig."""

    def __init__(self,config: DataIngestionConfig):
        """Store the ingestion settings used by the download and unzip steps."""
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless that file already exists.

        The data is first written to a sibling ``<local_data_file>.part`` file
        and moved into place only after the transfer finishes. An interrupted
        or failed download therefore never leaves a truncated file at
        ``local_data_file`` that a later run would mistake for a finished one.

        Returns None. Errors raised by ``urlretrieve`` or the file system
        propagate to the caller after the partial file has been removed.
        """
        destination = self.config.local_data_file
        if os.path.exists(destination):
            logger.info(f"File already exists: {destination}")
            return

        partial_path = f"{destination}.part"
        try:
            request.urlretrieve(
                url=self.config.source_URL,
                filename=partial_path
            )
            # os.replace overwrites atomically where the platform supports it.
            os.replace(partial_path, destination)
        finally:
            # After a successful replace the partial file no longer exists, so
            # this only runs its cleanup when something went wrong.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        logger.info(f"Downloaded file: {destination}")

    def unzip_file(self):
        '''
        Extracts the zip file at local_data_file into the unzip_dir directory,
        creating that directory first if it does not exist.
        Function returns None
        '''
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path,exist_ok=True)

        logger.info(f"Unzipping {self.config.local_data_file} to {unzip_path}")
        with zipfile.ZipFile(self.config.local_data_file,"r") as zip_ref:
            zip_ref.extractall(unzip_path)

6. Creating Pipeline

In [7]:
# Run the data-ingestion stage end to end: load the configuration, build the
# ingestion component, then download and extract the archive.
# No try/except wrapper: a handler that only re-raises adds nothing, and any
# failure should surface with its original traceback.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.unzip_file()