In [None]:
import os

In [None]:
%pwd

In [None]:
%cd ..

In [None]:
%pwd

In [None]:
from textsummarizer.constants import *
from textsummarizer.utils.common import read_yaml, create_directories

In [None]:
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime
import pandas as pd


@dataclass(frozen=True)
class DataVerification:
    """Settings and checks for verifying the acquired dataset on disk.

    Attributes:
        root_dir: Directory associated with this stage; not read by the
            methods of this class.
        STATUS_FILE: Path of the text file that receives the verification
            result.
        INFO_FILE: Path of the text file that receives the CSV metadata
            report.
        ALL_REQUIRED_FILES: Names of the entries that must be present in the
            dataset directory.
    """

    root_dir: Path
    STATUS_FILE: str
    INFO_FILE: str
    ALL_REQUIRED_FILES: list

    def verify_all_files_exist(self, dataset_dir=None) -> bool:
        """Check that every required entry exists in the dataset directory.

        The result is written to ``STATUS_FILE`` exactly once, as
        ``Verification status: True`` or ``Verification status: False``.
        Entries in the directory that are not listed in
        ``ALL_REQUIRED_FILES`` are ignored; only missing required entries
        make the verification fail. An empty ``ALL_REQUIRED_FILES`` is
        vacuously satisfied.

        Args:
            dataset_dir: Directory to inspect. Defaults to
                ``artifacts/data_acquisition/samsum_dataset`` relative to the
                current working directory.

        Returns:
            True when no required entry is missing, False otherwise.

        Raises:
            OSError: If the dataset directory cannot be listed or the status
                file cannot be written.
        """
        if dataset_dir is None:
            dataset_dir = os.path.join("artifacts", "data_acquisition", "samsum_dataset")

        # Build the set once so each membership test is O(1).
        present = set(os.listdir(dataset_dir))
        missing = [name for name in self.ALL_REQUIRED_FILES if name not in present]
        verification_status = not missing

        # Write the final verdict once; writing inside a loop would let the
        # last directory entry decide the recorded status.
        with open(self.STATUS_FILE, "w", encoding="utf-8") as f:
            f.write(f"Verification status: {verification_status}")

        return verification_status

    def get_csv_files_info(self, folder_path=None):
        """Write a metadata report for every CSV file in a folder.

        ``INFO_FILE`` is truncated and rewritten on each call. For each file
        whose name ends with ``.csv`` the report lists its name, size in
        bytes, row count, column dtypes and last-modified time. Files are
        processed in sorted name order so the report is reproducible.

        Args:
            folder_path: Folder to scan (non-recursive). Defaults to
                ``artifacts/data_acquisition`` relative to the current
                working directory.

        Raises:
            OSError: If the folder cannot be listed or a file cannot be read
                or written.
            Exception: Whatever ``pandas.read_csv`` raises for a CSV it
                cannot parse.
        """
        if folder_path is None:
            folder_path = os.path.join("artifacts", "data_acquisition")

        # A single handle for the whole report instead of reopening the file
        # in append mode for every CSV.
        with open(self.INFO_FILE, "w", encoding="utf-8") as f:
            f.write("CSV File Metadata:\n")

            for file_name in sorted(os.listdir(folder_path)):
                if not file_name.endswith(".csv"):
                    continue

                file_path = os.path.join(folder_path, file_name)
                file_size = os.path.getsize(file_path)

                # Load the CSV with pandas to obtain row count and dtypes.
                df = pd.read_csv(file_path)
                num_rows = len(df)
                data_types = df.dtypes

                # Last-modified time, rendered in local time.
                modification_time = os.path.getmtime(file_path)
                modification_date = datetime.fromtimestamp(modification_time).strftime('%Y-%m-%d %H:%M:%S')

                f.write(f"File Name: {file_name}\n")
                f.write(f"File Size: {file_size} bytes\n")
                f.write(f"Number of Rows: {num_rows}\n")
                f.write(f"Data Types:\n{data_types}\n")
                f.write(f"Last Modified: {modification_date}\n\n")
                f.write("************************************")
                f.write("************************************\n\n\n")

  

In [None]:
class ConfigHandler:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH):
        """
        Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the configuration YAML (default: CONFIG_FILE_PATH)
            params_filepath: Location of the parameters YAML (default: PARAMS_FILE_PATH)

        Returns:
            None

        """
        # Configuration is read first, then parameters.
        self.config = self._load_config(config_filepath)
        self.params = self._load_params(params_filepath)

        # The artifacts root comes from the loaded configuration.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def _load_config(self, config_filepath):
        """
        Read the configuration YAML.

        Args:
            config_filepath: Location of the configuration YAML

        Returns:
            Whatever read_yaml returns for that file; the rest of this class
            reads its entries as attributes.

        """
        loaded_config = read_yaml(config_filepath)
        return loaded_config

    def _load_params(self, params_filepath):
        """
        Read the parameters YAML.

        Args:
            params_filepath: Location of the parameters YAML

        Returns:
            Whatever read_yaml returns for that file.

        """
        loaded_params = read_yaml(params_filepath)
        return loaded_params

    def get_data_verification_config(self) -> DataVerification:
        """
        Build the DataVerification settings from the `data_verification` section.

        The section's root_dir is created before the object is built.

        Returns:
            A DataVerification populated with root_dir, STATUS_FILE, INFO_FILE
            and ALL_REQUIRED_FILES taken from the configuration.

        """
        section = self.config.data_verification

        create_directories([section.root_dir])

        return DataVerification(
            root_dir=section.root_dir,
            STATUS_FILE=section.STATUS_FILE,
            INFO_FILE=section.INFO_FILE,
            ALL_REQUIRED_FILES=section.ALL_REQUIRED_FILES,
        )


In [None]:
import os
from textsummarizer.logging import logger

In [None]:
# Run the data-verification stage: load the configuration, check that the
# required dataset entries exist, then write the CSV metadata report.
try:
    config = ConfigHandler()
    data_verification = config.get_data_verification_config()
    data_verification.verify_all_files_exist()
    data_verification.get_csv_files_info()
except Exception:
    # Record the failure with its traceback via the project logger (imported
    # from textsummarizer.logging in the preceding cell), then re-raise the
    # same exception unchanged.
    logger.exception("Data verification stage failed")
    raise