In [7]:
import os

# Move from the notebook's folder up to the project root so that relative
# paths such as config/config.yaml and artifacts/ resolve.
# The guard makes the cell idempotent: an unconditional os.chdir('../') climbs
# one directory further every time the cell is re-run.
# Assumes the project root is the directory that contains `config/`
# (the logs show `config\config.yaml` being loaded) -- TODO confirm for this repo.
if not os.path.isdir('config'):
    os.chdir('../')

In [None]:
import pandas as pd 

# Data ingestion

In [49]:
# Entity 

from dataclasses import dataclass 
from pathlib import Path 

@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    The dataclass does not coerce its values: whatever the configuration
    manager passes in is stored as-is.
    """
    # Directory of the ingestion stage; created by the configuration manager.
    root_dir: Path
    # SQL statement executed against the database to fetch the raw data.
    source_query: str  
    # Path of the CSV file the fetched rows are written to; also used as the
    # "already downloaded" marker.
    load_data: Path

In [50]:
# configuration manager in src config

import warnings 

from loan_default_risk.constants import * 
from loan_default_risk.utils.common import read_yaml, create_directories 

warnings.filterwarnings('ignore')

class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    # data ingestion
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings.

        Values are taken from the ``data_ingestion`` section of the main
        config file and passed through unchanged.
        """
        ingestion_section = self.config.data_ingestion

        # The stage directory must exist before anything is written into it.
        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_query=ingestion_section.source_query,
            load_data=ingestion_section.load_data,
        )

In [51]:
# components 
import os
from sqlalchemy import create_engine, text 
from urllib.parse import quote
from loan_default_risk import logger
from loan_default_risk.utils.common import get_size
from ensure import ensure_annotations




In [52]:
class DataIngestion:
    """Fetch the raw dataset from MySQL and cache it as a CSV file.

    Connection settings are read from environment variables so that no
    credentials are stored in the notebook:

    * ``MYSQL_USER``      (default ``root``)
    * ``MYSQL_PASSWORD``  (required)
    * ``MYSQL_HOST``      (default ``localhost``)
    * ``MYSQL_DATABASE``  (default ``loan_credit``)
    """

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    @staticmethod
    def _database_url():
        """Build the SQLAlchemy connection URL from the environment.

        Raises:
            RuntimeError: if ``MYSQL_PASSWORD`` is not set.
        """
        password = os.environ.get("MYSQL_PASSWORD")
        if password is None:
            raise RuntimeError(
                "MYSQL_PASSWORD environment variable is not set; "
                "export it before running data ingestion."
            )
        user = os.environ.get("MYSQL_USER", "root")
        host = os.environ.get("MYSQL_HOST", "localhost")
        database = os.environ.get("MYSQL_DATABASE", "loan_credit")

        # safe='' escapes every reserved character; quote()'s default leaves
        # '/' untouched, which would corrupt the URL for passwords containing it.
        return (
            f"mysql+pymysql://{quote(user, safe='')}:{quote(password, safe='')}"
            f"@{host}/{database}"
        )

    def fetch_data_from_database(self):
        """Run ``config.source_query`` and write the result to ``config.load_data``.

        Nothing is fetched when the target CSV already exists; its size is
        logged instead.

        Raises:
            RuntimeError: if the database password is not configured.
        """
        if os.path.exists(self.config.load_data):
            logger.info(f"File already exists of size: {get_size(Path(self.config.load_data))}")
            return

        engine = create_engine(self._database_url())
        try:
            with engine.connect() as connection:
                result = connection.execute(text(self.config.source_query))
                df = pd.DataFrame(result.fetchall(), columns=result.keys())
        finally:
            # Release pooled connections even when the query fails.
            engine.dispose()

        df.to_csv(self.config.load_data, index=False)
        logger.info(f"{self.config.load_data} downloaded! with following info: loanDataset.csv")

            
            

In [53]:
# Run the data-ingestion stage end to end.
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.fetch_data_from_database()
except Exception:
    # Log the full traceback and re-raise: printing only the message hides
    # where the failure happened and lets "Run All" continue with a missing
    # dataset.
    logger.exception("Data ingestion stage failed")
    raise

[2025-02-28 23:41:25,292: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-02-28 23:41:25,293: INFO: common: yaml file: params.yaml loaded successfully]
[2025-02-28 23:41:25,294: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-02-28 23:41:25,294: INFO: common: created directory at: artifacts]
[2025-02-28 23:41:25,295: INFO: common: created directory at: artifacts/data_ingestion]
[2025-02-28 23:41:25,296: INFO: 4108454040: File already exists of size: ~ 94 KB]


# Data preprocessing 

In [None]:
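# Expected `data_preprocessing_model` section of config/config.yaml:
# data_preprocessing_model:
#   root_dir: artifacts/data_preprocessing
#   columnTransferPipeline: artifacts/data_preprocessing/Preprocessing_pipeline.joblib
#   outliersPipeline: artifacts/data_preprocessing/outliers_pipeline.joblib
#   cleanDataset: artifacts/data_preprocessing/preprocessedDataset.csv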

In [66]:
# Entity 

from dataclasses import dataclass 
from pathlib import Path 

@dataclass(frozen=True)
class DataPreProcessingConfig:
    """Immutable settings for the data-preprocessing stage.

    The dataclass does not coerce its values: whatever the configuration
    manager passes in is stored as-is.
    """
    # Directory of the preprocessing stage; created by the configuration manager.
    root_dir: Path
    # joblib file holding the ColumnTransformer (imputing/scaling + one-hot encoding).
    columnTransferPipeline: Path
    # joblib file holding the Winsorizer used to cap outliers.
    outliersPipeline: Path
    # CSV file the preprocessed dataset is written to.
    cleanDataset: Path

In [None]:
# configuration manager in src config

import warnings 

from loan_default_risk.constants import * 
from loan_default_risk.utils.common import read_yaml, create_directories 

warnings.filterwarnings('ignore')

class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage config objects.

    This definition replaces the ``ConfigurationManager`` declared earlier in
    the notebook, so it carries the accessors of *every* stage; otherwise
    re-running the ingestion cell after this one would fail with
    ``AttributeError``.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root."""
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    # data ingestion
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings."""
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        return DataIngestionConfig(
            root_dir=config.root_dir,
            source_query=config.source_query,
            load_data=config.load_data,
        )

    # data preprocessing
    def get_data_preprocessing_config(self) -> DataPreProcessingConfig:
        """Create the preprocessing directory and return the preprocessing settings.

        Values come from the ``data_preprocessing_model`` section of the main
        config file and are passed through unchanged.
        """
        config = self.config.data_preprocessing_model

        create_directories([config.root_dir])

        return DataPreProcessingConfig(
            root_dir=config.root_dir,
            columnTransferPipeline=config.columnTransferPipeline,
            outliersPipeline=config.outliersPipeline,
            cleanDataset=config.cleanDataset,
        )

In [68]:
# Components
import pandas as pd
import os
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from pathlib import Path
from loan_default_risk import logger
from loan_default_risk.utils.common import get_size
from feature_engine.outliers import Winsorizer


# df = pd.read_csv('artifacts/data_ingestion/loanDataset.csv')
class DataPreprocessing:
    """Build, persist and apply the preprocessing artefacts for the loan dataset.

    Workflow (each step skips its work when its output file already exists):

    1. ``create_column_transfer_pipeline`` - saves a ColumnTransformer that
       imputes and scales numeric columns and one-hot encodes object columns.
    2. ``outliersPipeline`` - saves a Winsorizer for ``OUTLIER_FEATURES``.
    3. ``preprocessingDataset`` - loads both, fits them on the dataset and
       writes the transformed frame to ``config.cleanDataset``.

    Note: the joblib files contain *unfitted* estimators; fitting happens in
    ``preprocessingDataset`` and the fitted objects are not written back.
    """

    # Columns capped by the Winsorizer. They carry the ``numeric__`` prefix
    # because they are selected from the ColumnTransformer output, where the
    # numeric transformer is registered under the name 'numeric'.
    OUTLIER_FEATURES = (
        'numeric__months_loan_duration',
        'numeric__amount',
        'numeric__age',
    )

    def __init__(self, config: DataPreProcessingConfig,
                 data_path='artifacts/data_ingestion/loanDataset.csv'):
        """Store the config and load the raw dataset.

        Args:
            config: paths of the preprocessing artefacts.
            data_path: CSV produced by the ingestion stage. Defaults to the
                location that used to be hard-coded here.
        """
        self.config = config
        self.df = pd.read_csv(data_path)

    def create_column_transfer_pipeline(self):
        """Create and save the (unfitted) ColumnTransformer if it is not on disk yet.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            if os.path.exists(self.config.columnTransferPipeline):
                logger.info(f"File already exists of size: {get_size(Path(self.config.columnTransferPipeline))}")
                return

            # Use the frame loaded by this instance; relying on a notebook-level
            # `df` only works when an earlier cell happened to leave one behind.
            numeric_features = self.df.select_dtypes(exclude=['object']).columns
            categorical_features = self.df.select_dtypes(include=['object']).columns

            # Numeric columns: mean-impute, then scale to [0, 1].
            num_pipeLine = Pipeline(steps=[
                ('impute', SimpleImputer(strategy='mean')),
                ('Scale', MinMaxScaler())
            ])

            # Object columns: dense one-hot encoding.
            encoding_pipeline = Pipeline([
                ('oneHotEncode', OneHotEncoder(sparse_output=False))
            ])

            preprocess_pipeline = ColumnTransformer([
                ('numeric', num_pipeLine, numeric_features),
                ('categorical', encoding_pipeline, categorical_features)
            ])

            joblib.dump(preprocess_pipeline, self.config.columnTransferPipeline)
            logger.info(f"Pipeline saved at: {self.config.columnTransferPipeline}")

        except Exception as e:
            logger.error(f"Error occurred during pipeline creation: {e}")
            raise

    def outliersPipeline(self):
        """Create and save the (unfitted) Winsorizer if it is not on disk yet.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            if os.path.exists(self.config.outliersPipeline):
                logger.info(f"File already exists of size: {get_size(Path(self.config.outliersPipeline))}")
                return

            winsor = Winsorizer(
                capping_method='iqr',  # Use IQR rule boundaries
                tail='both',  # Cap both tails
                fold=1.5,  # Fold value for IQR
                variables=list(self.OUTLIER_FEATURES)
            )

            joblib.dump(winsor, self.config.outliersPipeline)
            logger.info(f"Outliers pipeline saved at: {self.config.outliersPipeline}")

        except Exception as e:
            logger.error(f"Error occurred during outliers pipeline creation: {e}")
            raise

    def preprocessingDataset(self):
        """Fit both pipelines on the dataset and write the cleaned CSV.

        When ``config.cleanDataset`` already exists nothing is loaded or
        fitted; the file size is logged instead.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            # Check first: fitting is wasted work when the result would be discarded.
            if os.path.exists(self.config.cleanDataset):
                logger.info(f"File already exists of size: {get_size(Path(self.config.cleanDataset))}")
                return

            preprocess = joblib.load(self.config.columnTransferPipeline)
            outlier = joblib.load(self.config.outliersPipeline)

            # Impute/scale/encode; column names come from the fitted transformer.
            transformed = pd.DataFrame(
                preprocess.fit_transform(self.df),
                columns=preprocess.get_feature_names_out()
            )

            # Cap outliers on the selected numeric columns only.
            capped_columns = list(self.OUTLIER_FEATURES)
            transformed[capped_columns] = outlier.fit_transform(transformed[capped_columns])

            transformed.to_csv(self.config.cleanDataset, index=False)
            logger.info(f"{self.config.cleanDataset} saved! with following info: cleanDataset.csv")

        except Exception as e:
            logger.error(f"Error occurred during dataset preprocessing: {e}")
            raise
        
        
        

In [69]:
# Run the data-preprocessing stage end to end.
try:
    config = ConfigurationManager()
    data_preprocessing_config = config.get_data_preprocessing_config()
    data_preprocess = DataPreprocessing(config=data_preprocessing_config)
    data_preprocess.create_column_transfer_pipeline()
    data_preprocess.outliersPipeline()
    data_preprocess.preprocessingDataset()
except Exception:
    # Log the full traceback and re-raise: printing only the message hides
    # where the failure happened and lets "Run All" continue with missing
    # artefacts.
    logger.exception("Data preprocessing stage failed")
    raise

[2025-03-01 01:30:35,769: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-01 01:30:35,770: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-01 01:30:35,771: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-01 01:30:35,772: INFO: common: created directory at: artifacts]
[2025-03-01 01:30:35,772: INFO: common: created directory at: artifacts/data_preprocessing]
[2025-03-01 01:30:35,778: INFO: 2538319982: Pipeline saved at: artifacts/data_preprocessing/Preprocessing_pipeline.joblib]
[2025-03-01 01:30:35,779: INFO: 2538319982: Outliers pipeline saved at: artifacts/data_preprocessing/outliers_pipeline.joblib]
[2025-03-01 01:30:35,816: INFO: 2538319982: artifacts/data_preprocessing/preprocessedDataset.csv saved! with following info: cleanDataset.csv]
