In [76]:
import os

In [77]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\Loan-Amount-Prediction'

In [78]:
os.chdir("../")

In [79]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [80]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    data_path: Path
    

In [81]:
#import from constants and utils
from credit_risk.constants import *
from credit_risk.utils.common import read_yaml, create_directories

In [82]:

#creating a Configuration class
class ConfigurationManager:
    def __init__(
        self,
        config_filepath   = CONFIG_FILE_PATH,
        params_filepath   = PARAMS_FILE_PATH,
        schema_filepath   = SCHEMA_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)                       
        self.schema = read_yaml(schema_filepath)  
        
        create_directories([self.config.artifacts_root])
        
    def get_data_transformation_config(self)->DataTransformationConfig:
        config = self.config.data_transformation
        
        create_directories([config.root_dir])
        
        data_transformation_config = DataTransformationConfig(
            root_dir  = config.root_dir,
            data_path = config.data_path,
        )
        
        return data_transformation_config
    
       

In [83]:
import os
import pandas as pd 
from sklearn.model_selection import train_test_split
from credit_risk import logger
import matplotlib.pyplot as plt
import seaborn as sns   



In [84]:
project_path = "C:/Users/RICH-FILES/Desktop/ml/Loan-Amount-Prediction"

os.chdir(project_path)

In [85]:
   #transform categorical data and standardize the data
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline   
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
import joblib

        

In [86]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
import logging

logger = logging.getLogger(__name__)

class DataTransformationConfig:
    def __init__(self, data_path, model_path):
        self.data_path = data_path
        self.model_path = model_path

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    
    def data_cleaning(self):
        data = pd.read_csv(self.config.data_path)
        
        # Remove columns which are not necessary for the analysis
        data.drop(columns=["Id", "Status", "Default"], inplace=True)
        
        # Drop null values
        data.dropna(inplace=True)
        
        logger.info("Null values dropped")
        
        # Remove outliers
        data = data[(data['Age'] < 80) & (data['Emp_length'] < 10) & (data['Income'] < 948000)]
        
        logger.info("Data cleaning complete")
        
        return data
    
    def exploratory_data_analysis(self, data):
        # Check descriptive statistics
        print(data.describe())
        
        # Check non-numeric columns
        print(data.describe(include='object'))
        
        # Check the target variable
        data['Amount'].hist()
        plt.ylabel('Count')
        plt.xlabel('Amount')    
        plt.title('Loan Amount Distribution')
        plt.show()
        
        print("The distribution is right-skewed, meaning most loan amounts fall in the lower range (below 10,000), while fewer loans exist at higher amounts")
        
        # Calculate Amount distribution by Age   
        plt.figure(figsize=(12,6))
        sns.scatterplot(x='Age', y='Amount', data=data) 
        plt.xlabel('Age')
        plt.ylabel('Amount')            
        plt.title('Loan Amount by Age')
        plt.show()
        
        # Calculate Amount distribution by Income
        plt.figure(figsize=(12,6))
        sns.scatterplot(x='Income', y='Amount', data=data)    
        plt.xlabel('Income')
        plt.ylabel('Amount')
        plt.title('Loan Amount by Income')
        plt.show()
        
        # Loan purpose count
        plt.figure(figsize=(12,6))
        data["Intent"].value_counts().plot(kind='bar')
        plt.ylabel('Count')
        plt.xlabel('Intent')
        plt.title('Loan Intent Distribution')
        plt.show()
        
        # Check multicollinearity and correlation
        plt.figure(figsize=(12,6))  
        corr = data.select_dtypes(include=['int64', 'float64']).drop('Amount', axis=1).corr()    
        sns.heatmap(corr, annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix')
        plt.show()
        
        # Drop column Id and Cred_length (if they exist)
        columns_to_drop = ["Cred_length"]
        data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True)
        
        pd.options.mode.copy_on_write = True
        print(data.head())
        
        return data
    
    def feat_engineering(self, data):
        
        # Define categorical and numerical features
        cat_features = ["Home", "Intent"]
        num_features = ["Age", "Income", "Emp_length", "Amount", "Rate", "Percent_income"]
        
        # Implement the column transformer
        preprocessor = ColumnTransformer(
            transformers=[
                ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
                ("num", StandardScaler(), num_features)
            ]
        )   
        
        pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
 
        # Fit the pipeline
        pipeline.fit(data)
        
        # Save the pipeline
        joblib.dump(pipeline, self.config.model_path)
        
        # Transform the data
        transformed_data = pipeline.transform(data)
        
        # Create DataFrame from the transformed data
        transformed_df = pd.DataFrame(transformed_data, columns=num_features + preprocessor.named_transformers_["cat"].get_feature_names_out().tolist())
        transformed_df.to_csv("artifacts/data_ingestion/credit_risk.csv", index=False)     
        
        return transformed_df
        
        
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import joblib
import os
import logging

logger = logging.getLogger(__name__)

class DataTransformationConfig:
    def __init__(self, data_path, model_path, root_dir):
        self.data_path = data_path
        self.model_path = model_path
        self.root_dir = root_dir

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
    
    def data_cleaning(self):
        data = pd.read_csv(self.config.data_path)
        
        # Remove columns which are not necessary for the analysis
        data.drop(columns=["Id", "Status", "Default"], inplace=True)
        
        # Drop null values
        data.dropna(inplace=True)
        
        logger.info("Null values dropped")
        
        # Remove outliers
        data = data[(data['Age'] < 80) & (data['Emp_length'] < 10) & (data['Income'] < 948000)]
        
        logger.info("Data cleaning complete")
        
        return data
    
    def exploratory_data_analysis(data):
        # Check descriptive statistics
        print(data.describe())
        
        # Check non-numeric columns
        print(data.describe(include='object'))
        
        # Check the target variable
        data['Amount'].hist()
        plt.ylabel('Count')
        plt.xlabel('Amount')    
        plt.title('Loan Amount Distribution')
        plt.show()
        
        print("The distribution is right-skewed, meaning most loan amounts fall in the lower range (below 10,000), while fewer loans exist at higher amounts.")
        
        # Calculate Amount distribution by Age   
        plt.figure(figsize=(12, 6))
        sns.scatterplot(x='Age', y='Amount', data=data) 
        plt.xlabel('Age')
        plt.ylabel('Amount')            
        plt.title('Loan Amount by Age')
        plt.show()
        
        # Calculate Amount distribution by Income
        plt.figure(figsize=(12, 6))
        sns.scatterplot(x='Income', y='Amount', data=data)    
        plt.xlabel('Income')
        plt.ylabel('Amount')
        plt.title('Loan Amount by Income')
        plt.show()
        
        # Loan purpose count
        plt.figure(figsize=(12, 6))
        data["Intent"].value_counts().plot(kind='bar')
        plt.ylabel('Count')
        plt.xlabel('Intent')
        plt.title('Loan Intent Distribution')
        plt.show()
        
        # Check multicollinearity and correlation
        plt.figure(figsize=(12, 6))  
        corr = data.select_dtypes(include=['int64', 'float64']).drop('Amount', axis=1).corr()    
        sns.heatmap(corr, annot=True, cmap='coolwarm')
        plt.title('Correlation Matrix')
        plt.show()
        
        # Drop column Cred_length (if it exists)
        columns_to_drop = ["Cred_length"]
        data.drop(columns=[col for col in columns_to_drop if col in data.columns], inplace=True)
        
        pd.options.mode.copy_on_write = True
        print(data.head())
        
        return data
    
    def feat_engineering(self, data):
        
        # Define categorical and numerical features
        cat_features = ["Home", "Intent"]
        num_features = ["Age", "Income", "Emp_length", "Amount", "Rate", "Percent_income"]
        
        # Implement the column transformer
        preprocessor = ColumnTransformer(
            transformers=[
                ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
                ("num", StandardScaler(), num_features)
            ]
        )   
        
        pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
 
        # Fit the pipeline
        pipeline.fit(data)
        
        # Save the pipeline
        joblib.dump(pipeline, self.config.model_path)
        
        # Transform the data
        transformed_data = pipeline.transform(data)
        
        # Create DataFrame from the transformed data
        transformed_df = pd.DataFrame(transformed_data, columns=num_features + preprocessor.named_transformers_["cat"].get_feature_names_out().tolist())
        transformed_csv_path = os.path.join(self.config.root_dir, "credit_risk.csv")
        transformed_df.to_csv(transformed_csv_path, index=False)     
        
        return transformed_csv_path
    
    def train_test_splitting(self, data_path):
        data = pd.read_csv(data_path)
        
        # Split the data into train and test
        train, test = train_test_split(data, test_size=0.2, random_state=42)  
        
        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False) 
        
        # Save the train and test data to the root directory
        logger.info("Data split into train and test data")  
        logger.info(f"Train data shape: {train.shape}")         
        logger.info(f"Test data shape: {test.shape}")  
        
        print(train.shape)
        print(test.shape)



In [88]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config() 
    data_transformation = DataTransformation(config = data_transformation_config)
    # Perform data transformation steps
    cleaned_data = data_transformation.data_cleaning()
    analyzed_data = data_transformation.exploratory_data_analysis(cleaned_data)
    transformed_data = data_transformation.feat_engineering(analyzed_data)
    train_data, test_data = data_transformation.train_test_splitting(transformed_data)
except Exception as e:
    raise e

[2025-03-19 22:29:49,572: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-03-19 22:29:49,576: INFO: common: yaml file: params.yaml loaded successfully]
[2025-03-19 22:29:49,582: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-03-19 22:29:49,584: INFO: common: created directory at: artifacts]
[2025-03-19 22:29:49,584: INFO: common: created directory at: artifacts/data_transformation]


TypeError: DataTransformationConfig.__init__() missing 1 required positional argument: 'model_path'