In [1]:
from src.Wine.Utils import create_directory,read_yaml,download_data_from_s3,save_object
from src.Wine.loggers import logger
from src.Wine.Exception import CustomException
from src.Wine.Constants import *
import os,sys
from pathlib import Path
from dataclasses import dataclass

In [2]:
# Step 3) Update the entity file: define the class fields that correspond to the
# values used in the YAML file; the class is later used as a function's return value.

@dataclass
class DataTransformationConfig:
    """Typed container for the settings of the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``.
    """

    # Directory that is created for this stage's artifacts.
    root_dir_path: Path
    # Location handed to ``save_object`` when the preprocessor is persisted.
    save_obj_dirpath: Path
    # Directory the stage reads its CSV files from.
    csv_dir_path: Path
    # Mapping whose first key is used as the target column name.
    target_column: dict


In [3]:
# Update the configuration manager: in this file we read the YAML files, create the
# directories, and assign the values to the DataTransformationConfig class fields.

class ConfigurationManager:
    """Loads the project's YAML files and hands out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILEPATH, param_filepath=PARAM_FILEPATH, schema_filepath=SCHEMA_FILEPATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            param_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Each read_yaml result is accessed with attribute syntax below.
        self.config = read_yaml(config_filepath)
        self.param = read_yaml(param_filepath)
        self.schema = read_yaml(schema_filepath)

        # Make sure the top-level artifacts directory exists.
        create_directory([self.config.artifacts_root])

    def get_data_transformation_config(self):
        """Create the stage directory and return a DataTransformationConfig.

        Paths come from the ``data_transformation`` section of the config file;
        the target column mapping comes from the schema file.
        """
        stage_settings = self.config.data_transformation
        schema = self.schema

        # Directory for everything the data-transformation stage produces.
        create_directory([stage_settings.root_dir_path])

        return DataTransformationConfig(
            root_dir_path=stage_settings.root_dir_path,
            save_obj_dirpath=stage_settings.save_obj_dirpath,
            csv_dir_path=stage_settings.csv_dir_path,
            target_column=schema.target_column,
        )

In [4]:
import pandas as pd,numpy as np,sklearn
from sklearn.pipeline import Pipeline #this class we used to create pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer #to fill null value
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin

In [5]:
# Step 5) Update the DataTransformation component file and store the config values as instance variables.
class DataTransformation:
    """Builds the preprocessing pipeline and applies it to the train/test CSV files.

    All paths and the target-column mapping are read from the
    ``DataTransformationConfig`` passed to the constructor.
    """

    def __init__(self, transformconfig: "DataTransformationConfig"):
        # Stage settings used by every method of this class.
        self.transformconfig = transformconfig

    def _target_column_name(self):
        """Return the first key of ``transformconfig.target_column``.

        Raises:
            ValueError: If the target-column mapping is empty.
        """
        target_columns = list(self.transformconfig.target_column.keys())
        if not target_columns:
            raise ValueError("target_column mapping is empty; expected one target column name")
        return target_columns[0]

    def get_data_transformation(self):
        """Build an (unfitted) ColumnTransformer for the input features.

        The feature lists are derived from ``WineQT.csv`` in ``csv_dir_path``:
        non-object columns go through median imputation and scaling without
        centering, object columns through most-frequent imputation and one-hot
        encoding.

        Returns:
            The unfitted ``ColumnTransformer``.
        """
        df = pd.read_csv(os.path.join(self.transformconfig.csv_dir_path, "WineQT.csv"))

        # Only the input features decide which columns each pipeline handles.
        x = df.drop(self._target_column_name(), axis=1)

        # Split the input columns by dtype.
        num_feature_lst = x.select_dtypes(exclude='object').columns.to_list()
        cat_feature_lst = x.select_dtypes(include='object').columns.to_list()

        logger.info("Numeric column from input feature\n%s", num_feature_lst)
        logger.info("Categorical column from input feature\n%s", cat_feature_lst)

        # Numeric columns: fill missing values with the median, then scale.
        numeric_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaling", StandardScaler(with_mean=False))
        ])
        logger.info("Numeric Pipeline feature\n%s", numeric_pipeline)

        # Categorical columns: fill missing values with the mode, then one-hot encode.
        categorical_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])
        logger.info("Categorical Pipeline feature\n%s", categorical_pipeline)

        # Route each column list to its pipeline.
        preprocessor = ColumnTransformer(transformers=[
            ("num_pipeline", numeric_pipeline, num_feature_lst),
            ("cat_pipeline", categorical_pipeline, cat_feature_lst)
        ])

        return preprocessor

    def initiate_data_transformation(self):
        """Fit the preprocessor on the train split and transform both splits.

        Reads ``train.csv`` and ``test.csv`` from ``csv_dir_path``, fits the
        preprocessor on the training features only, saves the fitted
        preprocessor to ``save_obj_dirpath``, and appends the target column as
        the last column of each transformed array.

        Returns:
            tuple: ``(train_numpy_array, test_numpy_array)``.
        """
        logger.info('Reading train and test Data Using Pandas Library')
        csv_dir = self.transformconfig.csv_dir_path
        train_data = pd.read_csv(os.path.join(csv_dir, "train.csv"))
        # Fix: the test split was previously loaded from train.csv as well.
        # Assumes test.csv sits next to train.csv in csv_dir_path.
        test_data = pd.read_csv(os.path.join(csv_dir, "test.csv"))

        target_column_name = self._target_column_name()

        # Separate input features from the target in both splits.
        train_input_feature = train_data.drop(target_column_name, axis=1)
        train_output_feature = train_data[target_column_name]

        test_input_feature = test_data.drop(target_column_name, axis=1)
        test_output_feature = test_data[target_column_name]

        preprocessor_obj = self.get_data_transformation()

        # Fit on the training features only; the test features are just transformed.
        input_feature_train_array = preprocessor_obj.fit_transform(train_input_feature)
        input_feature_test_array = preprocessor_obj.transform(test_input_feature)

        # Fix: persist the preprocessor after fitting, so the saved object is
        # the fitted one rather than an unfitted one.
        save_object(file=Path(self.transformconfig.save_obj_dirpath), obj=preprocessor_obj)

        logger.info('Combining  input feature train array with train_data_output_feature---->to get train_numpy_array')
        train_numpy_array = np.c_[input_feature_train_array, np.array(train_output_feature)]
        test_numpy_array = np.c_[input_feature_test_array, np.array(test_output_feature)]

        return (
            train_numpy_array,
            test_numpy_array
        )

In [6]:
# Move the working directory one level up from wherever the kernel currently is.
# NOTE: the path is relative, so re-running this cell moves up one more level each time.
os.chdir('../')
# Show the resulting working directory as the cell's output.
os.getcwd()

'd:\\Wine_ML_AlGO\\WineQualityModel'

In [7]:
# Step 6) Update the training pipeline.
try:
    # Load the YAML-backed configuration and extract the data-transformation settings.
    cm = ConfigurationManager()
    data_transform_config = cm.get_data_transformation_config()

    # Run the stage. initiate_data_transformation() builds the preprocessor itself,
    # so a separate get_data_transformation() call (whose result was discarded)
    # is not needed here.
    dt = DataTransformation(transformconfig=data_transform_config)
    train_array, test_array = dt.initiate_data_transformation()

except Exception as e:
    # Re-raise as the project's exception type, keeping the original as the cause.
    raise CustomException(e, sys) from e

[2024-09-30 12:09:53,102]-33-Reading yaml file config\config.yaml
[2024-09-30 12:09:53,148]-33-Reading yaml file param.yaml
[2024-09-30 12:09:53,150]-33-Reading yaml file schema.yaml
[2024-09-30 12:09:53,174]-46-Creating Directory
[2024-09-30 12:09:53,176]-46-Creating Directory
[2024-09-30 12:09:53,187]-50-Directory artifacts/data_transformation created
[2024-09-30 12:09:53,533]-21-Numeric column from input feature
['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'Id']
[2024-09-30 12:09:53,535]-22-Categorical column from input feature
[]
[2024-09-30 12:09:53,537]-29-Numeric Pipeline feature
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaling', StandardScaler(with_mean=False))])
[2024-09-30 12:09:53,543]-36-Categorical Pipeline feature
Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot'