In [1]:
from src.Wine.Utils import create_directory,read_yaml,download_data_from_s3,save_object
from src.Wine.loggers import logger
from src.Wine.Exception import CustomException
from src.Wine.Constants import *
import os,sys
from pathlib import Path
from dataclasses import dataclass
from xgboost import XGBRegressor

In [2]:
# The models are trained with a hyperparameter-tuning technique.
# Grid search CV tries every parameter combination defined in the YAML file, so it needs
# more compute time, and it identifies which parameters work best for this dataset.
# Random search CV samples parameter combinations at random, so it is comparatively
# faster than grid search.
# Algorithms considered for this dataset: ElasticNet, random forest regressor,
# support vector regressor, decision tree regressor and boosting regressors.

# Import the search class and the estimator classes.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
import pandas as pd,numpy as np

In [3]:
# Step 3) Update the entity file: every parameter used in the YAML file
# is declared here as a typed dataclass field.
@dataclass
class ModelTrainingConfig:
    """Settings consumed by the model-training stage."""

    # Directory created for this stage's outputs.
    root_dir_path: Path
    # Location the best-scoring model is written to.
    save_best_model_dirpath: Path
    # Hyperparameter grids; looked up by model name during training.
    all_param: dict


@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage."""

    # Directory created for this stage's outputs.
    root_dir_path: Path
    # Location the preprocessor object is written to.
    save_obj_dirpath: Path
    # Directory the CSV files are read from.
    csv_dir_path: Path
    # Mapping whose first key is used as the target column name.
    target_column: dict


In [4]:
# Step 4) Update the configuration manager: this class reads the YAML files, creates the
# required directories, builds the config objects with values taken from the YAML files,
# and returns them from its getter methods.
class ConfigurationManager():
    """Loads the YAML settings and hands out per-stage config objects."""

    def __init__(self,config_filepath = CONFIG_FILEPATH,param_filepath=PARAM_FILEPATH,schema_filepath=SCHEMA_FILEPATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: YAML file holding the stage settings and paths.
            param_filepath: YAML file holding the model parameters.
            schema_filepath: YAML file holding the schema (including ``target_column``).
        """
        self.config = read_yaml(config_filepath)
        self.param = read_yaml(param_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root is created up front, before any stage asks for its config.
        create_directory([self.config.artifacts_root])

    def get_data_transformation_config(self):
        """Create the data-transformation root directory and return its config object.

        Returns:
            DataTransformationConfig populated from the ``data_transformation``
            section of the config file and ``target_column`` from the schema file.
        """
        stage_cfg = self.config.data_transformation

        # Make sure the stage directory exists before its paths are handed out.
        create_directory([stage_cfg.root_dir_path])

        return DataTransformationConfig(
            root_dir_path=stage_cfg.root_dir_path,
            save_obj_dirpath=stage_cfg.save_obj_dirpath,
            csv_dir_path=stage_cfg.csv_dir_path,
            target_column=self.schema.target_column,
        )

    def get_model_training_config(self) ->ModelTrainingConfig:
        """Create the model-training root directory and return its config object.

        Returns:
            ModelTrainingConfig populated from the ``model_training`` section of
            the config file, with the whole parameter file as ``all_param``.
        """
        stage_cfg = self.config.model_training

        # Make sure the stage directory exists before its paths are handed out.
        create_directory([stage_cfg.root_dir_path])

        return ModelTrainingConfig(
            root_dir_path=stage_cfg.root_dir_path,
            save_best_model_dirpath=stage_cfg.save_best_model_dirpath,
            all_param=self.param,
        )

In [5]:
import pandas as pd,numpy as np,sklearn
from sklearn.pipeline import Pipeline #this class we used to create pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer #to fill null value
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin


# Step 5) Update the DataTransformation component; the config object is stored as an instance attribute.
class DataTransformation():
    """Builds the preprocessing pipeline and applies it to the train/test CSV splits.

    All paths and the target column come from the ``DataTransformationConfig``
    given to the constructor.
    """

    def __init__(self,transformconfig:DataTransformationConfig):
        # Kept on the instance; every method below reads its paths and target column.
        self.transformconfig = transformconfig

    def _target_column_name(self):
        """Return the first key of the configured ``target_column`` mapping."""
        return list(self.transformconfig.target_column.keys())[0]

    def get_data_transformation(self):
        """Create the (unfitted) preprocessor for the input features.

        Column names are discovered from ``WineQT.csv`` inside ``csv_dir_path``:
        non-object columns go through median imputation and scaling, object
        columns through most-frequent imputation and one-hot encoding.

        Returns:
            ColumnTransformer combining the numeric and categorical pipelines.
        """
        df = pd.read_csv(os.path.join(self.transformconfig.csv_dir_path,"WineQT.csv"))

        # Only the input features decide which columns each pipeline receives.
        x = df.drop(self._target_column_name(),axis=1)

        num_feature_lst = x.select_dtypes(exclude='object').columns.to_list()
        cat_feature_lst = x.select_dtypes(include='object').columns.to_list()

        logger.info("Numeric column from input feature\n%s",num_feature_lst)
        logger.info("Categorical column from input feature\n%s",cat_feature_lst)

        numeric_pipeline = Pipeline(steps=[
            ("imputer",SimpleImputer(strategy="median")),
            ("scaling",StandardScaler(with_mean=False))
        ])
        logger.info("Numeric Pipeline feature\n%s",numeric_pipeline)

        categorical_pipeline = Pipeline(steps=[
            ("imputer",SimpleImputer(strategy="most_frequent")),
            ("onehot",OneHotEncoder(handle_unknown="ignore"))
        ])
        logger.info("Categorical Pipeline feature\n%s",categorical_pipeline)

        preprocessor = ColumnTransformer(transformers=[
            ("num_pipeline",numeric_pipeline,num_feature_lst),
            ("cat_pipeline",categorical_pipeline,cat_feature_lst)
        ])

        return preprocessor

    def initiate_data_transformation(self):
        """Fit the preprocessor on the train split and transform both splits.

        The preprocessor is fitted on the training inputs only, saved to
        ``save_obj_dirpath``, and then applied to the test inputs.

        Returns:
            tuple: ``(train_numpy_array, test_numpy_array)``; in each array the
            transformed input features come first and the target is the last column.
        """
        logger.info('Reading train and test Data Using Pandas Library')
        csv_dir = self.transformconfig.csv_dir_path
        train_data = pd.read_csv(os.path.join(csv_dir,"train.csv"))
        # NOTE(review): the held-out split is assumed to be stored as test.csv next to
        # train.csv - confirm against the stage that writes the splits.
        test_data = pd.read_csv(os.path.join(csv_dir,"test.csv"))

        target_column_name = self._target_column_name()

        train_input_feature = train_data.drop(target_column_name,axis=1)
        train_output_feature = train_data[target_column_name]

        test_input_feature = test_data.drop(target_column_name,axis=1)
        test_output_feature = test_data[target_column_name]

        preprocessor_obj = self.get_data_transformation()

        # Fit on the training inputs only; the test inputs are transformed with the
        # statistics learned from the training data.
        input_feature_train_array  = preprocessor_obj.fit_transform(train_input_feature)

        # Persist the preprocessor only after fitting, so the saved object can be used
        # to transform new data directly.
        save_object(file=Path(self.transformconfig.save_obj_dirpath),obj=preprocessor_obj)

        input_feature_test_array  = preprocessor_obj.transform(test_input_feature)

        logger.info('Combining  input feature train array with train_data_output_feature---->to get train_numpy_array')
        train_numpy_array = np.c_[input_feature_train_array,np.array(train_output_feature)]
        test_numpy_array = np.c_[input_feature_test_array,np.array(test_output_feature)]

        return(
            train_numpy_array,
            test_numpy_array
        )

In [6]:
# Step 5) Update the component file: this class receives the model-training config object
# and performs the model training task with it.
class ModelTraining():
    """Tunes several regressors with grid search and saves the best-scoring one."""

    def __init__(self,modelconfig:ModelTrainingConfig):
        # Supplies the hyperparameter grids and the path the best model is saved to.
        self.modelconfig = modelconfig

    def initiate_model_training(self,train_numpy_array,test_numpy_array):
        """Grid-search every candidate model, evaluate it, and save the best one.

        Args:
            train_numpy_array: 2-D array; every column but the last is an input
                feature and the last column is the target.
            test_numpy_array: 2-D array laid out like ``train_numpy_array``.

        The model with the highest test R2 is saved to ``save_best_model_dirpath``
        only when that score is above 0.80; otherwise nothing is saved.
        """
        # Imported once per call rather than once per loop iteration.
        from sklearn.metrics import mean_squared_error, r2_score

        # The last column holds the target, everything before it the features.
        x_train, y_train = train_numpy_array[:,:-1],train_numpy_array[:,-1]
        x_test, y_test = test_numpy_array[:,:-1],test_numpy_array[:,-1]

        models = {
            "ElasticNet": ElasticNet(),
            "DecisionTreeRegressor": DecisionTreeRegressor(),
            "RandomForestRegressor": RandomForestRegressor(),
            "GradientBoostingRegressor": GradientBoostingRegressor(),
            "SVR":SVR(),
            "XGBRegressor":XGBRegressor()
        }

        # Per-model test metrics, keyed by model name.
        model_report = {}

        for model_name,model in models.items():
            logger.info(f'Training model: {model_name}')

            # A model without an entry in the parameter file gets an empty grid.
            grid_search_cv = GridSearchCV(
                estimator=model,
                param_grid=self.modelconfig.all_param.get(model_name, {}),
                scoring="neg_mean_squared_error",  # or "r2" for R² score
                n_jobs=-1,
                cv=5,
                verbose=1
            )
            grid_search_cv.fit(x_train, y_train)

            logger.info(f"Best model {model_name} and Best hyperpaprameter Value is {grid_search_cv.best_params_}")
            logger.info(f"Best model Score {model_name} is : {grid_search_cv.best_score_}")

            # Apply the winning hyperparameters to the instance held in `models` and
            # fit it, so that `models[...]` is the object that can be saved later.
            model.set_params(**grid_search_cv.best_params_)
            model.fit(x_train, y_train)

            y_pred = model.predict(x_test)

            mse = float(mean_squared_error(y_test,y_pred))
            model_report[model_name] = {
                "r2": float(r2_score(y_test, y_pred)),
                "mse": mse,
                "rmse": float(np.sqrt(mse))
            }

        print(model_report)

        # Choose the winner only after every model has been evaluated; doing this
        # inside the loop would compare against a partial report and could save a
        # model that a later candidate beats.
        best_model_name = max(model_report, key=lambda name: model_report[name]["r2"])
        best_model_score = model_report[best_model_name]["r2"]

        # Persist the winner only when its R2 is above 0.80.
        if best_model_score > 0.80:
            logger.info(f"Best model is {best_model_name} and best score is {best_model_score}")
            save_object(
                file=Path(self.modelconfig.save_best_model_dirpath),
                obj = models[best_model_name]
            )
        else:
            logger.info('No Best Score Found')


In [None]:
# Move the kernel's working directory one level up.
# NOTE(review): presumably done so that relative paths resolve from the project root - confirm.
# The move is relative, so running this cell again climbs another level.
os.chdir('../')
# Display the resulting working directory.
%pwd

In [None]:
#step6) update the pipeline file: configuration -> data transformation -> model training
try:
    # Reads the YAML files and creates the artifact directories.
    cm = ConfigurationManager()
    data_transform_config = cm.get_data_transformation_config()
    model_config = cm.get_model_training_config()

    dt = DataTransformation(transformconfig=data_transform_config)

    # initiate_data_transformation() builds the preprocessor itself, so a separate
    # get_data_transformation() call whose result is discarded is not needed here.
    train_array,test_array = dt.initiate_data_transformation()

    mt = ModelTraining(model_config)
    mt.initiate_model_training(
        train_numpy_array=train_array,
        test_numpy_array=test_array,
    )

except Exception as e:
    # Chain the original exception so its traceback stays attached to the wrapper.
    raise CustomException(e,sys) from e