In [1]:
from src.NEFT50.Utils import Create_Directory,read_yaml_file,Save_object
from src.NEFT50.loggers import logger
from src.NEFT50.Exception import CustomException
from src.NEFT50.Constants import CONFIG_FILEPATH,PARAM_FILEPATH
import os,sys
from pathlib import Path
from dataclasses import dataclass
from sklearn.linear_model import LinearRegression

In [2]:
# Step 3) Update the entity file: each entity mirrors one section of the YAML config,
# with that section's keys declared as typed dataclass fields.
@dataclass
class ModelTrainingConfig:
    """Settings consumed by the model-training stage.

    Field names match the keys read from the ``model_training`` config section.
    """

    # Directory created for this stage before training runs.
    root_dir_path: Path
    # Location passed to Save_object when the trained model is persisted.
    save_best_model_dirpath: Path


@dataclass
class DataTransformationConfig:
    """Settings consumed by the data-transformation stage.

    Field names match the keys read from the ``data_transformation`` config section.
    """

    # Directory created for this stage before transformation runs.
    root_dir_path: Path
    # Location passed to Save_object when the preprocessor is persisted.
    save_obj_dirpath: Path
    # Directory from which train.csv and test.csv are read.
    csv_dir_path: Path



In [3]:
# Update the ConfigurationManager: it reads the YAML files, creates the artifact directories,
# and assigns the configured values to the fields of the config dataclasses.

class ConfigurationManager:
    """Loads the project YAML files and builds the per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILEPATH, param_filepath=PARAM_FILEPATH):
        """Read both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Path of the main config YAML (defaults to CONFIG_FILEPATH).
            param_filepath: Path of the parameter YAML (defaults to PARAM_FILEPATH).
        """
        # Sections are accessed by attribute below (e.g. self.config.artifacts_root),
        # so read_yaml_file must return an attribute-accessible mapping.
        self.config = read_yaml_file(config_filepath)
        self.param = read_yaml_file(param_filepath)

        # Create the root artifacts directory named in the config.
        Create_Directory([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the stage directory and return the data-transformation settings.

        Returns:
            DataTransformationConfig whose fields are ``Path`` objects, as the
            dataclass declares (the YAML values themselves are plain strings).
        """
        section = self.config.data_transformation

        # Create the data-transformation root directory before anything is written to it.
        Create_Directory([section.root_dir_path])

        return DataTransformationConfig(
            root_dir_path=Path(section.root_dir_path),
            save_obj_dirpath=Path(section.save_obj_dirpath),
            csv_dir_path=Path(section.csv_dir_path),
        )

    def get_model_training_config(self) -> ModelTrainingConfig:
        """Create the stage directory and return the model-training settings.

        Returns:
            ModelTrainingConfig whose fields are ``Path`` objects, as the
            dataclass declares.
        """
        section = self.config.model_training

        # Create the model-training root directory before anything is written to it.
        Create_Directory([section.root_dir_path])

        return ModelTrainingConfig(
            root_dir_path=Path(section.root_dir_path),
            save_best_model_dirpath=Path(section.save_best_model_dirpath),
        )

In [4]:
import pandas as pd,numpy as np,sklearn
from sklearn.pipeline import Pipeline #this class we used to create pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer #to fill null value
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin

In [5]:
# Step 5) Update the DataTransformation component; the config object is stored as an instance attribute.
class DataTransformation():
    """Builds the preprocessing pipeline and applies it to the train/test CSV files."""

    # Column names used by both methods, kept in one place so they cannot drift apart.
    FEATURE_COLUMNS = ['open', 'high', 'low', 'close', 'volume']
    TARGET_COLUMN = "target"

    def __init__(self, transformconfig: DataTransformationConfig):
        """Store the stage configuration (CSV directory and preprocessor save path)."""
        self.transformconfig = transformconfig

    @staticmethod
    def _to_dense(features):
        """Return ``features`` as a dense array so it can be stacked with ``np.c_``.

        ColumnTransformer may return a scipy sparse matrix when sparse one-hot
        output dominates; ``np.c_`` cannot concatenate that with a dense column.
        """
        return features.toarray() if hasattr(features, "toarray") else features

    def get_data_transformation(self):
        """Build and return the (unfitted) preprocessing ColumnTransformer.

        The feature columns of train.csv are split by dtype: non-object columns
        go through median imputation and scaling, object columns through
        most-frequent imputation and one-hot encoding.

        Returns:
            An unfitted sklearn ColumnTransformer.
        """
        df = pd.read_csv(os.path.join(self.transformconfig.csv_dir_path, "train.csv"))

        # Only the input features decide which columns each sub-pipeline handles.
        x = df[self.FEATURE_COLUMNS]

        num_feature_lst = x.select_dtypes(exclude='object').columns.to_list()
        cat_feature_lst = x.select_dtypes(include='object').columns.to_list()

        logger.info("Numeric column from input feature\n%s", num_feature_lst)
        logger.info("Categorical column from input feature\n%s", cat_feature_lst)

        numeric_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaling", StandardScaler(with_mean=False))
        ])
        logger.info("Numeric Pipeline feature\n%s", numeric_pipeline)

        categorical_pipeline = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])
        logger.info("Categorical Pipeline feature\n%s", categorical_pipeline)

        # Columns not listed here are dropped (ColumnTransformer's default remainder).
        preprocessor = ColumnTransformer(transformers=[
            ("num_pipeline", numeric_pipeline, num_feature_lst),
            ("cat_pipeline", categorical_pipeline, cat_feature_lst)
        ])

        return preprocessor

    def initiate_data_transformation(self):
        """Fit the preprocessor on train.csv, transform both splits, and save it.

        Returns:
            Tuple ``(train_numpy_array, test_numpy_array)``; each is the
            transformed feature matrix with the target appended as the last column.
        """
        logger.info('Reading train and test Data Using Pandas Library')
        train_data = pd.read_csv(os.path.join(self.transformconfig.csv_dir_path, "train.csv"))
        test_data = pd.read_csv(os.path.join(self.transformconfig.csv_dir_path, "test.csv"))

        target_column_name = self.TARGET_COLUMN

        train_input_feature = train_data.drop(target_column_name, axis=1)
        train_output_feature = train_data[target_column_name]

        test_input_feature = test_data.drop(target_column_name, axis=1)
        test_output_feature = test_data[target_column_name]

        preprocessor_obj = self.get_data_transformation()

        # Fit on the training inputs only; the test inputs reuse the fitted statistics.
        input_feature_train_array = self._to_dense(preprocessor_obj.fit_transform(train_input_feature))

        # Persist AFTER fitting: a preprocessor saved before fit_transform is unfitted
        # and cannot transform new data when it is loaded later.
        Save_object(filepath=Path(self.transformconfig.save_obj_dirpath), object=preprocessor_obj)

        input_feature_test_array = self._to_dense(preprocessor_obj.transform(test_input_feature))

        logger.info('Combining  input feature train array with train_data_output_feature---->to get train_numpy_array')
        train_numpy_array = np.c_[input_feature_train_array, np.array(train_output_feature)]
        test_numpy_array = np.c_[input_feature_test_array, np.array(test_output_feature)]

        return (
            train_numpy_array,
            test_numpy_array
        )

In [6]:
# Step 5) Update the component file: ModelTraining receives its config object
# and performs the model-training task.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

class ModelTraining:
    """Fits a LinearRegression on the transformed arrays and saves it if it scores well enough."""

    def __init__(self, modelconfig: ModelTrainingConfig):
        """Keep the stage configuration (it provides the model save path)."""
        self.modelconfig = modelconfig

    def initiate_model_training(self, train_numpy_array, test_numpy_array):
        """Train on the train array, evaluate on the test array, and conditionally save.

        Both arrays are indexed as 2-D with the target in the last column and the
        features in all preceding columns. The model is saved only when the test
        R2 is greater than 0.50. Returns None.
        """
        # Last column is the target; everything before it is the feature matrix.
        features_train = train_numpy_array[:, :-1]
        target_train = train_numpy_array[:, -1]
        features_test = test_numpy_array[:, :-1]
        target_test = test_numpy_array[:, -1]

        model = LinearRegression()
        model.fit(features_train, target_train)
        logger.info("Model training completed successfully.")

        predictions = model.predict(features_test)

        mse = mean_squared_error(target_test, predictions)
        rmse = np.sqrt(mse)
        r2 = r2_score(target_test, predictions)

        logger.info(f"Linear Regression Performance -> R2: {r2:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}")

        # Written as `not r2 > 0.50` (rather than `r2 <= 0.50`) so a NaN score is also rejected.
        if not r2 > 0.50:
            logger.warning("Model R² score below threshold (0.50). Not saving the model.")
            return

        Save_object(
            filepath=Path(self.modelconfig.save_best_model_dirpath),
            object=model,
        )
        logger.info("Model saved successfully as best model.")

        

In [7]:
os.chdir('../')
%pwd

'd:\\NEFT50_RegressionModel'

In [8]:
#step6) update the pipeline file
# Run the stages end to end: configuration -> data transformation -> model training.
try:
    cm = ConfigurationManager()

    # Per-stage settings (these calls also create the stage directories).
    data_transform_config = cm.get_data_transformation_config()
    model_config = cm.get_model_training_config()

    dt = DataTransformation(transformconfig=data_transform_config)

    # initiate_data_transformation() builds the preprocessor itself, so a separate
    # get_data_transformation() call here would only re-read train.csv for nothing.
    train_array, test_array = dt.initiate_data_transformation()

    mt = ModelTraining(model_config)
    mt.initiate_model_training(
        train_numpy_array=train_array,
        test_numpy_array=test_array,
    )

except Exception as e:
    # Chain explicitly so the original traceback is reported as the direct cause.
    raise CustomException(e, sys) from e

[2025-10-06 17:34:12,483]-INFO-19-Reading the YAML file config\config.yaml
{'artifacts_root': 'artifacts', 'data_ingestion': {'root_dir_path': 'artifacts/data_ingestion', 'train_test_path': 'artifacts/data_ingestion/', 'raw_file_path': 'D:\\NEFT50_RegressionModel\\INDIA VIX_minute.csv'}, 'data_transformation': {'root_dir_path': 'artifacts/data_transformation', 'save_obj_dirpath': 'artifacts/data_transformation/preprocessor.pkl', 'csv_dir_path': 'artifacts/data_ingestion/'}, 'model_training': {'root_dir_path': 'artifacts/model_training', 'save_best_model_dirpath': 'artifacts/model_training/models.pkl'}}
[2025-10-06 17:34:12,486]-INFO-23-YAML file read successfully: config\config.yaml
[2025-10-06 17:34:12,487]-INFO-19-Reading the YAML file param.yaml
{'test_key': 'test_value'}
[2025-10-06 17:34:12,488]-INFO-23-YAML file read successfully: param.yaml
[2025-10-06 17:34:12,489]-INFO-33-Creating Directory
[2025-10-06 17:34:12,489]-INFO-33-Creating Directory
[2025-10-06 17:34:12,490]-INFO-33-