In [None]:
import os


In [None]:
%pwd

'c:\\Users\\Subil Varghese Jacob\\OneDrive\\Desktop\\Books\\Oil and gas\\Well_Optimization_ML\\research'

In [7]:
# Move up to the project root so that relative paths (e.g. config/config.yaml)
# resolve. The guard makes the cell idempotent: re-running it after the working
# directory is already the project root no longer climbs one level higher.
# NOTE(review): assumes config/config.yaml exists only at the project root.
if not os.path.exists(os.path.join("config", "config.yaml")):
    os.chdir("../")

In [8]:
%pwd

'c:\\Users\\Subil Varghese Jacob\\OneDrive\\Desktop\\Books\\Oil and gas\\Well_Optimization_ML'

In [None]:
from pathlib import Path
from dataclasses import dataclass


@dataclass(frozen=True)
class DataTransformationConfig:
    """Read-only collection of the paths used by the data-transformation stage.

    Instances are frozen: attributes cannot be reassigned after construction.
    """

    # Directory created for this stage before any file is written.
    root_dir: Path
    # CSV that is loaded and split into train/test sets.
    data_path: Path
    # Destination (and later source) of the raw training split.
    train_data_path: Path
    # Destination (and later source) of the raw test split.
    test_data_path: Path
    # CSV receiving the preprocessed training features plus target.
    train_model_data: Path
    # CSV receiving the preprocessed test features plus target.
    test_model_data: Path
    # File in which the fitted preprocessing object is saved.
    preprocessor_obj_file_path: Path

In [None]:
from mlProject.constants import *
from mlProject.utils.common import read_yaml,create_directories,save_object

class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects."""

    def __init__(self,
                 config_filepath= CONFIG_FILE_PATH,
                 schema_filepath= SCHEMA_FILE_PATH,
                 param_filepath = PARAM_FILE_PATH):
        """Read the config, params and schema YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            schema_filepath: Path of the schema YAML.
            param_filepath: Path of the parameters YAML.
        """
        # Use the paths supplied by the caller. The previous implementation always
        # read the module-level constants, silently ignoring these arguments.
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(param_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_roots])

    def get_data_transformation_config(self)->DataTransformationConfig:
        """Build the ``DataTransformationConfig`` from the ``data_transformation`` section.

        The stage's ``root_dir`` is created as a side effect. Values are passed
        through exactly as they were read from the configuration file.

        Returns:
            DataTransformationConfig: Paths for the data-transformation stage.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            train_data_path=section.train_data_path,
            test_data_path=section.test_data_path,
            train_model_data=section.train_model_data,
            test_model_data=section.test_model_data,
            preprocessor_obj_file_path=section.preprocessor_obj_file_path,
        )




In [32]:
import os
from mlProject import logger
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import pandas as pd
import numpy as np


In [None]:
class DataTransfromation:
    """Split the raw dataset, preprocess it and persist the results.

    Every file location is taken from the ``DataTransformationConfig`` given to
    the constructor.
    """

    def __init__(self,config:DataTransformationConfig):
        """Store the stage configuration.

        Args:
            config: Paths used for reading inputs and writing outputs.
        """
        self.config=config

    def train_test_spliting(self, test_size=0.25, random_state=42):
        """Split the source CSV into train and test CSV files.

        Args:
            test_size: Fraction of rows placed in the test set. The default of
                0.25 matches scikit-learn's default split (0.75 / 0.25).
            random_state: Seed for the shuffle so that re-running the notebook
                reproduces the same split. Pass ``None`` for an unseeded split.
        """
        data = pd.read_csv(self.config.data_path)

        # A fixed seed keeps the written artifacts stable across
        # "Restart Kernel -> Run All".
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )
        train.to_csv(self.config.train_data_path, index=False)
        test.to_csv(self.config.test_data_path, index=False)

        logger.info("Splitted data into train and test data")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)

    def get_data_transformation(self):
        """Build the (unfitted) preprocessing ``ColumnTransformer``.

        Categorical columns are one-hot encoded and numerical columns are
        standardised. Columns not listed here are dropped by the transformer's
        default ``remainder`` setting.

        Returns:
            ColumnTransformer: The unfitted preprocessing object.
        """
        numerical_features = [
            'permeability_md', 'porosity_fraction', 'net_to_gross', 'thickness_ft',
            'well_depth_ft', 'tubing_diameter_in', 'choke_size_64th',
            'reservoir_pressure_psi', 'reservoir_temp_f', 'bottomhole_pressure_psi',
            'wellhead_pressure_psi', 'oil_gravity_api', 'gas_oil_ratio_scf_bbl',
            'water_cut_fraction', 'fvf_oil', 'oil_viscosity_cp', 'oil_rate_bbl_day',
            'gas_rate_scf_day', 'water_rate_bbl_day', 'productivity_index',
            'oil_price_usd_bbl', 'gas_price_usd_mcf', 'daily_opex_usd',
            'drilling_cost_usd', 'completion_cost_usd', 'total_capex_usd',
            'daily_revenue_usd', 'oil_cut', 'profit_per_barrel',
            'production_efficiency', 'economic_efficiency', 'ranking_score',
            'well_age_days', 'production_months', 'days_since_workover',
            'pressure_drawdown', 'total_liquid_rate', 'productivity_factor',
        ]
        categorical_features = [
            'well_type', 'completion_type', 'artificial_lift', 'depth_category',
        ]
        logger.info(f"Numerical Features {numerical_features}")
        logger.info(f"Categorical Features {categorical_features}")

        num_transformer = StandardScaler()
        # Categories that appear only in the test split would otherwise make
        # transform() raise; "ignore" encodes them as an all-zero row instead.
        oh_transformer = OneHotEncoder(handle_unknown="ignore")

        # sparse_threshold=0 guarantees a dense array, which the np.c_
        # concatenation in intiate_data_transfromation() relies on.
        return ColumnTransformer(
            [
                ("OneHotEncoder", oh_transformer, categorical_features),
                ("StandardScaler", num_transformer, numerical_features),
            ],
            sparse_threshold=0,
        )

    def intiate_data_transfromation(self):
        """Fit the preprocessor on the train split and transform both splits.

        The preprocessor is fitted on the training data only and then applied to
        the test data. The transformed features, with the target appended as the
        last column, are written to ``train_model_data`` / ``test_model_data``
        and the fitted preprocessor is saved to ``preprocessor_obj_file_path``.

        Returns:
            tuple: ``(train_arr, test_arr, preprocessor_obj_file_path)``.
        """
        train_df = pd.read_csv(self.config.train_data_path)
        test_df = pd.read_csv(self.config.test_data_path)

        logger.info("Reading train and test data completed")

        logger.info("Getting processing object")
        preprocessing_obj = self.get_data_transformation()

        target_column_name = 'performance_index'

        input_feature_train_df = train_df.drop(columns=[target_column_name])
        target_feature_train_df = train_df[target_column_name]

        input_feature_test_df = test_df.drop(columns=[target_column_name])
        target_feature_test_df = test_df[target_column_name]

        logger.info("Applying processsing object on training dataframe and testing dataframe")

        # Fit on train only; the test split is transformed with the train statistics.
        input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
        input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

        # Append the target as the final column of each array.
        train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
        test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

        pd.DataFrame(train_arr).to_csv(self.config.train_model_data, index=False)
        pd.DataFrame(test_arr).to_csv(self.config.test_model_data, index=False)

        save_object(
            file_path=self.config.preprocessor_obj_file_path,
            obj=preprocessing_obj,
        )

        return (
            train_arr,
            test_arr,
            self.config.preprocessor_obj_file_path,
        )



In [41]:
# Run the data-transformation stage end to end: load configuration, split the
# raw data, then fit/apply the preprocessor and persist the results.
config=ConfigurationManager()
data_transfromation_config=config.get_data_transformation_config()
data_transformation=DataTransfromation(config=data_transfromation_config)
data_transformation.train_test_spliting()
# intiate_data_transfromation() builds the preprocessor itself, so a separate
# get_data_transformation() call here would only duplicate work and log output.
# Binding the result to names also stops the cell from echoing the large arrays.
train_arr, test_arr, preprocessor_path = data_transformation.intiate_data_transfromation()

[2025-11-09 22:34:11,691:INFO:common:yaml file <_io.TextIOWrapper name='config\\config.yaml' mode='r' encoding='utf-8'> loaded succesfully]
[2025-11-09 22:34:11,693:INFO:common:yaml file <_io.TextIOWrapper name='params.yaml' mode='r' encoding='utf-8'> loaded succesfully]
[2025-11-09 22:34:11,696:INFO:common:yaml file <_io.TextIOWrapper name='schema.yaml' mode='r' encoding='utf-8'> loaded succesfully]
[2025-11-09 22:34:11,697:INFO:common:created directories artifacts]
[2025-11-09 22:34:11,698:INFO:common:created directories artifacts/data_transformation]
[2025-11-09 22:34:11,726:INFO:1406836251:Splitted data into train and test data]
[2025-11-09 22:34:11,727:INFO:1406836251:(375, 45)]
[2025-11-09 22:34:11,727:INFO:1406836251:(125, 45)]
(375, 45)
(125, 45)
[2025-11-09 22:34:11,728:INFO:1406836251:Numerical Features ['permeability_md', 'porosity_fraction', 'net_to_gross', 'thickness_ft', 'well_depth_ft', 'tubing_diameter_in', 'choke_size_64th', 'reservoir_pressure_psi', 'reservoir_temp_f'