In [2]:
import os 
os.chdir("/Users/ruchirkanthgandikota/Documents/ML Projects/Wine Quality/end-to-end-mlproject")
%pwd

'/Users/ruchirkanthgandikota/Documents/ML Projects/Wine Quality/end-to-end-mlproject'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class TransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory that receives the stage's output files.
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path

In [4]:
from mlproject.constants import *
from mlproject.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    Constructing an instance reads the config, params and schema files and
    creates the directory named by ``artifacts_root`` in the config file.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the params YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> TransformationConfig:
        """Build the configuration for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``TransformationConfig`` built from the ``data_transformation``
            section of the config file.
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        # The dataclass declares both fields as Path, so convert the raw
        # config values instead of passing them through unchanged.
        return TransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )

In [6]:
import os
from mlproject import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
class DataTransformation:
    """Load the dataset, clean feature outliers and write a train/test split.

    File locations come from the config object passed at construction:
    ``data_path`` is the CSV that is read and ``root_dir`` is the directory
    where ``train.csv`` and ``test.csv`` are written.
    """

    def __init__(self, config: "TransformationConfig"):
        self.config = config

    def convert_target_variable(self):
        """Read the CSV at ``config.data_path`` and return it as a DataFrame.

        The binarisation of ``quality`` suggested by the method name is
        currently disabled (see the commented-out line), so the data is
        returned exactly as read.
        """
        data = pd.read_csv(self.config.data_path)
        # data['quality'] = data['quality'].apply(lambda x: 1 if x >= 7 else 0)
        return data

    @staticmethod
    def handle_outliers_iqr(df, target_column="quality"):
        """Replace IQR outliers in every numeric feature column with the median.

        A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR`` of its own column. The target column is never
        modified, wherever it sits in the frame, and non-numeric columns are
        left as they are. Missing values are not treated as outliers.

        Declared static because it uses no instance state; it can be called
        on the class or on an instance.

        Args:
            df: Input DataFrame. It is not modified.
            target_column: Name of the column to leave untouched.

        Returns:
            A cleaned copy of ``df``.

        Raises:
            KeyError: If ``target_column`` is not a column of ``df``.
        """
        if target_column not in df.columns:
            raise KeyError(f"target column {target_column!r} not found in DataFrame")

        # Work on a copy so the caller's frame stays intact.
        df_cleaned = df.copy()

        numeric_columns = df_cleaned.select_dtypes(include="number").columns
        for column in numeric_columns:
            if column == target_column:
                continue

            values = df_cleaned[column]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1

            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            # Comparisons with NaN are False, so missing values are kept.
            is_outlier = (values < lower_bound) | (values > upper_bound)
            df_cleaned[column] = values.mask(is_outlier, values.median())

        return df_cleaned

    def train_test_spliting(self, data, test_size=0.25, random_state=42):
        """Split ``data`` and write ``train.csv`` / ``test.csv`` to ``root_dir``.

        Args:
            data: DataFrame to split.
            test_size: Fraction of rows in the test set (0.25 gives the
                75/25 split this method has always produced).
            random_state: Seed for the shuffle, so that re-running the
                notebook yields the same split. Pass ``None`` for an
                unseeded split.
        """
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"),index = False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"),index = False)

        logger.info("Splited data into training and test sets")
        logger.info(train.shape)
        logger.info(test.shape)

        print(train.shape)
        print(test.shape)
        

In [None]:
# Run the data-transformation stage end to end: load, clean outliers, split.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    # Call the loader; without the parentheses the next step would receive
    # the bound method instead of a DataFrame.
    raw_data = data_transformation.convert_target_variable()
    # The outlier helper takes only the DataFrame, so call it through the class.
    data = DataTransformation.handle_outliers_iqr(raw_data)
    data_transformation.train_test_spliting(data)
except Exception as e:
    logger.exception(e)
    # Bare raise keeps the original traceback.
    raise

[2025-02-14 12:16:19,078: INFO: common: yaml file: config/config.yml loaded successfully]
[2025-02-14 12:16:19,079: INFO: common: yaml file: params.yaml loaded successfully]
[2025-02-14 12:16:19,079: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-02-14 12:16:19,080: INFO: common: created directory at: artifacts]
[2025-02-14 12:16:19,080: INFO: common: created directory at: artifacts/data_transformation]
[2025-02-14 12:16:19,091: INFO: 4076011688: Splited data into training and test sets]
[2025-02-14 12:16:19,092: INFO: 4076011688: (1199, 12)]
[2025-02-14 12:16:19,092: INFO: 4076011688: (400, 12)]
(1199, 12)
(400, 12)
