In [1]:
import os 
import pandas as pd

In [2]:
%pwd

'e:\\ML_Projects_iNeuron\\iNeuron_Project_Census_data_Classification_With_MLflow\\research'

In [3]:
os.chdir('../')

In [4]:
%pwd

'e:\\ML_Projects_iNeuron\\iNeuron_Project_Census_data_Classification_With_MLflow'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class Data_preprocessing_ValidationConfig:
    root_dir: Path
    STATUS_FILE: str
    unzip_data_dir: Path
    all_schema: dict
    Preprocess_data: Path
    

In [6]:
from Mlflow_Ineuron_Project.constants import *
from Mlflow_Ineuron_Project.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_validation_config(self) -> Data_preprocessing_ValidationConfig:
        config = self.config.data_validation
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])
        create_directories([config.preprocess_data_path])

        data_validation_config = Data_preprocessing_ValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            unzip_data_dir = config.unzip_data_dir,
            all_schema=schema,
            Preprocess_data=config.preprocess_data_path
            
        )

        return data_validation_config

In [8]:
import os
from Mlflow_Ineuron_Project import logger
from  sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import joblib


In [9]:
class Data_preprocessing_Validation:
    def __init__(self, config: Data_preprocessing_ValidationConfig):
        self.config = config


    def data_transformer_object(self):
        try:
            numerical_col=['age','fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

            categorical_col=['workclass', 'education', 'marital-status', 'occupation', 'sex', 'country']
            
       
            # pipeline for numerical columns
            # handling the 
            Numeric_pipeline=Pipeline(
                steps=[
                    ("imputer",SimpleImputer(strategy="mean"))
                ]
            )

            Categorical_pipeline=Pipeline(
               steps= [
                    ("imputer",SimpleImputer(strategy="most_frequent")), 
                    # ("lowercase", lowercase_transformer),  # Convert text to lowercase
                    ("onehot_encoder",OrdinalEncoder())
                    ]
            )

            # logging.info(f"categorical Columns: {categorical_col}")
            # logging.info("pipeline is created for the Column and numeric column transformation")


            preprocessor=ColumnTransformer(
                [
                    ("Numeric_pipeline",Numeric_pipeline,numerical_col),
                    ("Categorical_pipeline",Categorical_pipeline,categorical_col),
                    # ("target_pipeline",target_pipeline,target_col)

                ]
            )

            return preprocessor
        except Exception as e:
            raise e   

    def validate_all_columns(self)-> bool:
        try:
            validation_status = None

            data = pd.read_csv(self.config.unzip_data_dir)
       # remove the extra space around the columns 
            data.columns=data.columns.str.strip()
            categorical_col = ['workclass', 'education', 'marital-status', 'occupation', 'sex', 'country', 'salary']
            for col in categorical_col:
                if col in data.columns:
                    data[col] = data[col].str.strip()

            numerical_col=['age','fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

            categorical_col=['workclass', 'education', 'marital-status', 'occupation', 'sex', 'country','salary']
            # # for col in categorical_col:
            #     if col in data.columns:
            #         data[col] = data[col].str.lower()
            #     if col in data.columns:
            #         data[col] = data[col].str.lower() 
            # print(data.head())
            data.drop(columns=['relationship', 'race','education-num'], inplace=True)
            preprocessor_object=self.data_transformer_object()
            output=data
            output=preprocessor_object.fit_transform(data)
            # label encoding 
            for i in categorical_col:
                le=LabelEncoder()
                data[i]=le.fit_transform(data[i])
            print(data.head(1))

            all_cols = list(data.columns)

            all_schema = self.config.all_schema.keys()

            print(all_schema)
            for col in all_cols:
                # print(col)
                if col not in all_schema:
                    validation_status = False
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")
                else:
                    validation_status = True
                    with open(self.config.STATUS_FILE, 'w') as f:
                        f.write(f"Validation status: {validation_status}")

            joblib.dump(preprocessor_object,os.path.join(self.config.Preprocess_data,"Preprocess_model.joblib"))
            data.to_csv(os.path.join(self.config.Preprocess_data,"preprocessed_data.csv"),index=False)

            return validation_status
        
        except Exception as e:
            raise e



In [10]:
try:
    config = ConfigurationManager()
    data_validation_config = config.get_data_validation_config()
    data_validation =Data_preprocessing_Validation(config=data_validation_config)
    data_validation.validate_all_columns()
except Exception as e:
    raise e

[2025-04-23 14:14:37,821: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-04-23 14:14:37,821: INFO: common: yaml file: params.yaml loaded successfully]
[2025-04-23 14:14:37,836: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-04-23 14:14:37,836: INFO: common: created directory at: artifacts]
[2025-04-23 14:14:37,844: INFO: common: created directory at: artifacts/data_validation]
[2025-04-23 14:14:37,847: INFO: common: created directory at: artifacts/data_preprocessed]
   age  workclass  fnlwgt  education  marital-status  occupation  sex  \
0   39          7   77516          9               4           1    1   

   capital-gain  capital-loss  hours-per-week  country  salary  
0          2174             0              40       39       0  
dict_keys(['age', 'workclass', 'fnlwgt', 'education', 'marital-status', 'occupation', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'country', 'salary'])


In [11]:
from pathlib import Path
import pickle
model=joblib.load('artifacts\data_preprocessed\Preprocess_model.joblib')

In [None]:
# data1=pd.read_csv('E:\\ML_Projects_iNeuron\\iNeuron_Project_Census_data_Classification_With_MLflow\\artifacts\\data_ingestion\\Census_dataset.csv')
# data1.columns=data1.columns.str.strip()
# categorical_col=['workclass', 'education', 'marital-status', 'occupation', 'sex', 'country']
# data1.drop(columns=['relationship', 'race','education-num'], inplace=True)
# # for col in categorical_col:
# #     if col in data1.columns:
# #         data1[col] = data1[col].str.lower()
# #     if col in data1.columns:
# #         data1[col] = data1[col].str.lower() 
# # output=model.transform(data1.head())

In [None]:
# import pandas as pd
# data1=pd.read_csv('E:\\ML_Projects_iNeuron\\iNeuron_Project_Census_data_Classification_With_MLflow\\artifacts\\data_ingestion\\Census_dataset.csv')

# categorical_col = ['workclass', 'education', 'marital-status', 'occupation', 'sex', 'country', 'salary']
# for col in categorical_col:
#     if col in data1.columns:
#         data1[col] = data1[col].str.strip()

# print(data1.head(1))
# # print("Columns rearranged successfully!")

   age  workclass  fnlwgt  education  education-num marital-status  \
0   39  State-gov   77516  Bachelors             13  Never-married   

     occupation    relationship    race   sex  capital-gain  capital-loss  \
0  Adm-clerical   Not-in-family   White  Male          2174             0   

   hours-per-week        country salary  
0              40  United-States  <=50K  


In [None]:
# print(output)

[[3.90000e+01 7.75160e+04 2.17400e+03 0.00000e+00 4.00000e+01 7.00000e+00
  9.00000e+00 4.00000e+00 1.00000e+00 1.00000e+00 3.90000e+01]
 [5.00000e+01 8.33110e+04 0.00000e+00 0.00000e+00 1.30000e+01 6.00000e+00
  9.00000e+00 2.00000e+00 4.00000e+00 1.00000e+00 3.90000e+01]
 [3.80000e+01 2.15646e+05 0.00000e+00 0.00000e+00 4.00000e+01 4.00000e+00
  1.10000e+01 0.00000e+00 6.00000e+00 1.00000e+00 3.90000e+01]
 [5.30000e+01 2.34721e+05 0.00000e+00 0.00000e+00 4.00000e+01 4.00000e+00
  1.00000e+00 2.00000e+00 6.00000e+00 1.00000e+00 3.90000e+01]
 [2.80000e+01 3.38409e+05 0.00000e+00 0.00000e+00 4.00000e+01 4.00000e+00
  9.00000e+00 2.00000e+00 1.00000e+01 0.00000e+00 5.00000e+00]]


In [12]:
custom_data_input_dict = {
                "age": [self.age],
                "workclass": [self.workclass],
                "fnlwgt": [self.fnlwgt],
                "education": [self.education],
                "marital-status": [self.marital_status],
                "occupation": [self.occupation],
                "sex": [self.sex],
                "capital-gain": [self.capital_gain],
                "capital-loss": [self.capital_loss],
                "hours-per-week": [self.hours_per_week],
                "country": [self.country]
            }

desired_order = [
                'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week',
                'workclass', 'education', 'marital-status', 'occupation', 'sex', 'country'
            ]

            # Reorder the dictionary
ordered_data = {key: custom_data_input_dict[key] for key in desired_order if key in custom_data_input_dict}

            # Convert to DataFrame if needed
           
df=pd.DataFrame(ordered_data)

NameError: name 'self' is not defined

In [None]:
df