In [5]:
from housing.entity.config_entity import DataTransformationConfig 
from housing.entity.artifact_entity import DataIngestionArtifact,\
DataValidationArtifact,DataTransformationArtifact
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
from housing.constant import *
import yaml

In [2]:
import os
# Show the current working directory before it is changed in the next cell.
os.getcwd()

'C:\\Shubham\\Projects\\Personal\\ML_Project\\notebook'

In [3]:
# Move from the notebook folder up to the project root so that relative
# locations (config/, dataset folder) resolve. The target is derived from the
# current directory instead of a machine-specific absolute path, and the guard
# keeps the cell idempotent when it is re-run.
if os.path.basename(os.getcwd()) == 'notebook':
    os.chdir(os.path.dirname(os.getcwd()))

In [4]:
# Re-check the working directory after the chdir above.
os.getcwd()

'C:\\Shubham\\Projects\\Personal\\ML_Project'

In [6]:
def read_yaml_file(file_path:str)->dict:
    """
    Parse a YAML file and return its content.
    file_path: str location of the YAML file to read
    """
    # Opened in binary mode; decoding is left to the YAML loader.
    with open(file_path, 'rb') as stream:
        parsed_content = yaml.safe_load(stream)
    return parsed_content

In [7]:
# Build the dataset paths from the working directory (the project root after
# the earlier chdir) instead of a machine-specific absolute path, the same way
# schema_yaml_path is built further down.
train_file_path = os.path.join(os.getcwd(), 'Dumy Dataset', 'loan_train.csv')
test_file_path = os.path.join(os.getcwd(), 'Dumy Dataset', 'loan_test.csv')

In [8]:
# Load the training CSV and summarise its columns, dtypes and non-null counts.
train_df = pd.read_csv(train_file_path)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14166 entries, 0 to 14165
Data columns (total 10 columns):
longitude             14166 non-null float64
latitude              14166 non-null float64
housing_median_age    14166 non-null float64
total_rooms           14166 non-null float64
total_bedrooms        14166 non-null float64
population            14166 non-null float64
households            14166 non-null float64
median_income         14166 non-null float64
median_house_value    14166 non-null float64
ocean_proximity       14166 non-null object
dtypes: float64(9), object(1)
memory usage: 1.1+ MB


In [9]:
# The schema file is located relative to the current working directory:
# <cwd>/config/schema.yaml
schema_yaml_folder_name = 'config'
schema_yaml_file_name = 'schema.yaml'
schema_yaml_path = os.path.join(os.getcwd(),schema_yaml_folder_name,schema_yaml_file_name)
schema_yaml_path

'C:\\Shubham\\Projects\\Personal\\ML_Project\\config\\schema.yaml'

## Execution of Load Data Function

In [10]:
def load_data(file_path: str, schema_file_path: str) -> pd.DataFrame:
    """
    Read a CSV file and cast its columns to the dtypes declared in the schema.

    file_path: str location of the CSV file to read
    schema_file_path: str location of the schema YAML file; the mapping stored
        under DATASET_SCHEMA_COLUMNS_KEY gives the dtype of every column

    Returns the dataframe with every column cast to its schema dtype.
    Raises ValueError naming every CSV column that is missing from the schema.
    """
    datatset_schema = read_yaml_file(schema_file_path)
    schema = datatset_schema[DATASET_SCHEMA_COLUMNS_KEY]

    dataframe = pd.read_csv(file_path)

    # Collect all unknown columns first so a single error reports every one.
    error_message = "".join(
        f" \nColumn: [{column}] is not in the schema."
        for column in dataframe.columns
        if column not in schema
    )
    if error_message:
        raise ValueError(error_message)

    # astype returns a new object rather than casting in place, so its result
    # must be the value that is returned.
    return dataframe.astype({column: schema[column] for column in dataframe.columns})

In [11]:
# Parse the schema file and display the resulting dictionary.
datatset_schema = read_yaml_file(schema_yaml_path)
datatset_schema

{'columns': {'longitude': 'float',
  'latitude': 'float',
  'housing_median_age': 'float',
  'total_rooms': 'float',
  'total_bedrooms': 'float',
  'population': 'float',
  'households': 'float',
  'median_income': 'float',
  'median_house_value': 'float',
  'ocean_proximity': 'category'},
 'numerical_columns': ['longitude',
  'latitude',
  'housing_median_age',
  'total_rooms',
  'total_bedrooms',
  'population',
  'households',
  'median_income'],
 'categorical_columns': ['ocean_proximity'],
 'target_column': 'median_house_value',
 'domain_value': {'ocean_proximity': ['<1H OCEAN',
   'INLAND',
   'ISLAND',
   'NEAR BAY',
   'NEAR OCEAN']}}

In [15]:
# Column-name -> dtype mapping from the schema; list the column names it declares.
schema = datatset_schema[DATASET_SCHEMA_COLUMNS_KEY]
list(schema.keys())

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [15]:
# Inspect the constant used as the key for the schema's column mapping.
DATASET_SCHEMA_COLUMNS_KEY

'columns'

In [16]:
# Print the dataframe's column names so they can be compared with the schema keys.
for column in train_df.columns:
    print(column)

longitude
latitude
housing_median_age
total_rooms
total_bedrooms
population
households
median_income
median_house_value
ocean_proximity


In [17]:
# Schema column names again, for comparison with the dataframe columns printed above.
list(schema.keys())

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'median_house_value',
 'ocean_proximity']

In [18]:
# Reload the training data and try an astype cast on one column. The cast
# result is only displayed here; it is not assigned back, so train_df itself
# is left unchanged.
train_df = pd.read_csv(train_file_path)
train_df['longitude'].astype(int).head()

0   -121
1   -122
2   -118
3   -119
4   -117
Name: longitude, dtype: int32

In [19]:
# Load the train and test CSVs through load_data, using the same schema file for both.
new_train_df = load_data(train_file_path,schema_yaml_path)
new_test_df = load_data(test_file_path,schema_yaml_path)

## End of Data Load Execution

In [21]:
# Name of the target column, read from the schema.
target_column_name = datatset_schema[TARGET_COLUMN_KEY]
target_column_name

'median_house_value'

### Splitting the training and testing dataframes into input features (X) and target feature (y)

In [22]:
# y: the target column on its own; X: every remaining column.
target_feature_train_df = new_train_df[target_column_name]
input_feature_train_df = new_train_df.drop(columns=[target_column_name])

In [25]:
# y: the target column on its own; X: every remaining column.
target_feature_test_df = new_test_df[target_column_name]
input_feature_test_df = new_test_df.drop(columns=[target_column_name])

# Pipeline

In [29]:
# Numerical and categorical feature column lists, as declared in the schema.
numerical_col = datatset_schema[NUMERICAL_COLUMN_KEY]
categorical_col = datatset_schema[CATEGORICAL_COLUMN_KEY]

In [30]:
# Numerical features: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler',StandardScaler())    
])

In [31]:
# Categorical features: fill missing values with the most frequent value,
# one-hot encode, then scale without centering (with_mean=False).
cat_pipeline = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoded',OneHotEncoder()),
    ('scaler',StandardScaler(with_mean=False))
])

In [32]:
# Route each column group through its own pipeline.
processing = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_col),
    ('cat_pipeline',cat_pipeline,categorical_col)
    
])

In [37]:
def get_data_transformer_object(schema_file_path: str = None):
    """
    Build the unfitted preprocessing ColumnTransformer described by the schema.

    schema_file_path: str location of the schema YAML file; when omitted, the
        notebook-level schema_yaml_path is used (the previous behaviour)

    Returns a ColumnTransformer that sends the schema's numerical columns
    through median imputation + standard scaling, and its categorical columns
    through most-frequent imputation + one-hot encoding + scaling.
    """
    # Fall back to the notebook-level path so existing no-argument calls still work.
    if schema_file_path is None:
        schema_file_path = schema_yaml_path

    # Read schema.yaml file
    datatset_schema = read_yaml_file(schema_file_path)

    # Numerical and categorical column lists come from the schema file
    numerical_col = datatset_schema[NUMERICAL_COLUMN_KEY]
    categorical_col = datatset_schema[CATEGORICAL_COLUMN_KEY]

    num_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    cat_pipeline = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one_hot_encoded', OneHotEncoder()),
        # Scale without centering; presumably because the encoder's default
        # output is sparse and cannot be centered -- confirm.
        ('scaler', StandardScaler(with_mean=False)),
    ])

    processing = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_col),
        ('cat_pipeline', cat_pipeline, categorical_col),
    ])

    return processing

## Applying the preprocessing object to the training and testing dataframes

In [38]:
# Fit the preprocessing on the training inputs only, then apply the already
# fitted transformer to the test inputs.
preprocessing_obj = get_data_transformer_object()

input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

In [42]:
# Shape of the transformed training inputs.
input_feature_train_arr.shape

(14166, 13)

In [44]:
# Shape of the transformed test inputs.
input_feature_test_arr.shape

(6062, 13)

In [48]:
# Append the target values as an extra last column of each transformed feature array.
train_arr = np.c_[ input_feature_train_arr, np.array(target_feature_train_df)]

test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

In [51]:
# Shape after the target column has been appended to the test features.
test_arr.shape

(6062, 14)

In [53]:
def save_numpy_array_data(file_path: str, array: np.array):
    """
    Save numpy array data to file
    file_path: str location of file to save
    array: np.array data to save

    Missing parent directories are created. A bare file name (no directory
    part) is written to the current working directory.
    """
    dir_path = os.path.dirname(file_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one to create.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(file_path, 'wb') as file_obj:
        np.save(file_obj, array)

In [64]:
def save_object(file_path:str,obj):
    """
    Serialize an object to a file with dill.
    file_path: str location of file to save
    obj: Any sort of object

    Missing parent directories are created. A bare file name (no directory
    part) is written to the current working directory.
    """
    # dill is not imported by the notebook's import cell, so import it here;
    # without this the dill.dump call below raises NameError.
    import dill

    dir_path = os.path.dirname(file_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one to create.
    if dir_path:
        os.makedirs(dir_path, exist_ok=True)
    with open(file_path, "wb") as file_obj:
        dill.dump(obj, file_obj)