In [1]:
import sys
import os
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object

In [16]:
def get_data_transformer_object():
    """Build the (unfitted) preprocessing ``ColumnTransformer``.

    Two sub-pipelines are combined:

    * numerical columns (``writing_score``, ``reading_score``): median
      imputation followed by standard scaling;
    * categorical columns: most-frequent imputation, one-hot encoding, then
      scaling with ``with_mean=False`` (centering is not supported on the
      sparse matrix the encoder produces by default).

    The one-hot encoder is created with ``handle_unknown='ignore'`` so that a
    category that was absent when the transformer was fitted is encoded as an
    all-zero block at transform time instead of raising ``ValueError``.

    Returns:
        ColumnTransformer: the unfitted transformer; call ``fit_transform`` on
        the training features and ``transform`` on any other split.

    Raises:
        CustomException: wraps any exception raised while assembling the
        pipelines.
    """
    try:
        numerical_columns = ['writing_score', 'reading_score']
        categorical_columns = [
            'gender',
            'race_ethnicity',
            'parental_level_of_education',
            'lunch',
            'test_preparation_course',
        ]

        num_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]
        )
        cat_pipeline = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                # 'ignore' keeps transform() from failing on categories that
                # only appear in the test / inference data.
                ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
                ('scaler', StandardScaler(with_mean=False)),
            ]
        )
        logging.info("Numerical and categorical pipelines created")

        preprocessor = ColumnTransformer(
            transformers=[
                ('num_pipeline', num_pipeline, numerical_columns),
                ('cat_pipeline', cat_pipeline, categorical_columns),
            ]
        )
        return preprocessor
    except Exception as e:
        logging.info("An exception occurred in get_data_transformer_object")
        raise CustomException(e, sys)

In [6]:
# Read the train and test CSV files into DataFrames (paths are relative to the
# notebook's working directory).
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')
test_df = pd.read_csv(filepath_or_buffer='data/test.csv')

In [19]:
# Build the preprocessing ColumnTransformer (not fitted yet at this point) and
# print its configuration for inspection.
preprocessing_obj = get_data_transformer_object()
print(preprocessing_obj)

ColumnTransformer(transformers=[('num_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['writing_score', 'reading_score']),
                                ('cat_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('one_hot_encoder',
                                                  OneHotEncoder()),
                                                 ('scaler',
                                                  StandardScaler(with_mean=False))]),
                                 ['gender', 'race_ethnicity',
                                  'parental_level_of_education', 'lunch',
                                 

In [23]:
# Column to predict; it is removed from the input-feature frames below.
target_column_name = "math_score"
numerical_columns = ["writing_score", "reading_score"]

# Training split: pull out the target, keep everything else as input features.
target_feature_train_df = train_df[target_column_name]
input_feature_train_df = train_df.drop(columns=target_column_name)
print(input_feature_train_df)

# Test split: same separation.
target_feature_test_df = test_df[target_column_name]
input_feature_test_df = test_df.drop(columns=target_column_name)


     gender race_ethnicity parental_level_of_education         lunch  \
0    female        group D             master's degree      standard   
1    female        group C           bachelor's degree  free/reduced   
2    female        group D                some college  free/reduced   
3      male        group C             master's degree  free/reduced   
4      male        group E                 high school      standard   
..      ...            ...                         ...           ...   
795  female        group D             master's degree      standard   
796    male        group C           bachelor's degree      standard   
797  female        group C          associate's degree      standard   
798    male        group C                some college  free/reduced   
799  female        group D          associate's degree      standard   

    test_preparation_course  reading_score  writing_score  
0                      none             70             75  
1              

In [24]:
 # Fit the preprocessor on the training features and transform them in one step.
 # NOTE(review): this line starts with a stray leading space. The cell appears to
 # have run in the notebook, but the same text would be an IndentationError in a
 # plain .py export; consider removing the space if it exists in the notebook.
 input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)

In [27]:
# Show the first transformed training row as a quick sanity check of the
# scaled / one-hot-encoded layout.
print(input_feature_train_arr[0])

[0.43405338 0.03079054 2.00276196 0.         0.         0.
 0.         2.30177946 0.         0.         0.         0.
 4.25249263 0.         0.         0.         2.10183809 0.
 2.09830697]


In [29]:
# Apply the preprocessor to the test features with transform() only, so the
# statistics and categories learned by the earlier fit_transform() call are
# reused rather than refitted on test data.
input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)
print(input_feature_test_arr)

[[ 1.03103224  1.13786619  2.00276196 ...  2.10183809  0.
   2.09830697]
 [ 0.30139141 -0.24597837  2.00276196 ...  0.          2.09830697
   0.        ]
 [ 0.23506043  0.23836723  0.         ...  2.10183809  0.
   2.09830697]
 ...
 [ 0.50038436  0.79190505  2.00276196 ...  0.          0.
   2.09830697]
 [ 0.36772239  0.51513614  2.00276196 ...  2.10183809  0.
   2.09830697]
 [ 0.23506043  0.30755945  2.00276196 ...  2.10183809  2.09830697
   0.        ]]


In [31]:
# Append the target values as the last column of the transformed training
# features, giving one combined 2-D array.
train_arr = np.column_stack(
    (input_feature_train_arr, np.array(target_feature_train_df))
)
print(train_arr)

[[ 4.34053378e-01  3.07905414e-02  2.00276196e+00 ...  0.00000000e+00
   2.09830697e+00  6.20000000e+01]
 [ 9.64701252e-01  9.30289504e-01  2.00276196e+00 ...  2.09830697e+00
   0.00000000e+00  6.60000000e+01]
 [ 1.16369420e+00  1.34544287e+00  2.00276196e+00 ...  0.00000000e+00
   2.09830697e+00  7.90000000e+01]
 ...
 [-1.02522827e+00 -5.22747282e-01  2.00276196e+00 ...  0.00000000e+00
   2.09830697e+00  5.30000000e+01]
 [-1.02522827e+00 -1.49143847e+00  0.00000000e+00 ...  2.09830697e+00
   0.00000000e+00  5.00000000e+01]
 [ 1.36268716e+00  1.48382733e+00  2.00276196e+00 ...  0.00000000e+00
   2.09830697e+00  8.50000000e+01]]
