In [1]:
import pandas as pd
import numpy as np
import optuna

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin


In [2]:
# Load the training table from CSV into a DataFrame; the path is relative to
# the notebook's working directory.
# NOTE(review): the folder is spelled "processsing" (three s) — confirm this
# matches the directory name on disk.
train_df = pd.read_csv("processsing/train_table_merged.csv")

In [3]:
# Print the per-column summary: verbose=True lists every column instead of a
# truncated view, and show_counts=True includes the non-null count per column.
train_df.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 211 columns):
 #    Column                            Non-Null Count   Dtype  
---   ------                            --------------   -----  
 0    SK_ID_CURR                        307511 non-null  int64  
 1    TARGET                            307511 non-null  int64  
 2    NAME_CONTRACT_TYPE                307511 non-null  object 
 3    CODE_GENDER                       307511 non-null  object 
 4    FLAG_OWN_CAR                      307511 non-null  object 
 5    FLAG_OWN_REALTY                   307511 non-null  object 
 6    CNT_CHILDREN                      307511 non-null  int64  
 7    AMT_INCOME_TOTAL                  307511 non-null  float64
 8    AMT_CREDIT                        307511 non-null  float64
 9    AMT_ANNUITY                       307499 non-null  float64
 10   AMT_GOODS_PRICE                   307233 non-null  float64
 11   NAME_TYPE_SUITE                   306

## Building the preprocessing pipeline

In [5]:
class HighNanDropper(BaseEstimator, TransformerMixin):
    """Drop columns whose fraction of missing values reaches a threshold.

    Parameters
    ----------
    nan_threshold : float, default=0.75
        A column is dropped when its share of NaN values, measured on the
        data passed to ``fit``, is greater than or equal to this value.

    Attributes
    ----------
    columns_to_drop_ : pandas.Index
        Labels of the columns selected for removal during ``fit``.
    """

    def __init__(self, nan_threshold=0.75):
        self.nan_threshold = nan_threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` are at or above the NaN threshold.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        # Mean of the boolean NaN mask gives the missing fraction per column.
        nan_share = X.isna().mean()
        too_sparse = nan_share >= self.nan_threshold
        self.columns_to_drop_ = nan_share.loc[too_sparse].index
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected during ``fit``."""
        return X.drop(columns=self.columns_to_drop_)



In [6]:
def create_impute_nan_transformer(pandas_output=False):
    """Build an unfitted ColumnTransformer that imputes NaNs by column dtype.

    Numeric columns (``np.number``) are filled with the column median and
    ``object`` columns with the constant string ``'MISSING'``. The columns of
    each group are chosen at fit time by ``make_column_selector``. Columns
    matching neither selector are not part of the output, because
    ``remainder`` is left at the ColumnTransformer default.

    Parameters
    ----------
    pandas_output : bool, default=False
        When False, the transformer is returned exactly as before: its
        ``transform`` output is a NumPy array (object dtype when numeric and
        string blocks are stacked) that carries no column names.
        When True, the transformer is configured to return a pandas DataFrame
        that keeps the original column names, so later steps can select
        columns by name. This needs a scikit-learn version that provides
        ``set_output``.

    Returns
    -------
    ColumnTransformer
        The configured, unfitted transformer. Output columns are ordered
        numeric block first, then the object block.
    """
    numerical_transformer = SimpleImputer(strategy='median')
    categorical_transformer = SimpleImputer(strategy='constant', fill_value='MISSING')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, make_column_selector(dtype_include=np.number)),
            ('cat', categorical_transformer, make_column_selector(dtype_include=object))
        ])

    if pandas_output:
        # Drop the "num__" / "cat__" prefixes so downstream code sees the
        # original column names; the two selectors are disjoint, so the
        # unprefixed names stay unique.
        preprocessor.set_params(verbose_feature_names_out=False)
        preprocessor.set_output(transform="pandas")

    return preprocessor

In [7]:
class HandCraftedFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends four ratio columns to a frame.

    ``X`` must allow column access by name for ``AMT_INCOME_TOTAL``,
    ``AMT_ANNUITY``, ``AMT_CREDIT`` and ``AMT_GOODS_PRICE``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the hand-crafted ratio columns added.

        The input frame is left unmodified. Every denominator is shifted by
        0.001 before dividing.
        """
        # Small offset added to each denominator, presumably so that a zero
        # value does not lead to a division by exactly zero.
        offset = 0.001
        enriched = X.copy()

        income = enriched["AMT_INCOME_TOTAL"]
        annuity = enriched["AMT_ANNUITY"]
        credit = enriched["AMT_CREDIT"]
        goods_price = enriched["AMT_GOODS_PRICE"]

        # Income relative to the annuity, and credit amount relative to income.
        enriched["AMT_CREDIT_LOAD"] = income / (annuity + offset)
        enriched["AMT_CREDIT_BY_INCOME"] = credit / (income + offset)

        # Goods price relative to income and to the credit amount.
        enriched["CONSUMPTION_RATE"] = goods_price / (income + offset)
        enriched["CONSUMPTION_RATE_CREDIT"] = goods_price / (credit + offset)

        return enriched

In [8]:
# Preprocessing pipeline: first drop columns with a high share of NaNs
# (HighNanDropper with its default threshold), then impute the remaining
# missing values by dtype.
# NOTE(review): the hand-crafted feature step below is disabled. Presumably
# this is because the imputer step returns a NumPy array without column
# names, which name-based column access cannot use — confirm before enabling.
preprocess_pipeline = Pipeline(steps=[
    ("high_nan_dropper", HighNanDropper()),
    ("nan_imputer",  create_impute_nan_transformer()),
   # ("handcrafted_features_builder", HandCraftedFeatures())
    
])

In [9]:
# Fit the pipeline on the full training frame and display the transformed data.
# NOTE(review): train_df is passed unchanged, so SK_ID_CURR and TARGET appear
# to go through the imputer as ordinary numeric columns — confirm whether they
# should be separated from the features before fitting.
preprocess_pipeline.fit_transform(train_df)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[100002.0, 1.0, 0.0, ..., 'Auto technology', 'low_normal',
        'POS other with interest'],
       [100003.0, 0.0, 0.0, ..., 'Consumer electronics', 'middle',
        'Cash X-Sell: low'],
       [100004.0, 0.0, 0.0, ..., 'Connectivity', 'middle',
        'POS mobile without interest'],
       ...,
       [456253.0, 0.0, 0.0, ..., 'Connectivity', 'high',
        'POS mobile with interest'],
       [456254.0, 1.0, 0.0, ..., 'Consumer electronics', 'high',
        'POS mobile with interest'],
       [456255.0, 0.0, 0.0, ..., 'XNA', 'middle', 'Cash X-Sell: middle']],
      dtype=object)