In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Read the merged train and test tables from CSV in one pass; the order of the
# path tuple determines which frame is bound to which name.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "processsing/train_table_merged.csv",
        "processsing/test_table_merged.csv",
    )
)


## Dropping columns with a high NaN rate

In [3]:
# Columns removed from both tables because of their high NaN rate.
# NOTE(review): the threshold used to pick these columns is not recorded in
# this notebook -- document it alongside the list if it is known.
COLS_TO_DROP_HIGH_NAN = ['OWN_CAR_AGE', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'FLOORSMIN_AVG',
       'LIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_AVG', 'YEARS_BUILD_MODE',
       'COMMONAREA_MODE', 'FLOORSMIN_MODE', 'LIVINGAPARTMENTS_MODE',
       'NONLIVINGAPARTMENTS_MODE', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI',
       'FLOORSMIN_MEDI', 'LIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
       'FONDKAPREMONT_MODE', 'meanMONTHS_BALANCE_mean',
       'STD_MONTHS_BALANCE_mean', 'lastSTATUS_mode', 'modeSTATUS_mode',
       'RATE_INTEREST_PRIMARY_mean', 'RATE_INTEREST_PRIVILEGED_mean',
       'MONTHS_BALANCE_mean', 'AMT_BALANCE_mean',
       'AMT_CREDIT_LIMIT_ACTUAL_mean', 'AMT_DRAWINGS_ATM_CURRENT_mean',
       'AMT_DRAWINGS_CURRENT_mean', 'AMT_DRAWINGS_OTHER_CURRENT_mean',
       'AMT_DRAWINGS_POS_CURRENT_mean', 'AMT_INST_MIN_REGULARITY_mean',
       'AMT_PAYMENT_CURRENT_mean', 'AMT_PAYMENT_TOTAL_CURRENT_mean',
       'AMT_RECEIVABLE_PRINCIPAL_mean', 'AMT_RECIVABLE_mean',
       'AMT_TOTAL_RECEIVABLE_mean', 'CNT_DRAWINGS_ATM_CURRENT_mean',
       'CNT_DRAWINGS_CURRENT_mean', 'CNT_DRAWINGS_OTHER_CURRENT_mean',
       'CNT_DRAWINGS_POS_CURRENT_mean', 'CNT_INSTALMENT_MATURE_CUM_mean',
       'SK_DPD_mean', 'SK_DPD_DEF_mean']


# Reassign instead of dropping in place, and tolerate already-missing columns,
# so that re-running this cell is a no-op rather than a KeyError.
# Trade-off: a misspelled name in the list above is now silently ignored.
train_df = train_df.drop(columns=COLS_TO_DROP_HIGH_NAN, errors="ignore")
test_df = test_df.drop(columns=COLS_TO_DROP_HIGH_NAN, errors="ignore")

## Handling missing values

In [4]:


class MissingImputer():
    """Fill missing values separately for categorical and numerical columns.

    Object-dtype columns are filled with the constant string ``"MISSING"``;
    numeric columns are filled with the per-column median learned in ``fit``.
    Columns of any other dtype (e.g. bool) are not selected by ``fit`` and
    therefore do not appear in the output of ``transform``.
    """

    def __init__(self):
        self.categorical_imputer = SimpleImputer(strategy="constant", fill_value="MISSING")
        self.numerical_imputer = SimpleImputer(strategy="median")

        self.categorical_features = []
        self.numerical_features = []
        self.categorical_dtypes = {}
        self.numerical_dtypes = {}
        # Output column order (numerical first, then categorical); set by fit().
        self.columns = []

    def fit(self, df):
        """Learn which columns to impute, their dtypes, and the fill values.

        Args:
            df: DataFrame to learn from (features only, no label column).

        Returns:
            None. The fitted state is stored on the instance.
        """
        self.categorical_features = df.select_dtypes(include=["object"]).columns
        # "number" selects every numeric width. The previous "int"/"float"
        # aliases only matched the default-width NumPy types, so e.g. int32 or
        # float32 columns were silently left out of the result.
        self.numerical_features = df.select_dtypes(include=["number"]).columns

        # Remember the original dtypes so transform() can restore them.
        self.categorical_dtypes = {col: df[col].dtype for col in self.categorical_features}
        self.numerical_dtypes = {col: df[col].dtype for col in self.numerical_features}

        # Skip an imputer that has no columns to work on instead of handing it
        # a zero-column frame.
        if len(self.categorical_features) > 0:
            self.categorical_imputer.fit(df[self.categorical_features])
        if len(self.numerical_features) > 0:
            self.numerical_imputer.fit(df[self.numerical_features])
        self.columns = list(self.numerical_features) + list(self.categorical_features)

    def transform(self, df):
        """Return a new DataFrame with missing values filled.

        Args:
            df: DataFrame containing at least the columns seen in ``fit``.

        Returns:
            DataFrame with the numerical columns first, then the categorical
            columns, cast back to the dtypes recorded in ``fit`` and carrying
            the same row index as ``df``.
        """
        num_df_imputed = self._impute_group(
            df, self.numerical_imputer, self.numerical_features, self.numerical_dtypes
        )
        cat_df_imputed = self._impute_group(
            df, self.categorical_imputer, self.categorical_features, self.categorical_dtypes
        )
        return pd.concat([num_df_imputed, cat_df_imputed], axis=1)

    @staticmethod
    def _impute_group(df, imputer, features, dtypes):
        """Impute one column group and rebuild it as a DataFrame on ``df``'s index."""
        if len(features) == 0:
            # Nothing to impute for this group; keep the index so concat lines up.
            return pd.DataFrame(index=df.index)
        values = imputer.transform(df[features])
        # index=df.index keeps the caller's row labels; without it the result
        # gets a fresh RangeIndex and later index-aligned assignments misalign.
        rebuilt = pd.DataFrame(values, columns=features, index=df.index)
        # Casting back to an integer dtype truncates a fractional fill value.
        return rebuilt.astype(dtypes)

In [5]:


# Fit the imputer on the training features only: the TARGET column is dropped
# first so the imputer never sees it.
imputer = MissingImputer()
imputer.fit(train_df.drop(columns=["TARGET"]))

In [6]:
# Impute the training features, then attach the untouched TARGET column again
# (the assignment is aligned on the row index).
train_df_imputed = (
    imputer
    .transform(train_df.drop(columns=["TARGET"]))
    .assign(TARGET=train_df["TARGET"])
)

In [7]:
# Apply the imputer that was fitted on the training features to the test table.
test_df_imputed = imputer.transform(df=test_df)

## Encoding categorical features

In [31]:

class CategoricalEncoder():
    """Ordinal-encode the object-dtype columns of a DataFrame.

    Categories that were not seen during ``fit`` are encoded as ``-1``.
    """

    def __init__(self):
        self.encoder = None
        self.cat_features = []

    def fit(self, df):
        """Record the object-dtype columns of ``df`` and fit the encoder on them."""
        self.cat_features = df.select_dtypes(include=["object"]).columns
        self.encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        )
        categorical_part = df[self.cat_features]
        self.encoder.fit(categorical_part)

    def transform(self, df):
        """Return a copy of ``df`` whose recorded columns hold their ordinal codes."""
        codes = self.encoder.transform(df[self.cat_features])
        # Work on a copy so the caller's frame is left unchanged.
        encoded_df = df.copy()
        encoded_df[self.cat_features] = codes
        return encoded_df



In [27]:
# Fit the encoder on the imputed training frame; only its object-dtype columns
# are picked up, so the numeric TARGET column is left alone.
cat_encoder = CategoricalEncoder()
cat_encoder.fit(df=train_df_imputed)

In [28]:
# Encode the categorical columns of the imputed training frame.
train_df_enc = cat_encoder.transform(df=train_df_imputed)

In [29]:
# Encode the imputed test frame with the encoder fitted on the training frame.
test_df_enc = cat_encoder.transform(df=test_df_imputed)

In [34]:
# Save the preprocessed train and test frames as CSV files.
# NOTE(review): no `index` argument is passed, so the row index is written as
# an extra leading column -- confirm that downstream readers expect it.
for encoded_frame, csv_destination in (
    (train_df_enc, "processsing/processed_train.csv"),
    (test_df_enc, "processsing/processed_test.csv"),
):
    encoded_frame.to_csv(csv_destination)