In [214]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [215]:
df = pd.read_parquet(r"../data/types_fixed_german_credit_data.parquet", engine="pyarrow")

I will not select **Duration** due to it's correlation with credit amount.

In [216]:
selected_features = [
    "Credit amount",
    "Purpose",
    "Job",
    "Sex",
    "Saving accounts",
    "Housing",
    "Risk",
    "Age",
    "Unnamed: 0"  # this column contains indexes for duplicated data
]

df = df[selected_features]
df["Job"] = df["Job"].astype("Int64")  # don't know why Job is being picked up as float
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Credit amount    4568 non-null   Int64   
 1   Purpose          4686 non-null   category
 2   Job              4635 non-null   Int64   
 3   Sex              4617 non-null   category
 4   Saving accounts  3306 non-null   category
 5   Housing          4622 non-null   category
 6   Risk             4705 non-null   category
 7   Age              4703 non-null   Int64   
 8   Unnamed: 0       4685 non-null   float64 
dtypes: Int64(3), category(5), float64(1)
memory usage: 190.4 KB


Got to drop all rows where Risk is NAN. To stratify latter target var can not have missing values.

In [217]:
df = df.dropna(subset=["Risk"])

In [218]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4705 entries, 0 to 4845
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Credit amount    4488 non-null   Int64   
 1   Purpose          4625 non-null   category
 2   Job              4513 non-null   Int64   
 3   Sex              4502 non-null   category
 4   Saving accounts  3221 non-null   category
 5   Housing          4507 non-null   category
 6   Risk             4705 non-null   category
 7   Age              4576 non-null   Int64   
 8   Unnamed: 0       4573 non-null   float64 
dtypes: Int64(3), category(5), float64(1)
memory usage: 221.5 KB


As encountered in the previous EDA, housing, sex and purpose features had some weird, probably tampered, values in them, let's ditch those.

In [219]:
weird_categories = {
    "Housing" : ["356", "43", "563"],
    "Sex" : ["353546"],
    "Purpose" : ["3", "356", "56", "6"]
}

for col, values in weird_categories.items():
    df[col] = df[col].replace(values, np.nan)

In [220]:
df.isna().sum()

Credit amount       217
Purpose              85
Job                 192
Sex                 206
Saving accounts    1484
Housing             205
Risk                  0
Age                 129
Unnamed: 0          132
dtype: int64

In [221]:
df = df.drop_duplicates(subset=["Unnamed: 0"])
df.drop(columns=["Unnamed: 0"], inplace= True)

In [222]:
def remove_outliers_iqr(df: pd.DataFrame, threshold: float = 1.5):
    """Removes rows with outliers from numeric columns given a certain treshold.

    Args:
        df (pd.DataFrame): DataFrame to remove outliers from.
        threshold (float, optional): Treshold for removing minor (1.5) or extreme outliers (3.0). Defaults to 1.5.

    Returns:
        _type_: _description_
    """
    assert 1.5 <= threshold <= 3.0, AttributeError(f"Threshold attribute must be in [1.5, 3.0] range, got {threshold}")

    temp = df.copy()
    columns = df.select_dtypes(include=["number"]).columns
    
    for col in columns:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        iqr = q3 - q1
        
        lower, upper = q1 - threshold * iqr, q3 + threshold * iqr
        
        temp = temp[
            (temp[col] >= lower) &
            (temp[col] <= upper)
        ] 
    
    return temp.reset_index(drop=True)

df = remove_outliers_iqr(df, threshold=3.0)

In [223]:
num_cols = ["Credit amount", "Age"]
cat_cols = ["Purpose", "Housing", "Sex"]
cat_ord_cols = ["Saving accounts", "Job"]

In [None]:
class OutliersRemoval(BaseEstimator, TransformerMixin):
    def __init__(self, threshold: float):
        assert 1.5 <= threshold <= 3.0
        self.threshold = threshold
        self.feature_names_in_: list[str] = None
    
    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            try:
                X = pd.DataFrame(X)
            except Exception:
                raise ValueError("Couldn't convert input data to pandas DataFrame")
        self.feature_names_in_: np.ndarray = np.asarray(X.columns)
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
        Q1, Q3 = X.quantile(0.25), X.quantile(0.75)
        IQR: float = Q3 - Q1
        mask: function = ~((X < (Q1 - self.threshold * IQR)) | (X > (Q3 + self.threshold * IQR))).any(axis=1)
        X.loc[mask] = np.nan
        return X
    
    def get_features_names_out(self, input_features=None) -> np.ndarray:
        return self.feature_names_in_


In [225]:
numeric_pipe = Pipeline(
    steps=[
        ("outlier_removal", OutliersRemoval(3.0)),
        ("imputer", KNNImputer(n_neighbors=3))
    ]
)

cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(drop="first"))
    ]
)

cat_ord_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OrdinalEncoder())
    ]
)

ordinal_transformers = [(f"{col}_ordinal", cat_ord_pipe, [col]) for col in cat_ord_cols]


preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipe, num_cols),
        ("categoric", cat_pipe, cat_cols),
    ] + ordinal_transformers
)

In [226]:
preprocessor

In [227]:
X_features = df.drop(columns=["Risk"])
Y_target = df["Risk"]#.map({"good": 1, "bad": 0})



x_train, x_test, y_train, y_test = train_test_split(
    X_features, Y_target, test_size=0.2, stratify= Y_target
)

In [228]:
preprocessed_data = preprocessor.fit(x_train)
feature_names = preprocessor.get_feature_names_out()
x_train_preprocessed = preprocessor.fit_transform(x_train)
x_train_preprocessed = pd.DataFrame(x_train_preprocessed, columns=feature_names)

AttributeError: Estimator outlier_removal does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?

In [None]:
x_train_preprocessed.head()

Unnamed: 0,numeric__Credit amount,numeric__Age,categoric__Purpose_car,categoric__Purpose_domestic appliances,categoric__Purpose_education,categoric__Purpose_furniture/equipment,categoric__Purpose_radio/TV,categoric__Purpose_repairs,categoric__Housing_own,categoric__Housing_rent,categoric__Sex_male,Saving accounts_ordinal__Saving accounts,Job_ordinal__Job
0,5248.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,3114.0,26.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1364.0,59.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1979.0,35.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4,3017.0,47.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [None]:
x_train.head()

Unnamed: 0,Credit amount,Purpose,Job,Sex,Saving accounts,Housing,Age
177,5248,car,2,male,,own,26
209,3114,furniture/equipment,2,female,little,rent,26
417,1364,radio/TV,2,male,little,own,59
539,1979,radio/TV,2,male,,own,35
211,3017,radio/TV,2,male,little,own,47
