In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa

In [2]:
df = pd.read_csv(r"..\data\raw_german_credit_data.csv")

In [3]:
df.sample(10)

Unnamed: 0.1,Duration,Checking account,Credit amount,Purpose,Job,Sex,data_source,Saving accounts,Housing,Risk,Age,Unnamed: 0
2101,18.0,,1800,radio/TV,2.0,male,kaggle,little,own,good,24,698.0
4657,12.0,,976,radio/TV,2.0,male,kaggle,,own,good,35,803.0
1426,6.0,,1740,radio/TV,2.0,male,kaggle,little,rent,good,30,960.0
3711,36.0,moderate,2323,radio/TV,2.0,male,kaggle,little,rent,good,24,101.0
3928,48.0,,6110,education,2.0,male,kaggle,little,free,good,31,36.0
495,10.0,,1597,car,1.0,male,kaggle,quite rich,,good,40,413.0
2238,9.0,moderate,1501,education,3.0,female,kaggle,little,own,bad,34,195.0
87,10.0,,1597,car,1.0,male,kaggle,quite rich,,good,40,413.0
1754,10.0,,1546,car,1.0,male,kaggle,little,own,good,31,
1305,36.0,,4686,car,3.0,male,kaggle,little,free,good,32,511.0


In [4]:
nans = ["NA", "N/A", "null", "NULL", "nan", "NaN", "", " ", "?"]
for col in df.columns:
    if len(df[df[col].isin(nans)]) > 0:
        print(f"Weird Nan representation found for column {col}")

No weird representation for Nan values found for columns

In [5]:
for col in df.columns:
    assert len(df[pd.isna(df[col])]) + len(df[df[col].notna()]) == len(df[col]), (
        f"Nan + not nan assertion invalid for column {col}"
    )

We can see that indeed all nan + non nan values match up to the column length.

I'll get rid of Checking account column, because it contains too much Nans in it.

Now let's get rid off data_source column, due to it only contains the value kaggle in it. 

In [6]:
df.drop(columns=["data_source", "Checking account"], inplace=True)

Now, let's fix the dtypes
|Column | Description | Type |
|-|-|-|
| Duration | Credit's estimated duration in Months | Numerical continuous |
| Credit amount | Required credit ammount | Numerical continuous |
| Purpose | What is the credit for | Categorical nominal |
| Job | 0 = unskilled and non-resident, 1 = unskilled and resident, 2 = skilled, 3 = highly skilled | Categorical ordinal |
| Sex | | Categorical nominal |
| Saving accounts | Level of savings of the applicant (e.g., little, quite rich, rich) | Categorical ordinal |
| Housing | Type of housing situation (e.g., rent, own, free) | Categorical nominal |
| Risk | bad = 0, 1 = good | Categorical binary |
| Age | Age of the credit requester | Numerical continuous |


First, let's handle categorical types

In [7]:
categorical_cols = {
    "Purpose": [],
    "Job": [0, 1, 2, 3],
    "Sex": [],
    "Saving accounts": ["little", "quite rich", "rich"],
    "Housing": [],
    "Risk": ["good", "bad"],
}

df[list(categorical_cols.keys())] = df[list(categorical_cols.keys())].astype("category")

for col, categories in categorical_cols.items():
    if len(categories) > 0:
        df[col] = pd.Categorical(df[col], categories=categories, ordered=True)

Now, let's handle numerical columns

In [None]:
numeric_cols = {
    "Duration": "Int64",  # has to be base 64 so it can be nullable
    "Credit amount": "Int64",
    "Age": "Int64",
}

checking_df = df.copy()
for col in numeric_cols:
    checking_df[col] = checking_df[col].astype(
        str
    )  # convert to string temporarily for checking
    mask = ~checking_df[col].str.match(
        r"^-?\d+\.?\d*$", na=True
    )  # detecting weird values inside numerical columns
    print(f"Invalid values found for column {col}:", df[mask][col].unique())
    df.loc[mask, col] = np.nan

df = df.astype({col: num_type for col, num_type in numeric_cols.items()})

Invalid values found for column Duration: [nan]
Invalid values found for column Credit amount: [nan 'dfas' 'qwretryet' 'ttqweyuet']
Invalid values found for column Age: [nan 'hgd']


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Duration         4699 non-null   Int64   
 1   Credit amount    4568 non-null   Int64   
 2   Purpose          4686 non-null   category
 3   Job              4635 non-null   category
 4   Sex              4617 non-null   category
 5   Saving accounts  3306 non-null   category
 6   Housing          4622 non-null   category
 7   Risk             4705 non-null   category
 8   Age              4703 non-null   Int64   
 9   Unnamed: 0       4685 non-null   float64 
dtypes: Int64(3), category(6), float64(1)
memory usage: 195.3 KB


In [10]:
schema = pa.Table.from_pandas(df).schema

In [None]:
df.to_parquet(
    r"..\data\types_fixed_german_credit_data.parquet", index=False, schema=schema
)