In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

In [2]:
df = pd.read_csv(r"..\data\raw_german_credit_data.csv")

In [3]:
df.sample(10)

Unnamed: 0.1,Duration,Checking account,Credit amount,Purpose,Job,Sex,data_source,Saving accounts,Housing,Risk,Age,Unnamed: 0
4815,21.0,little,3763,car,1.0,male,kaggle,,own,good,24,906.0
849,18.0,moderate,3590,business,0.0,male,kaggle,little,own,good,40,950.0
1018,15.0,,3029,car,2.0,male,kaggle,little,own,good,33,749.0
571,30.0,little,11998,repairs,1.0,male,kaggle,little,own,bad,34,274.0
2103,36.0,little,5493,car,2.0,male,kaggle,little,free,good,42,501.0
1636,15.0,,1979,radio/TV,2.0,male,kaggle,,own,good,35,625.0
4045,12.0,,1493,radio/TV,2.0,female,kaggle,little,own,good,34,638.0
3536,9.0,,1313,furniture/equipment,2.0,male,kaggle,little,own,good,20,428.0
3737,24.0,,1516,radio/TV,1.0,female,kaggle,rich,own,good,43,399.0
90,24.0,,5511,furniture/equipment,2.0,male,kaggle,moderate,own,good,25,311.0


In [4]:
nans = ['NA', 'N/A', 'null', 'NULL', 'nan', 'NaN', '', ' ', '?']
for col in df.columns:
    if len(df[df[col].isin(nans)]) > 0:
        print(f"Weird Nan representation found for column {col}")

No weird representation for Nan values found for columns

In [5]:
for col in df.columns:
    assert len(df[pd.isna(df[col])]) + len(df[df[col].notna()]) == len(df[col]), f"Nan + not nan assertion invalid for column {col}"

We can see that indeed all nan + non nan values match up to the column length.

I'll get rid of Checking account column, because it contains too much Nans in it. Think it's a good idea to get rid of Sex aswell to not bias the model.

Now let's get rid off duplicated rows, and then of the Unnamed 0 and data_source columns 

In [6]:
no_dups = (
    df
    .drop_duplicates(subset=["Unnamed: 0"], keep="first")
    .drop(columns=["Unnamed: 0", "Checking account", "data_source"])
)

In [7]:
no_dups.sample(5)

Unnamed: 0,Duration,Credit amount,Purpose,Job,Sex,Saving accounts,Housing,Risk,Age
421,27.0,5965,car,3.0,male,little,own,good,30
351,18.0,2124,furniture/equipment,2.0,female,little,rent,bad,24
134,24.0,11938,vacation/others,3.0,male,little,own,bad,39
519,21.0,1647,car,1.0,male,,own,bad,40
375,27.0,4526,furniture/equipment,1.0,male,rich,own,good,32


Now, let's fix the dtypes
|Column | Description | Type |
|-|-|-|
| Duration | Credit's estimated duration in Months | Numerical continuous |
| Credit amount | Required credit ammount | Numerical continuous |
| Purpose | What is the credit for | Categorical nominal |
| Job | 0 = unskilled and non-resident, 1 = unskilled and resident, 2 = skilled, 3 = highly skilled | Categorical ordinal |
| Sex | | Categorical nominal |
| Saving accounts | Level of savings of the applicant (e.g., little, quite rich, rich) | Categorical ordinal |
| Housing | Type of housing situation (e.g., rent, own, free) | Categorical nominal |
| Risk | bad = 0, 1 = good | Categorical binary |
| Age | Age of the credit requester | Numerical continuous |


First, let's handle categorical types

In [8]:
categorical_cols = {
    "Purpose": [], 
    "Job": [0, 1, 2, 3], 
    "Sex": [],
    "Saving accounts": ['little', 'quite rich', 'rich'], 
    "Housing": [],
    "Risk": ["good", "bad"] 
}

no_dups[list(categorical_cols.keys())] = no_dups[list(categorical_cols.keys())].astype("category")

for col, categories in categorical_cols.items():
    if len(categories) > 0:
        no_dups[col] = pd.Categorical(no_dups[col], categories=categories, ordered=True)

Now, let's handle numerical columns

In [9]:
numeric_cols = {
    "Duration" : "Int64",  # has to be base 64 so it can be nullable
    "Credit amount": "Int64",
    "Age": "Int64"
}

checking_df = no_dups.copy()
for col in numeric_cols.keys():
    checking_df[col] = checking_df[col].astype(str)  # convert to string temporarily for checking
    mask = ~checking_df[col].str.match(r'^-?\d+\.?\d*$', na=True)
    print(f"Invalid values found for column {col}:", no_dups[mask][col].unique())
    no_dups.loc[mask, col] = np.nan

no_dups = no_dups.astype({col: num_type for col, num_type in numeric_cols.items()})

Invalid values found for column Duration: [nan]
Invalid values found for column Credit amount: [nan 'dfas']
Invalid values found for column Age: [nan]


In [10]:
no_dups.info()

<class 'pandas.core.frame.DataFrame'>
Index: 966 entries, 0 to 1914
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Duration         939 non-null    Int64   
 1   Credit amount    928 non-null    Int64   
 2   Purpose          934 non-null    category
 3   Job              947 non-null    category
 4   Sex              937 non-null    category
 5   Saving accounts  667 non-null    category
 6   Housing          943 non-null    category
 7   Risk             936 non-null    category
 8   Age              958 non-null    Int64   
dtypes: Int64(3), category(6)
memory usage: 39.8 KB


Lets drop the index col

In [11]:
no_dups = no_dups.reset_index(drop=True)

In [12]:
schema = pa.Table.from_pandas(no_dups).schema

In [13]:
no_dups.to_parquet(
    r"..\data\types_fixed_german_credit_data.parquet", 
    index = False,
    schema = schema
)