In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Raw dataset location, relative to the notebook's working directory.
RAW_DATA_PATH = "../data/01_raw/dataset_id_106.csv"

# Load the raw CSV into the working frame used by all later cells.
df = pd.read_csv(RAW_DATA_PATH)

In [3]:
# Preview the first rows of the raw frame (head() defaults to five rows).
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,y
0,<0,12,existing paid,radio/tv,1680.0,500<=X<1000,>=7,3,male mar/wid,none,...,1684.64598,0.449565,7.102188,0.008565,0.954307,0.449565,0.610219,0.85206,0.73881,True
1,no checking,12,critical/other existing credit,new car,682.0,100<=X<500,4<=X<7,4,female div/dep/mar,none,...,1332.604887,0.974432,9.574619,0.974432,0.857462,0.85541,0.859526,0.352279,0.212032,True
2,no checking,12,existing paid,furniture/equipment,1123.0,500<=X<1000,1<=X<4,4,female div/dep/mar,none,...,1529.809197,0.352279,3.120318,0.780444,0.899411,0.04774,0.137447,0.17633,0.036748,False
3,no checking,6,existing paid,new car,3518.0,<100,1<=X<4,2,male single,guarantor,...,1594.967273,0.04774,2.374474,0.510493,0.76708,0.484739,0.209614,0.418849,0.564009,False
4,0<=X<200,24,existing paid,furniture/equipment,3069.0,100<=X<500,>=7,4,male single,none,...,185.695469,0.510493,8.670796,0.391514,0.513463,0.696148,0.995817,0.607471,0.819747,False


In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   checking_status         100 non-null    object 
 1   duration                100 non-null    int64  
 2   credit_history          100 non-null    object 
 3   purpose                 100 non-null    object 
 4   credit_amount           100 non-null    float64
 5   savings_status          100 non-null    object 
 6   employment              100 non-null    object 
 7   installment_commitment  100 non-null    int64  
 8   personal_status         100 non-null    object 
 9   other_parties           100 non-null    object 
 10  residence_since         100 non-null    int64  
 11  property_magnitude      100 non-null    object 
 12  age                     100 non-null    int64  
 13  other_payment_plans     100 non-null    object 
 14  housing                 100 non-null    obj

In [5]:
# Collect the names of every object-dtype column; later cells treat these
# as the categorical features.
categorical_columns = df.select_dtypes(include="object").columns
print(categorical_columns.tolist())

['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker', 'health_status']


In [6]:
# Print the frequency of each category, one categorical column at a time,
# with blank lines separating the columns.
for column_name in categorical_columns:
    category_counts = df[column_name].value_counts()
    print(category_counts)
    print("\n")

no checking    47
0<=X<200       28
<0             17
>=200           8
Name: checking_status, dtype: int64


existing paid                     55
critical/other existing credit    28
delayed previously                 9
all paid                           5
no credits/all paid                3
Name: credit_history, dtype: int64


radio/tv               34
new car                23
furniture/equipment    15
used car                8
business                8
education               5
domestic appliance      3
repairs                 2
other                   2
Name: purpose, dtype: int64


<100                52
no known savings    21
100<=X<500          16
500<=X<1000          6
>=1000               5
Name: savings_status, dtype: int64


1<=X<4        38
>=7           24
4<=X<7        20
<1            12
unemployed     6
Name: employment, dtype: int64


male single           57
female div/dep/mar    31
male mar/wid          11
male div/sep           1
Name: personal_status, dtype: int6

In [7]:
# Label-encode the features noted as having two categories, plus the target `y`.
#
# One encoder is fitted per column and stored in `label_encoders`, so the
# original categories of every encoded column stay recoverable through
# `label_encoders[col].classes_` / `label_encoders[col].inverse_transform(...)`.
# Re-fitting a single shared encoder would leave it holding only the classes
# of the last column it was fitted on.
binary_columns = ["own_telephone", "foreign_worker", "health_status"]

label_encoders = {}
for column in binary_columns + ["y"]:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# `le` still ends up as the encoder fitted on the target `y`, exactly as it
# did when one encoder was reused for every column.
le = label_encoders["y"]

1

In [9]:
# Ordinal encoding.
#
# Each feature's levels are listed from lowest to highest, and the integer
# code is the position in that list. The codes therefore follow the natural
# order of the bands rather than the frequency order printed by
# value_counts().
#
# NOTE(review): "no known savings" and "no checking" carry no amount, so they
# have no natural rank; they are placed below the lowest band here. Confirm
# that this placement suits the downstream model.
ORDINAL_LEVELS = {
    "savings_status": [
        "no known savings",
        "<100",
        "100<=X<500",
        "500<=X<1000",
        ">=1000",
    ],
    "employment": ["unemployed", "<1", "1<=X<4", "4<=X<7", ">=7"],
    "checking_status": ["no checking", "<0", "0<=X<200", ">=200"],
}

for column, levels in ORDINAL_LEVELS.items():
    codes = {level: rank for rank, level in enumerate(levels)}
    encoded = df[column].map(codes)

    # Series.map() silently turns any value missing from the dict into NaN
    # (this also happens if the cell is run twice). Fail loudly instead of
    # letting NaNs leak into the features.
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped):
        raise ValueError(f"Unmapped values in {column!r}: {list(unmapped)}")

    df[column] = encoded.astype("int64")

In [10]:
# One-hot encode the remaining categorical features that have multiple
# categories. drop_first=True removes the first level of each feature, so a
# feature with k levels yields k-1 dummy columns.
one_hot_columns = [
    "purpose",
    "credit_history",
    "personal_status",
    "housing",
    "job",
    "other_payment_plans",
    "property_magnitude",
    "other_parties",
]

df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

In [11]:
# Re-inspect the frame after the encoding cells: column names, non-null
# counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 51 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   checking_status                                100 non-null    int64  
 1   duration                                       100 non-null    int64  
 2   credit_amount                                  100 non-null    float64
 3   savings_status                                 100 non-null    int64  
 4   employment                                     100 non-null    int64  
 5   installment_commitment                         100 non-null    int64  
 6   residence_since                                100 non-null    int64  
 7   age                                            100 non-null    int64  
 8   existing_credits                               100 non-null    int64  
 9   num_dependents                                 100 non-