In [38]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'matplotlib'

# Data Collection

In [2]:
# Download dataset 144 from the UCI ML repository (requires network access).
data = fetch_ucirepo(id=144)

# Split the payload into the feature table and the target table.
X, y = data.data.features, data.data.targets

# Keep the descriptive information that ships alongside the data.
metadata, variables = data.metadata, data.variables

In [3]:
# Save the combined features + targets locally, unless a copy already exists.
raw_data_path = "../data/raw_data.csv"
df = pd.concat([X, y], axis=1)
if not os.path.exists(raw_data_path):
    # to_csv does not create missing parent directories, so make sure
    # ../data exists before writing (no-op when it is already there).
    os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
    df.to_csv(raw_data_path, index=False)

# Data Preprocessing

We are working with the following attributes:

In [13]:
# Show only the columns that describe each attribute.
overview_columns = ["description", "units", "type"]
print(variables[overview_columns])

                                          description   units         type
0                 Status of existing checking account    None  Categorical
1                                            Duration  months      Integer
2                                      Credit history    None  Categorical
3                                             Purpose    None  Categorical
4                                       Credit amount    None      Integer
5                               Savings account/bonds    None  Categorical
6                            Present employment since    None  Categorical
7   Installment rate in percentage of disposable i...    None      Integer
8                             Personal status and sex    None  Categorical
9                          Other debtors / guarantors    None  Categorical
10                            Present residence since    None      Integer
11                                           Property    None  Categorical
12                       

The dataset's variable documentation, printed below, explains what each attribute represents and will help us decide how to encode the categorical variables:

In [27]:
# Print the free-text documentation of every attribute and its category codes.
variable_info = metadata.additional_info.variable_info
print(variable_info)

Attribute 1:  (qualitative)      
 Status of existing checking account
             A11 :      ... <    0 DM
	       A12 : 0 <= ... <  200 DM
	       A13 :      ... >= 200 DM / salary assignments for at least 1 year
               A14 : no checking account

Attribute 2:  (numerical)
	      Duration in month

Attribute 3:  (qualitative)
	      Credit history
	      A30 : no credits taken/ all credits paid back duly
              A31 : all credits at this bank paid back duly
	      A32 : existing credits paid back duly till now
              A33 : delay in paying off in the past
	      A34 : critical account/  other credits existing (not at this bank)

Attribute 4:  (qualitative)
	      Purpose
	      A40 : car (new)
	      A41 : car (used)
	      A42 : furniture/equipment
	      A43 : radio/television
	      A44 : domestic appliances
	      A45 : repairs
	      A46 : education
	      A47 : (vacation - does not exist?)
	      A48 : retraining
	      A49 : business
	      A410 : others

A

First, we will check whether we need to address (remove or otherwise handle) outliers in the dataset. To do this, we will visualize the numerical attributes using boxplots and then examine the categorical attributes.

In [36]:
# Numerical attributes are the ones whose declared type is "Integer".
is_integer = variables["type"] == "Integer"
numerical_attributes = variables.loc[is_integer, "name"]

# Restrict the feature table to those columns and show the result.
numerical_df = X[numerical_attributes]
print(numerical_df)

     Attribute2  Attribute5  Attribute8  Attribute11  Attribute13  \
0             6        1169           4            4           67   
1            48        5951           2            2           22   
2            12        2096           2            3           49   
3            42        7882           2            4           45   
4            24        4870           3            4           53   
..          ...         ...         ...          ...          ...   
995          12        1736           3            4           31   
996          30        3857           4            4           40   
997          12         804           4            4           38   
998          45        1845           4            4           23   
999          45        4576           3            4           27   

     Attribute16  Attribute18  
0              2            1  
1              1            1  
2              1            2  
3              1            2  
4          

For this project, we will be training 3 different models:
1. Linear Model - Logistic Regression
2. Tree-Based Model - Random Forest
3. Neural Network Model

We will preprocess our data differently based on the type of model being used. For our linear and neural network models, we will rely on one-hot encoding. For our tree-based model, we will rely on label encoding. However, in order to assess the performance of different models across different folds while performing k-fold cross validation, we need to ensure we are using the same data for each model as well as applying the appropriate encoding, normalization

In [16]:
# List the names of all metadata fields, one per line.
for field_name in metadata.keys():
    print(field_name)

uci_id
name
repository_url
data_url
abstract
area
tasks
characteristics
num_instances
num_features
feature_types
demographics
target_col
index_col
has_missing_values
missing_values_symbol
year_of_dataset_creation
last_updated
dataset_doi
creators
intro_paper
additional_info


In [17]:
# List the names of all metadata fields, one per line.
# NOTE(review): this cell repeats the previous one verbatim.
for field_name in metadata.keys():
    print(field_name)

uci_id
name
repository_url
data_url
abstract
area
tasks
characteristics
num_instances
num_features
feature_types
demographics
target_col
index_col
has_missing_values
missing_values_symbol
year_of_dataset_creation
last_updated
dataset_doi
creators
intro_paper
additional_info


## Data Cleaning
Cleaning: Handling missing values (imputation, removal), correcting inconsistencies (e.g., fixing typos), and ensuring the correct data types.

In [34]:
# Report whether the dataset declares missing values. The "nothing to impute"
# conclusion is only printed when the metadata flag actually says "no";
# previously it was printed regardless of the flag's value.
if metadata.has_missing_values == "no":
    print(f"The data has {metadata.has_missing_values} missing values. Therefore, we will not have to impute or remove these values.")
else:
    print("The data has missing values. We will have to impute or remove these values.")

The data has no missing values. Therefore, we will not have to impute or remove these values.


In [31]:
# List the names of all metadata fields, one per line.
# The loop body previously read `print(k.addotopma+)`, which is a syntax error;
# the recorded output shows the cell printing each key, so that is restored.
for k in metadata.keys():
    print(k)

uci_id
name
repository_url
data_url
abstract
area
tasks
characteristics
num_instances
num_features
feature_types
demographics
target_col
index_col
has_missing_values
missing_values_symbol
year_of_dataset_creation
last_updated
dataset_doi
creators
intro_paper
additional_info


In [25]:
# Check which type `variables` is (the recorded output shows a pandas DataFrame).
print(type(variables))

<class 'pandas.core.frame.DataFrame'>


In [None]:
# metadata 
# print(statlog_german_credit_data.metadata)
  
# variable information 
# print(statlog_german_credit_data.variables) 

In [16]:
# Check which type the targets `y` are (the recorded output shows a pandas DataFrame).
print(type(y))

<class 'pandas.core.frame.DataFrame'>


In [8]:
# List the names of all metadata fields on the fetched dataset, one per line.
for field_name in data.metadata.keys():
    print(field_name)

uci_id
name
repository_url
data_url
abstract
area
tasks
characteristics
num_instances
num_features
feature_types
demographics
target_col
index_col
has_missing_values
missing_values_symbol
year_of_dataset_creation
last_updated
dataset_doi
creators
intro_paper
additional_info


In [9]:
# Iterating a DataFrame yields its column labels; print each one.
for column_name in data.variables.columns:
    print(column_name)

name
role
type
demographic
description
units
missing_values


In [13]:
# Show the human-readable description of every variable.
descriptions = data.variables["description"]
print(descriptions)

0                   Status of existing checking account
1                                              Duration
2                                        Credit history
3                                               Purpose
4                                         Credit amount
5                                 Savings account/bonds
6                              Present employment since
7     Installment rate in percentage of disposable i...
8                               Personal status and sex
9                            Other debtors / guarantors
10                              Present residence since
11                                             Property
12                                                  Age
13                              Other installment plans
14                                              Housing
15              Number of existing credits at this bank
16                                                  Job
17    Number of people being liable to provide m