## Data Preprocessing

In [1]:
import os
import requests
# import numpy as np
import pandas as pd
# from io import BytesIO
from scipy.io import arff
# from category_encoders import TargetEncoder, LeaveOneOutEncoder
# from sklearn.preprocessing import LabelEncoder, LabelBinarizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# consts
TUNGUZ_DATASET_REPO_URL = 'https://raw.githubusercontent.com/tunguz/TabularBenchmarks/main/datasets/credit-g/input/'
ARFF_DATASET = 'dataset_31_credit-g.arff'

In [3]:
url = f'{TUNGUZ_DATASET_REPO_URL}{ARFF_DATASET}'

if os.path.exists(ARFF_DATASET):
    with open(ARFF_DATASET, 'rt') as f:
        data, meta = arff.loadarff(f)
else:
    response = requests.get(url)
    if response.status_code == 200:
        with open(ARFF_DATASET, 'wb') as f:
            f.write(response.content)
        # Reopen the file in text mode for reading with arff.loadarff
        with open(ARFF_DATASET, 'rt') as f:
            data, meta = arff.loadarff(f)
    else:
        print(f"Couldn't download the file: {url}")

In [4]:
data[:5]

array([(b'<0',  6., b'critical/other existing credit', b'radio/tv', 1169., b'no known savings', b'>=7', 4., b'male single', b'none', 4., b'real estate', 67., b'none', b'own', 2., b'skilled', 1., b'yes', b'yes', b'good'),
       (b'0<=X<200', 48., b'existing paid', b'radio/tv', 5951., b'<100', b'1<=X<4', 2., b'female div/dep/mar', b'none', 2., b'real estate', 22., b'none', b'own', 1., b'skilled', 1., b'none', b'yes', b'bad'),
       (b'no checking', 12., b'critical/other existing credit', b'education', 2096., b'<100', b'4<=X<7', 2., b'male single', b'none', 3., b'real estate', 49., b'none', b'own', 1., b'unskilled resident', 2., b'none', b'yes', b'good'),
       (b'<0', 42., b'existing paid', b'furniture/equipment', 7882., b'<100', b'4<=X<7', 2., b'male single', b'guarantor', 4., b'life insurance', 45., b'none', b'for free', 1., b'skilled', 2., b'none', b'yes', b'good'),
       (b'<0', 24., b'delayed previously', b'new car', 4870., b'<100', b'1<=X<4', 3., b'male single', b'none', 4., b'

In [5]:
meta

Dataset: german_credit
	checking_status's type is nominal, range is ('<0', '0<=X<200', '>=200', 'no checking')
	duration's type is numeric
	credit_history's type is nominal, range is ('no credits/all paid', 'all paid', 'existing paid', 'delayed previously', 'critical/other existing credit')
	purpose's type is nominal, range is ('new car', 'used car', 'furniture/equipment', 'radio/tv', 'domestic appliance', 'repairs', 'education', 'vacation', 'retraining', 'business', 'other')
	credit_amount's type is numeric
	savings_status's type is nominal, range is ('<100', '100<=X<500', '500<=X<1000', '>=1000', 'no known savings')
	employment's type is nominal, range is ('unemployed', '<1', '1<=X<4', '4<=X<7', '>=7')
	installment_commitment's type is numeric
	personal_status's type is nominal, range is ('male div/sep', 'female div/dep/mar', 'male single', 'male mar/wid', 'female single')
	other_parties's type is nominal, range is ('none', 'co applicant', 'guarantor')
	residence_since's type is nume

In [6]:
arff_data = pd.DataFrame(data)
arff_data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,b'<0',6.0,b'critical/other existing credit',b'radio/tv',1169.0,b'no known savings',b'>=7',4.0,b'male single',b'none',...,b'real estate',67.0,b'none',b'own',2.0,b'skilled',1.0,b'yes',b'yes',b'good'
1,b'0<=X<200',48.0,b'existing paid',b'radio/tv',5951.0,b'<100',b'1<=X<4',2.0,b'female div/dep/mar',b'none',...,b'real estate',22.0,b'none',b'own',1.0,b'skilled',1.0,b'none',b'yes',b'bad'
2,b'no checking',12.0,b'critical/other existing credit',b'education',2096.0,b'<100',b'4<=X<7',2.0,b'male single',b'none',...,b'real estate',49.0,b'none',b'own',1.0,b'unskilled resident',2.0,b'none',b'yes',b'good'
3,b'<0',42.0,b'existing paid',b'furniture/equipment',7882.0,b'<100',b'4<=X<7',2.0,b'male single',b'guarantor',...,b'life insurance',45.0,b'none',b'for free',1.0,b'skilled',2.0,b'none',b'yes',b'good'
4,b'<0',24.0,b'delayed previously',b'new car',4870.0,b'<100',b'1<=X<4',3.0,b'male single',b'none',...,b'no known property',53.0,b'none',b'for free',2.0,b'skilled',2.0,b'none',b'yes',b'bad'


In [35]:
for col in arff_data.columns:
    if len(arff_data[col].unique()) <= 50:
        print(arff_data[col].unique())
        print(f'{col} - dtype: {arff_data[col].dtype}', end='\n\n')
    else:
        print(arff_data[col].describe(), end='\n\n')

[b'<0' b'0<=X<200' b'no checking' b'>=200']
checking_status - dtype: object

[ 6. 48. 12. 42. 24. 36. 30. 15.  9. 10.  7. 60. 18. 45. 11. 27.  8. 54.
 20. 14. 33. 21. 16.  4. 47. 13. 22. 39. 28.  5. 26. 72. 40.]
duration - dtype: float64

[b'critical/other existing credit' b'existing paid' b'delayed previously'
 b'no credits/all paid' b'all paid']
credit_history - dtype: object

[b'radio/tv' b'education' b'furniture/equipment' b'new car' b'used car'
 b'business' b'domestic appliance' b'repairs' b'other' b'retraining']
purpose - dtype: object

count     1000.000000
mean      3271.258000
std       2822.736876
min        250.000000
25%       1365.500000
50%       2319.500000
75%       3972.250000
max      18424.000000
Name: credit_amount, dtype: float64

[b'no known savings' b'<100' b'500<=X<1000' b'>=1000' b'100<=X<500']
savings_status - dtype: object

[b'>=7' b'1<=X<4' b'4<=X<7' b'unemployed' b'<1']
employment - dtype: object

[4. 2. 3. 1.]
installment_commitment - dtype: float64

[b'ma

### Attributte Description

| attribute | description | nominal | ordinal | ratio |
| --- | --- | --- | --- | --- |
| `checking_status` |  Status of existing checking account. **People with lower balances might be seen as higher risk.** | ✔ | there are order | - |
| `duration` | Duration in month. **Longer loan durations might be riskier as they offer more opportunities for the borrower's circumstances to change.** | - | long or short | ✔ |
| `credit_history` | The applicant’s track record in terms of past loans. **A history of delinquency could point to higher risk.** | ✔ | - | - |
| `purpose` | Purpose for the loan. **Certain loan purposes might be associated with higher risk than others (e.g., starting a new business might be riskier than buying a car).** | ✔ | - | - |
| `credit_amount` | The amount of loan in question. **Larger loans might be considered higher risk.** | - | small or big | ✔ |
| `savings_status` | Savings account/bonds. **Those with more savings might be considered lower risk.** | ✔ | there are order | - |
| `employment` | Present employment since. **Longer employment might be seen as indicating more stable income.** | ✔ | there are order | - |
| `installment_commitment` | Installment rate in percentage of disposable income. **Higher percentages could indicate financial strain and higher risk.** | - | low or high | ✔ |
| `personal_status` | Personal status and sex includes marital status and gender. **These factors could have complex interactions with risk.** | ✔ | - | - |
| `other_parties` | Other debtors / guarantors that indicates if there are other people who share the responsibility of the loan. **Guarantors can decrease the risk.** | ✔ | - | - |
| `residence_since` | How long the applicant has lived at their current address. **Longer times could indicate more stability.** | - | long or short | ✔ |
| `property_magnitude` | Describes the types of property the person owns. **Describes the types of property the person owns.** | ✔ | - | - |
| `age` | The applicant’s age. **Risk can vary with age due to factors like income stability, health status, etc.** | - | older or younger | ✔ |
| `other_payment_plans` | Other installment plans that indicates if the borrower has other ongoing loans. **Additional financial obligations can increase risk.** | ✔ | - | - |
| `housing` | The type of housing the applicant lives in. **Those who own their homes might be seen as more stable/less risky.** | ✔ | - | - |
| `existing_credits` | Number of existing credits at this bank. **Multiple loans could indicate higher risk.** | - | few or many | ✔ |
| `job` | The applicant's job status and type. **Certain jobs might be seen as more stable/less risky.** | ✔ | - | - |
| `num_dependents` | Number of people being liable to provide maintenance for. **The more dependents person has, the more of their income is likely already spoken for, which can make it riskier for the bank to lend them money.** | - | - | ✔ |
| `own_telephone` | Indicates if the applicant has a telephone registered under their name. **This could be seen as a sign of stability.** | ✔ | - | - |
| `foreign_worker` | Indicates if the applicant is a foreign worker. **Foreign workers might be seen as higher risk due to potential job and legal uncertainties.** | ✔ | - | - |
| `class` | Label that indicates whether the applicant is good or bad for the credit. | ✔ | - | - |

In [7]:
arff_data.convert_dtypes()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,b'<0',6,b'critical/other existing credit',b'radio/tv',1169,b'no known savings',b'>=7',4,b'male single',b'none',...,b'real estate',67,b'none',b'own',2,b'skilled',1,b'yes',b'yes',b'good'
1,b'0<=X<200',48,b'existing paid',b'radio/tv',5951,b'<100',b'1<=X<4',2,b'female div/dep/mar',b'none',...,b'real estate',22,b'none',b'own',1,b'skilled',1,b'none',b'yes',b'bad'
2,b'no checking',12,b'critical/other existing credit',b'education',2096,b'<100',b'4<=X<7',2,b'male single',b'none',...,b'real estate',49,b'none',b'own',1,b'unskilled resident',2,b'none',b'yes',b'good'
3,b'<0',42,b'existing paid',b'furniture/equipment',7882,b'<100',b'4<=X<7',2,b'male single',b'guarantor',...,b'life insurance',45,b'none',b'for free',1,b'skilled',2,b'none',b'yes',b'good'
4,b'<0',24,b'delayed previously',b'new car',4870,b'<100',b'1<=X<4',3,b'male single',b'none',...,b'no known property',53,b'none',b'for free',2,b'skilled',2,b'none',b'yes',b'bad'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,b'no checking',12,b'existing paid',b'furniture/equipment',1736,b'<100',b'4<=X<7',3,b'female div/dep/mar',b'none',...,b'real estate',31,b'none',b'own',1,b'unskilled resident',1,b'none',b'yes',b'good'
996,b'<0',30,b'existing paid',b'used car',3857,b'<100',b'1<=X<4',4,b'male div/sep',b'none',...,b'life insurance',40,b'none',b'own',1,b'high qualif/self emp/mgmt',1,b'yes',b'yes',b'good'
997,b'no checking',12,b'existing paid',b'radio/tv',804,b'<100',b'>=7',4,b'male single',b'none',...,b'car',38,b'none',b'own',1,b'skilled',1,b'none',b'yes',b'good'
998,b'<0',45,b'existing paid',b'radio/tv',1845,b'<100',b'1<=X<4',4,b'male single',b'none',...,b'no known property',23,b'none',b'for free',1,b'skilled',1,b'yes',b'yes',b'bad'


In [9]:
arff_data.iloc[0]

checking_status                                       b'<0'
duration                                                6.0
credit_history            b'critical/other existing credit'
purpose                                         b'radio/tv'
credit_amount                                        1169.0
savings_status                          b'no known savings'
employment                                           b'>=7'
installment_commitment                                  4.0
personal_status                              b'male single'
other_parties                                       b'none'
residence_since                                         4.0
property_magnitude                           b'real estate'
age                                                    67.0
other_payment_plans                                 b'none'
housing                                              b'own'
existing_credits                                        2.0
job                                     

In [33]:
arff_data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,b'<0',6.0,b'critical/other existing credit',b'radio/tv',1169.0,b'no known savings',b'>=7',4.0,b'male single',b'none',...,b'real estate',67.0,b'none',b'own',2.0,b'skilled',1.0,b'yes',b'yes',b'good'
1,b'0<=X<200',48.0,b'existing paid',b'radio/tv',5951.0,b'<100',b'1<=X<4',2.0,b'female div/dep/mar',b'none',...,b'real estate',22.0,b'none',b'own',1.0,b'skilled',1.0,b'none',b'yes',b'bad'
2,b'no checking',12.0,b'critical/other existing credit',b'education',2096.0,b'<100',b'4<=X<7',2.0,b'male single',b'none',...,b'real estate',49.0,b'none',b'own',1.0,b'unskilled resident',2.0,b'none',b'yes',b'good'
3,b'<0',42.0,b'existing paid',b'furniture/equipment',7882.0,b'<100',b'4<=X<7',2.0,b'male single',b'guarantor',...,b'life insurance',45.0,b'none',b'for free',1.0,b'skilled',2.0,b'none',b'yes',b'good'
4,b'<0',24.0,b'delayed previously',b'new car',4870.0,b'<100',b'1<=X<4',3.0,b'male single',b'none',...,b'no known property',53.0,b'none',b'for free',2.0,b'skilled',2.0,b'none',b'yes',b'bad'


## Exploratory Data Analysis (EDA)

## Feature Engineering

## Modelling

## Evaluation 

## Future Works

1. Although we utilized the original .arrf dataset which contains both numerical and categorical data for our analysis, we have only explored feature engineering based on this version. In future works, it could be beneficial to compare the results with other versions of the dataset, such as the numerical version.

1. As discussed in the [Attribute Description](#attribute-description), determining the optimal combination of features is crucial. The way we perceive data - whether it's nominal, ordinal, or ratio - can significantly affect the quality of features for model training. Therefore, exploring various feature combinations to improve model performance will be an interesting future direction.

1. Our current work primarily focuses on:
    - **Descriptive Statistics**: We provided summary statistics including means, medians, ranges, and standard deviations, and visualized the data.
    - **Exploratory Data Analysis (EDA)**: We generated and answered our own questions to better understand the dataset.
    - **Predictive Analysis**: We constructed a model to predict future outcomes, in this case, the potential future applicants. 

1. However, there are several important areas we have not yet addressed:
    - **Inferential Statistics**: This involves the use of statistical tests to draw conclusions about a larger population based on a sample. For example, we could use a t-test to see if the means of two groups are significantly different.
    - **Hypothesis Testing**: We haven't tested any assumptions about specific population parameters.
    - **Prescriptive Analysis**: While our predictive analysis focuses on forecasting future outcomes, prescriptive analysis suggests actions that could be taken to benefit from these predictions. This could be a valuable addition to our analysis in the future.