In [1]:
# Project roadmap: data analysis -> feature engineering -> model -> web app

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Notebook-wide display configuration.
# seaborn: use the 'whitegrid' theme for all plots.
sns.set_style(style='whitegrid')
# pandas: no limit on the number of columns shown when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

In [3]:
# Load the German credit data CSV; the relative path is resolved against the
# notebook's working directory.
# NOTE(review): the outputs below show an 'Unnamed: 0' column running 0..999, which
# looks like a row index that was saved into the file — confirm, then consider
# reading with index_col=0 or dropping that column.
df = pd.read_csv('german_credit_data.csv')

In [6]:
# Sanity check: peek at the first five rows to confirm the CSV parsed as expected
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [5]:
# Summary statistics for the Age column: count, mean, std, min, quartiles and max.
df["Age"].describe()

count    1000.000000
mean       35.546000
std        11.375469
min        19.000000
25%        27.000000
50%        33.000000
75%        42.000000
max        75.000000
Name: Age, dtype: float64

In [9]:
# Class balance of the Risk label: the output below shows 700 'good' vs 300 'bad'
# (70/30), so the imbalance has to be accounted for at modelling time
# (e.g. stratified splitting or class weights).
df["Risk"].value_counts()

Risk
good    700
bad     300
Name: count, dtype: int64

In [10]:
# Plan: drop rows with NaN values in 'Saving accounts' & 'Checking account' (the drop itself happens further down, after the missing-value check)

In [11]:
# Frame dimensions as (rows, columns) at this point in the notebook.
df.shape

(1000, 11)

In [13]:
# Per-column dtype and non-null count — a quick way to spot columns with missing data.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Unnamed: 0        1000 non-null   int64
 1   Age               1000 non-null   int64
 2   Sex               1000 non-null   str  
 3   Job               1000 non-null   int64
 4   Housing           1000 non-null   str  
 5   Saving accounts   817 non-null    str  
 6   Checking account  606 non-null    str  
 7   Credit amount     1000 non-null   int64
 8   Duration          1000 non-null   int64
 9   Purpose           1000 non-null   str  
 10  Risk              1000 non-null   str  
dtypes: int64(5), str(6)
memory usage: 86.1 KB


In [14]:
# Describe every column, numeric and string alike, then flip the table so that each
# row corresponds to one column of df and the statistics run across.
df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Unnamed: 0,1000.0,,,,499.5,288.819436,0.0,249.75,499.5,749.25,999.0
Age,1000.0,,,,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
Sex,1000.0,2.0,male,690.0,,,,,,,
Job,1000.0,,,,1.904,0.653614,0.0,2.0,2.0,2.0,3.0
Housing,1000.0,3.0,own,713.0,,,,,,,
Saving accounts,817.0,4.0,little,603.0,,,,,,,
Checking account,606.0,3.0,little,274.0,,,,,,,
Credit amount,1000.0,,,,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
Duration,1000.0,,,,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
Purpose,1000.0,8.0,car,337.0,,,,,,,


In [15]:
# Distinct values present in the Job column (integer codes).
# NOTE(review): what each code stands for is not shown here — check the data dictionary.
df["Job"].unique()

array([2, 1, 3, 0])

In [19]:
# Missing-value count for each column (NaN cells summed down the rows)
df.isna().sum(axis=0)

Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [20]:
# Count fully duplicated records.
# 'Unnamed: 0' is a per-row running number (0..999 in the summary above), so leaving
# it in the comparison makes every row unique and the check could never report a
# duplicate. Exclude it so only the actual data columns are compared.
# errors="ignore" keeps the cell working if that column has already been removed.
df.drop(columns=["Unnamed: 0"], errors="ignore").duplicated().sum()

np.int64(0)

In [21]:
# Remove every row that has a NaN in any column, then renumber the index from 0.
# Per the missing-value counts above, only 'Saving accounts' and 'Checking account'
# contain NaN, so those two columns decide which rows survive.
# NOTE(review): the frame displayed below ends at index 520, i.e. 521 of the 1000 rows
# remain — nearly half the data is discarded. If NaN here means "no such account",
# keeping it as its own category may be preferable — confirm against the data dictionary.
# Overwriting df also means the unfiltered frame is only recoverable by re-reading the CSV.
df = df.dropna(axis=0, how="any").reset_index(drop=True)

In [22]:
# Display the frame after the NaN rows were dropped.
# The row labels are renumbered from 0, while 'Unnamed: 0' appears to still carry
# each row's original position in the CSV.
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
1,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
2,4,53,male,2,free,little,little,4870,24,car,bad
3,7,35,male,3,rent,little,moderate,6948,36,car,good
4,9,28,male,3,own,little,moderate,5234,30,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
517,989,48,male,1,own,little,moderate,1743,24,radio/TV,good
518,993,30,male,3,own,little,little,3959,36,furniture/equipment,good
519,996,40,male,3,own,little,little,3857,30,car,good
520,998,23,male,2,free,little,little,1845,45,radio/TV,bad
