In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import pickle

In [2]:
df=pd.read_csv('german_credit_data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad
999,999,27,male,2,own,moderate,moderate,4576,45,car,good


In [5]:
df['Risk'].value_counts()

Risk
good    700
bad     300
Name: count, dtype: int64

In [6]:
df.isna().sum()

Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [7]:
df.fillna('Unknown',inplace=True)

In [8]:
df.isna().sum()

Unnamed: 0          0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
Risk                0
dtype: int64

In [9]:
df.shape

(1000, 11)

In [10]:
for i in df.columns:
    print(f'No of unique value in {i}: {df[i].nunique()}')

No of unique value in Unnamed: 0: 1000
No of unique value in Age: 53
No of unique value in Sex: 2
No of unique value in Job: 4
No of unique value in Housing: 3
No of unique value in Saving accounts: 5
No of unique value in Checking account: 4
No of unique value in Credit amount: 921
No of unique value in Duration: 33
No of unique value in Purpose: 8
No of unique value in Risk: 2


In [11]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [12]:
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   1000 non-null   object
 5   Checking account  1000 non-null   object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB


In [14]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,Unknown,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,Unknown,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [15]:
x=df.drop(columns=['Risk'])
y=df['Risk']

In [16]:
le_y=LabelEncoder()
y_encoded=le_y.fit_transform(y)

In [17]:
x_trian,x_test,y_train,y_test=train_test_split(x,y_encoded ,test_size=0.3,random_state=100)

In [18]:
numeric_col=df.select_dtypes(include=['number']).drop(columns=['Job']).columns
ohe_col=['Purpose']
Ord_col=df.select_dtypes(include=['category','object']).drop(columns=['Risk']).columns
print(numeric_col,ohe_col,Ord_col)

Index(['Age', 'Credit amount', 'Duration'], dtype='object') ['Purpose'] Index(['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose'], dtype='object')


In [19]:
preprocessor=ColumnTransformer([
    ('num', StandardScaler(), numeric_col),
    ('Onehot', OneHotEncoder(handle_unknown='ignore',sparse_output=False), ohe_col),
    ('Label', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),Ord_col)
])

In [20]:
log_reg_pipeline=Pipeline([
    ('Preprocessor', preprocessor),
    ('pca', PCA(n_components=5)),
    ('classifier', LogisticRegression())
])

In [21]:
log_reg_pipeline.fit(x_trian,y_train)

0,1,2
,steps,"[('Preprocessor', ...), ('pca', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('Onehot', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_components,5
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [22]:
log_reg_pipeline.score(x_trian,y_train)

0.7128571428571429

In [23]:
y_pred=log_reg_pipeline.predict(x_test)

In [24]:
metrics.accuracy_score(y_test,y_pred)

0.6833333333333333

In [25]:
pipeline_dict={
    'Pipeline': log_reg_pipeline,
    'Label_Encoder_y':le_y
}

In [26]:
with open('logreg_pipeline.pkl','wb') as f:
    pickle.dump(pipeline_dict,f)
    

In [27]:
%%writefile German_credit_risk_analysis.py
import numpy as np
import pandas as pd
import streamlit as st
import pickle

@st.cache_resource
def load_model():
    with open('logreg_pipeline.pkl','rb') as f:
        pipeline_dict = pickle.load(f)
    return pipeline_dict['Pipeline'], pipeline_dict['Label_Encoder_y']

pipeline, le_y= load_model()

st.title('German Credit Risk Prediction')
st.markdown("Enter customer details to predict credit risk")

job_options = {
    "unskilled and non-resident": 0,
    "unskilled and resident": 1,
    "skilled": 2,
    "highly skilled": 3
    }

#Create input columns
col1, col2 = st.columns(2)
with col1:
    age_input=st.number_input("Borrower Age", min_value=18, max_value=100, value=30)
    sex_input=st.selectbox("Gender", options=['Male','Female'])

    job_display = st.selectbox("Job Level", options=list(job_options.keys()), index=2)
    job_input = job_options[job_display]

    housing_input = st.selectbox("Housing Status", options=['free','own', 'rent'])
    saving_input = st.selectbox("Saving accounts", options=['Unkown','little', 'moderate', 'quite rich', 'rich'])

with col2:
    with col2:
        checking_input = st.selectbox("Checking account", options=['little', 'moderate', 'rich', 'no data'])
        creditamt_input = st.number_input("Loan Amount", min_value=250, max_value=18000, value=300)
        duration_input = st.slider("Loan Duration (months)", min_value=1, max_value=72, value=12, step=1)
        
    purpose_input = st.selectbox("Purpose", 
                                 options=['car (new)', 'car (used)', 'furniture/equipment',
                                          'radio/TV', 'domestic appliances', 'repairs', 'education',
                                          'vacation/others', 'retraining', 'business'])


input_data = {
    'Age': [age_input],
    'Sex': [sex_input],
    'Job': [job_input],
    'Housing': [housing_input],
    'Saving accounts': [saving_input],
    'Checking account': [checking_input],
    'Credit amount': [creditamt_input],
    'Duration': [duration_input],
    'Purpose': [purpose_input]
    }

input_df = pd.DataFrame(input_data)

if st.button("Predict Risk", type="primary"):
    try:
        # Predict using the pipeline
        prediction = pipeline.predict(input_df)[0]
        probability = pipeline.predict_proba(input_df)[0]
        
        # Decode prediction
        risk_label = le_y.inverse_transform([prediction])[0]
        
        st.success(f"**Predicted Risk: {risk_label.upper()}**")
        
        col_prob1, col_prob2 = st.columns(2)
        with col_prob1:
            good_idx = le_y.transform(['good'])[0]
            st.metric("Good Credit", f"{probability[good_idx]:.1%}")
        with col_prob2:
            bad_idx = le_y.transform(['bad'])[0]
            st.metric("Bad Credit", f"{probability[bad_idx]:.1%}")
            
    
            
    except Exception as e:
        st.error(f"Prediction failed: {str(e)}")
        st.info("Check if input data matches the training data format")



  

Writing German_credit_risk_analysis.py
