In [21]:
import pandas as pd

# Absolute location of the German credit CSV on the notebook host
csv_path = '/content/german_credit_data.csv'

# Read the raw CSV into a pandas DataFrame
df = pd.read_csv(csv_path)

# Render the first 5 rows, then print each column's dtype and non-null count
display(df.head(n=5))
df.info()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1000 non-null   int64 
 1   Age               1000 non-null   int64 
 2   Sex               1000 non-null   object
 3   Job               1000 non-null   int64 
 4   Housing           1000 non-null   object
 5   Saving accounts   817 non-null    object
 6   Checking account  606 non-null    object
 7   Credit amount     1000 non-null   int64 
 8   Duration          1000 non-null   int64 
 9   Purpose           1000 non-null   object
 10  Risk              1000 non-null   object
dtypes: int64(5), object(6)
memory usage: 86.1+ KB


In [22]:
# Remove the leftover CSV index column; errors='ignore' makes this a no-op
# when the column has already been dropped.
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# Turn both the literal string 'NA' and true NaNs into an explicit
# 'unknown' category in each of the two account columns.
account_columns = ['Saving accounts', 'Checking account']
for column in account_columns:
    df[column] = df[column].replace('NA', 'unknown').fillna('unknown')

# Inspect the result: category counts per account column, then the frame summary
print(df['Saving accounts'].value_counts())
print("-" * 20)
print(df['Checking account'].value_counts())
df.info()


Saving accounts
little        603
unknown       183
moderate      103
quite rich     63
rich           48
Name: count, dtype: int64
--------------------
Checking account
unknown     394
little      274
moderate    269
rich         63
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   1000 non-null   object
 5   Checking account  1000 non-null   object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# 1. Convert the target to numbers
# We want to catch the "bad" loans, so 'bad' is the positive class (1) and 'good' is 0.
risk_codes = {'bad': 1, 'good': 0}
# Values that are already encoded pass through unchanged, so re-running this
# cell is safe. A plain .map(dict) would turn an already-numeric column into
# all NaN on the second run.
df['Risk'] = df['Risk'].map(lambda label: risk_codes.get(label, label))

# Fail fast on missing or unexpected labels instead of silently training on them
unexpected_labels = set(df['Risk'].unique()) - set(risk_codes.values())
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'Risk' column: {unexpected_labels}")

# Separate features (X) from target (y)
y = df['Risk']
X = df.drop(columns='Risk')

# 2. Split into Training (80%) and Testing (20%)
# stratify=y keeps the same ratio of good/bad loans in both sets;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Define the columns for the Pipeline
# 'Job' is stored as an integer but is listed with the categorical features,
# so it is one-hot encoded rather than scaled.
numeric_features = ['Age', 'Credit amount', 'Duration']
categorical_features = ['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']

# 4. Build the Preprocessor
# NOTE(review): with drop='first' combined with handle_unknown='ignore', a
# category not seen during fit is encoded as all zeros, which is the same
# encoding as the dropped first category -- confirm this is acceptable.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features), # Scales numbers
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features) # Converts text to 1s and 0s
    ])

print("Data split into training and testing sets!")
print(f"Training data shape: {X_train.shape}")
print("Preprocessor pipeline is ready to go.")

Data split into training and testing sets!
Training data shape: (800, 9)
Preprocessor pipeline is ready to go.


In [24]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from sklearn.base import clone

# 1. Create the full ML Pipelines
# Each pipeline links preprocessing directly to one algorithm. Pipeline does
# not copy its steps, so each pipeline gets its own unfitted clone of the
# preprocessor; otherwise fitting one pipeline would refit the transformer
# object embedded in the other.
log_reg_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    # NOTE(review): unlike the random forest below, this classifier does not
    # set class_weight='balanced' -- confirm the asymmetry is intentional.
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

# 2. Train the Models (preprocessing is fitted on the training split only)
print("Training Logistic Regression...")
log_reg_pipeline.fit(X_train, y_train)

print("Training Random Forest...")
rf_pipeline.fit(X_train, y_train)

# 3. Make Predictions on the held-out Test Data
log_reg_predictions = log_reg_pipeline.predict(X_test)
rf_predictions = rf_pipeline.predict(X_test)

# 4. Evaluate and Compare: one banner + classification report per model
for title, predictions in (
    ("LOGISTIC REGRESSION RESULTS", log_reg_predictions),
    ("RANDOM FOREST RESULTS", rf_predictions),
):
    print("\n" + "="*40)
    print(title)
    print("="*40)
    print(classification_report(y_test, predictions))

Training Logistic Regression...
Training Random Forest...

LOGISTIC REGRESSION RESULTS
              precision    recall  f1-score   support

           0       0.76      0.89      0.82       140
           1       0.58      0.35      0.44        60

    accuracy                           0.73       200
   macro avg       0.67      0.62      0.63       200
weighted avg       0.71      0.73      0.71       200


RANDOM FOREST RESULTS
              precision    recall  f1-score   support

           0       0.77      0.91      0.84       140
           1       0.65      0.37      0.47        60

    accuracy                           0.75       200
   macro avg       0.71      0.64      0.65       200
weighted avg       0.73      0.75      0.73       200



In [20]:
import joblib

# Save the random-forest pipeline (preprocessing + classifier) to a file.
# rf_pipeline is created and fitted in the training cell, so that cell must
# have been run in the current kernel session before this one.
# The path is defined once so the file written and the message cannot drift apart.
MODEL_PATH = 'credit_risk_model.pkl'
joblib.dump(rf_pipeline, MODEL_PATH)
print(f"Model saved successfully as {MODEL_PATH}!")

Model saved successfully as credit_risk_model.pkl!
