# Balanced Random Forest

In [1]:
# Import modules
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from pathlib import Path

## Prepare the data

In [2]:
# Read dataset
df = pd.read_csv(Path('../Resources/credit_data.csv'))
df.head()

Unnamed: 0,id,status,duration,credit_history,purpose,amount,savings,employment_duration,installment_rate,other_debtors,...,property,age,other_installment_plans,housing,number_credits,job,people_liable,telephone,foreign_worker,credit_risk
0,1,no checking account,18,all credits at this bank paid back duly,car (used),1049,unknown/no savings account,< 1 yr,< 20,none,...,car or other,21,none,for free,1,skilled employee/official,0 to 2,no,no,good
1,2,no checking account,9,all credits at this bank paid back duly,others,2799,unknown/no savings account,1 <= ... < 4 yrs,25 <= ... < 35,none,...,unknown / no property,36,none,for free,2-3,skilled employee/official,3 or more,no,no,good
2,3,... < 0 DM,12,no credits taken/all credits paid back duly,retraining,841,... < 100 DM,4 <= ... < 7 yrs,25 <= ... < 35,none,...,unknown / no property,23,none,for free,1,unskilled - resident,0 to 2,no,no,good
3,4,no checking account,12,all credits at this bank paid back duly,others,2122,unknown/no savings account,1 <= ... < 4 yrs,20 <= ... < 25,none,...,unknown / no property,39,none,for free,2-3,unskilled - resident,3 or more,no,yes,good
4,5,no checking account,12,all credits at this bank paid back duly,others,2171,unknown/no savings account,1 <= ... < 4 yrs,< 20,none,...,car or other,38,bank,rent,2-3,unskilled - resident,0 to 2,no,yes,good


In [5]:
# Split target column from dataset
y = df['credit_risk']
X = df.drop(columns='credit_risk')

In [6]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [7]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [8]:
# Count distinct values
y_train.value_counts()

good    523
bad     227
Name: credit_risk, dtype: int64

In [10]:
# Scale the data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

---

## BalancedRandomForestClassifier

In [11]:
# Import BalancedRandomForestClassifier from imblearn
from imblearn.ensemble import BalancedRandomForestClassifier

# Instantiate a BalancedRandomForestClassifier instance
model = BalancedRandomForestClassifier()

# Fit the model to the training data
model.fit(X_train_scaled,y_train)

  warn(
  warn(


In [12]:
# Predict labels for testing features
y_pred = model.predict(X_test_scaled)

In [14]:
# Print classification report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         bad       0.89      0.90      0.90        73
        good       0.96      0.95      0.96       177

    accuracy                           0.94       250
   macro avg       0.93      0.93      0.93       250
weighted avg       0.94      0.94      0.94       250

