In [1]:
import pandas as pd

In [2]:
# Importing the data
df_credit = pd.read_csv("german_credit_data.csv", index_col=0)
df_credit

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,low
1,22,female,2,own,little,moderate,5951,48,radio/TV,high
2,49,male,1,own,little,,2096,12,education,low
3,45,male,2,free,little,little,7882,42,furniture/equipment,low
4,53,male,2,free,little,little,4870,24,car,high
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,,1736,12,furniture/equipment,low
996,40,male,3,own,little,little,3857,30,car,low
997,38,male,2,own,little,,804,12,radio/TV,low
998,23,male,2,free,little,little,1845,45,radio/TV,high


In [4]:
# Remove missing values
df_credit.dropna()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
1,22,female,2,own,little,moderate,5951,48,radio/TV,high
3,45,male,2,free,little,little,7882,42,furniture/equipment,low
4,53,male,2,free,little,little,4870,24,car,high
7,35,male,3,rent,little,moderate,6948,36,car,low
9,28,male,3,own,little,moderate,5234,30,car,high
...,...,...,...,...,...,...,...,...,...,...
989,48,male,1,own,little,moderate,1743,24,radio/TV,low
993,30,male,3,own,little,little,3959,36,furniture/equipment,low
996,40,male,3,own,little,little,3857,30,car,low
998,23,male,2,free,little,little,1845,45,radio/TV,high


In [5]:
# Creating the X and y
X = df_credit[["Age", "Job", "Credit amount", "Duration"]].values
y = df_credit["Risk"].values

In [6]:
# Label encoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [7]:
import joblib

joblib.dump(encoder, "saved_models/encoder.pkl")

['saved_models/encoder.pkl']

In [8]:
# split test train
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
# train model
# from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier = XGBClassifier(learning_rate=0.05, 
                    n_estimators=200, 
                    objective='binary:logistic',
                    eval_metric = "logloss",
                    nthread=2, 
                    random_state=2
                   )
classifier.fit(X_train, y_train)

In [10]:
# Test model
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score

In [11]:
# Confusion matrix and classification report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score
print("Confusion matrix: \n", confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))

Confusion matrix: 
 [[ 12  47]
 [ 14 127]]


              precision    recall  f1-score   support

           0       0.46      0.20      0.28        59
           1       0.73      0.90      0.81       141

    accuracy                           0.69       200
   macro avg       0.60      0.55      0.54       200
weighted avg       0.65      0.69      0.65       200



In [12]:
# Save Model
import joblib
joblib.dump(classifier, "saved_models/model.pkl")

['saved_models/model.pkl']

In [13]:
# Make predictions
# Read models
classifier_loaded = joblib.load("saved_models/model.pkl")
encoder_loaded = joblib.load("saved_models/encoder.pkl")

In [14]:
# Prediction set
X_manual_test = [[24, 3, 20000, 36]]
print("Test prediction with parameters (Age, Job, Credit amount, Duration): ", X_manual_test)

prediction = encoder_loaded.inverse_transform(classifier.predict(X_manual_test))
print("Predicted credit risk:", prediction)

Test prediction with parameters (Age, Job, Credit amount, Duration):  [[24, 3, 20000, 36]]
Predicted credit risk: ['high']
