In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

In [None]:
class LoanPredictionModel:
    """Loan-status classification pipeline built around an XGBoost model.

    Intended call order (each step relies on state set by the previous one):
    ``load_data`` -> ``preprocess_data`` -> ``split_data`` ->
    ``tune_and_train_xgboost`` -> ``evaluate_model`` / ``predict_new``.

    Only the numeric columns end up as model features: ``split_data`` drops
    the target and every categorical column before splitting.
    """

    def __init__(self, file_path):
        """Store the CSV path and initialise all pipeline state to empty.

        Args:
            file_path: Path of the CSV file read by ``load_data``.
        """
        self.file_path = file_path
        # Raw frame as read from disk (income is imputed in place by preprocess_data).
        self.data = None
        # Working copy after encoding, outlier capping and row filtering.
        self.dataencoded = None
        self.X_train = self.X_test = self.y_train = self.y_test = None
        # Best estimator selected by the grid search; None until training has run.
        self.xgb_model = None
        self.catcols = ['person_gender', 'person_education', 'person_home_ownership',
                        'loan_intent', 'previous_loan_defaults_on_file']
        self.numcols = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate',
                        'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']

    def load_data(self):
        """Read the CSV at ``self.file_path`` into ``self.data`` and print null counts per column."""
        print("Loading data...")
        self.data = pd.read_csv(self.file_path)
        print("Missing value:\n", self.data.isnull().sum())

    def preprocess_data(self):
        """Impute, encode, cap outliers and filter rows; result goes to ``self.dataencoded``.

        Steps, in order:
          1. Fill missing ``person_income`` in ``self.data`` with the column mean.
          2. Label-encode every column in ``self.catcols`` on a copy of the data.
          3. Cap every column in ``self.numcols`` to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
          4. Keep only rows whose encoded ``person_gender`` is 0 or 1.

        The quantiles are computed on the full dataset (this runs before
        ``split_data``), so the caps are derived from test rows as well.
        """
        print("Handling missing values...")
        # Assign the filled Series back instead of calling fillna(inplace=True) on
        # df[col]: the chained in-place form acts on a temporary object and is
        # flagged by pandas as a no-op from pandas 3.0 onwards.
        income_mean = self.data['person_income'].mean()
        self.data['person_income'] = self.data['person_income'].fillna(income_mean)
        self.dataencoded = self.data.copy()

        print("Encoding categorical columns...")
        for col in self.catcols:
            # One encoder per column so no fitted state is shared between columns.
            self.dataencoded[col] = LabelEncoder().fit_transform(self.dataencoded[col])

        print("Detecting and capping outliers...")
        for col in self.numcols:
            q1 = self.dataencoded[col].quantile(0.25)
            q3 = self.dataencoded[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            # Cast to float first so the column dtype is float64 whether or not
            # any value is actually clipped.
            self.dataencoded[col] = (
                self.dataencoded[col].astype(float).clip(lower=lower_bound, upper=upper_bound)
            )

        print("Removing gender anomalies...")
        # NOTE(review): LabelEncoder numbers labels in sorted order, so codes 0 and 1
        # are simply the first two distinct raw values alphabetically. If the raw
        # column contains more than two spellings, confirm that these two codes
        # really are the valid categories and not the anomalies.
        self.dataencoded = self.dataencoded[self.dataencoded['person_gender'].isin([0, 1])]

    def split_data(self):
        """Create an 80/20 train/test split from ``self.dataencoded``.

        Features are all columns except ``loan_status`` and the categorical
        columns; the target is ``loan_status``. ``random_state=42`` makes the
        split reproducible.
        """
        print("Splitting dataset...")
        X = self.dataencoded.drop(columns=['loan_status'] + self.catcols)
        y = self.dataencoded['loan_status']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    def tune_and_train_xgboost(self):
        """Grid-search XGBoost hyperparameters and keep the best estimator.

        Searches 3 x 3 x 3 = 27 parameter combinations with 5-fold
        cross-validation scored by accuracy, stores the winner in
        ``self.xgb_model`` and prints the selected parameters.
        """
        print("Tuning XGBoost model...")
        xgb_params = {
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.2]
        }
        xgb = XGBClassifier(random_state=42)
        xgb_grid = GridSearchCV(xgb, xgb_params, cv=5, scoring='accuracy')
        xgb_grid.fit(self.X_train, self.y_train)
        self.xgb_model = xgb_grid.best_estimator_

        print("Best Parameters:", xgb_grid.best_params_)

    def evaluate_model(self):
        """Print the classification report and accuracy of ``self.xgb_model`` on the test split."""
        print("Evaluating model...")
        preds = self.xgb_model.predict(self.X_test)
        acc = accuracy_score(self.y_test, preds)
        report = classification_report(self.y_test, preds)
        print("XGBoost Performance:")
        print(report)
        print(f"Accuracy: {acc}")

    # --- Inference ---
    # The docstring below spells out the expected input format for callers.
    def predict_new(self, input_data: dict):
        """Predict the loan status for a single new record given as a dictionary.

        Example input::

            {
                'person_age': 35,
                'person_income': 50000,
                'person_emp_exp': 10,
                'loan_amnt': 10000,
                'loan_int_rate': 12.5,
                'loan_percent_income': 0.2,
                'cb_person_cred_hist_length': 4,
                'credit_score': 700
            }

        The input is passed to the model as given; the outlier capping done in
        ``preprocess_data`` is not applied here.

        Args:
            input_data: Mapping from feature name to value. It must contain every
                column of ``self.X_train``; any additional keys are ignored.

        Returns:
            dict: ``{"prediction": int, "probability": float}`` where
            ``probability`` is the model's probability for class index 1,
            rounded to 4 decimals.

        Raises:
            ValueError: If the model has not been trained or no training split exists.
            KeyError: If ``input_data`` lacks one or more required features.
        """
        # Both the fitted model and the training frame (for column names) are required.
        if self.xgb_model is None or self.X_train is None:
            raise ValueError(
                "Model is not ready: run split_data() and tune_and_train_xgboost() "
                "before calling predict_new()."
            )

        # The training frame defines both the feature names and their order.
        required_features = self.X_train.columns.tolist()
        missing = [name for name in required_features if name not in input_data]
        if missing:
            raise KeyError(f"input_data is missing required feature(s): {missing}")

        # One-row frame, columns arranged exactly as during training.
        input_df = pd.DataFrame([input_data])[required_features]
        prediction = self.xgb_model.predict(input_df)[0]
        probability = self.xgb_model.predict_proba(input_df)[0][1]

        # Convert to built-in types so the result is easy to display (e.g. in
        # Streamlit). Cast to Python float *before* rounding: rounding a NumPy
        # float32 and converting afterwards leaves artifacts like 0.12349999696.
        return {
            "prediction": int(prediction),
            "probability": round(float(probability), 4)
        }


➥ Fungsi `predict_new`: memprediksi status pinjaman (`loan_status`) untuk satu data baru yang diberikan dalam bentuk dictionary Python.

In [None]:
# Build the pipeline object for the loan CSV, then read the file and print
# the number of missing values per column.
model = LoanPredictionModel("Dataset_A_loan.csv")
model.load_data()

Loading data...
Missing value:
 person_age                           0
person_gender                        0
person_education                     0
person_income                     2250
person_emp_exp                       0
person_home_ownership                0
loan_amnt                            0
loan_intent                          0
loan_int_rate                        0
loan_percent_income                  0
cb_person_cred_hist_length           0
credit_score                         0
previous_loan_defaults_on_file       0
loan_status                          0
dtype: int64
Handling missing values...
Encoding categorical columns...
Detecting and capping outliers...
Removing gender anomalies...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data['person_income'].fillna(self.data['person_income'].mean(), inplace=True)


In [None]:
# Impute missing income with the mean, label-encode the categorical columns,
# cap numeric outliers at 1.5 * IQR, and filter rows by encoded gender.
model.preprocess_data()

Handling missing values...
Encoding categorical columns...
Detecting and capping outliers...
Removing gender anomalies...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.data['person_income'].fillna(self.data['person_income'].mean(), inplace=True)


In [None]:
# 80/20 train/test split (random_state=42); features exclude loan_status and
# all categorical columns.
model.split_data()

Splitting dataset...


In [None]:
# Grid-search 27 XGBoost configurations with 5-fold CV (the slow step) and keep
# the best estimator on the model object.
model.tune_and_train_xgboost()

Tuning XGBoost model...
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300}
