In [17]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

In [4]:
disp_df = pd.read_parquet("data/disposition.parquet", engine='fastparquet')
client_df = pd.read_parquet("data/client.parquet", engine='fastparquet')
loans_df = pd.read_parquet("data/loan.parquet", engine='fastparquet')

In [12]:
def gen_fico(status):
    if status == "A":
        return np.random.normal(loc=790, scale=75, size=1)[0].round()
    elif status == "B":
        return np.random.normal(loc=760, scale=75, size=1)[0].round()
    elif status == "C":
        return np.random.normal(loc=675, scale=75, size=1)[0].round()
    else:
        return np.random.normal(loc=540, scale=75, size=1)[0].round()

def convert_status(status):
    if status == "A":
        return 0
    elif status == "B":
        return 1
    elif status == "C":
        return 2
    else:
        return 3

In [13]:
all_data = pd.merge(
    loans_df,
    disp_df[(disp_df.type == "Owner")],
    on='account_id',
    how='left'
)
all_data = pd.merge(
    all_data,
    client_df.rename(columns={'fulldate': 'dob'}),
    on='client_id',
    how='left'
)
all_data['loannum'] = all_data['loan_id'].str[1:].apply(pd.to_numeric)
all_data = all_data[(all_data['loannum'] < 7308)]
np.random.seed(9456)
all_data['fico'] = all_data['status'].apply(gen_fico)
all_data['status'] = all_data['status'].apply(convert_status)



print(all_data)

       loan_id account_id    amount  duration  payments  status   fulldate  \
0    L00005657  A00003354    4980.0      12.0     415.0       0 2019-05-07   
1    L00006234  A00006061    5148.0      12.0     429.0       2 2023-03-30   
2    L00006699  A00008330    7656.0      24.0     319.0       0 2019-01-30   
3    L00006688  A00008268    8616.0      24.0     359.0       2 2022-07-29   
4    L00006312  A00006453   10944.0      36.0     304.0       2 2023-03-19   
..         ...        ...       ...       ...       ...     ...        ...   
677  L00005569  A00002936  504000.0      60.0    8400.0       2 2022-11-22   
678  L00005132  A00000817  538500.0      60.0    8975.0       2 2019-12-20   
679  L00005447  A00002335  541200.0      60.0    9020.0       3 2022-09-14   
680  L00006791  A00008926  566640.0      60.0    9444.0       2 2022-11-25   
681  L00006534  A00007542  590820.0      60.0    9847.0       2 2022-08-21   

     location purpose risk_category  ...                       

In [14]:
model_data = all_data[["status", "age", "district_id", "amount", "duration", "fico"]].copy()
cols = ['age', 'district_id', 'duration', 'fico']
model_data[cols] = model_data[cols].map(np.int64)
X, y = model_data.drop('status', axis=1), model_data[['status']]
print(y)

     status
0         0
1         2
2         0
3         2
4         2
..      ...
677       2
678       2
679       3
680       2
681       2

[681 rows x 1 columns]


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7)
            
# instantiate the classifier 
xgb_model = xgb.XGBClassifier(objective="multi:softprob", learning_rate=0.00117)


# fit the classifier to the training data
xgb_model.fit(X_train, y_train)


y_pred = xgb_model.predict(X_test)

accuracy = xgb_model.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 75.56%


In [22]:
xgb_model.save_model('model/xgb_final_model.json')

In [None]:
print(y_train.dtypes)

status    category
dtype: object
