In [None]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error

In [None]:
# Read the three raw tables (dispositions, clients, loans) from parquet, in
# that order, using the fastparquet engine for each.
disp_df, client_df, loans_df = (
    pd.read_parquet(parquet_path, engine='fastparquet')
    for parquet_path in (
        "data/disposition.parquet",
        "data/client.parquet",
        "data/loan.parquet",
    )
)

In [None]:
def gen_fico(status):
    """Draw one synthetic FICO-style score for a loan status.

    The score is a single sample from a normal distribution with standard
    deviation 75 whose mean depends on ``status``: 790 for "A", 760 for "B",
    675 for "C", and 540 for anything else. The sample is rounded to the
    nearest whole number.

    Uses NumPy's global random state, so seed ``np.random`` beforehand for
    reproducible output.

    Parameters
    ----------
    status : object
        Status value, compared with ``==`` against "A", "B" and "C".

    Returns
    -------
    numpy.float64
        The rounded sample.
    """
    center = 540  # mean used for every status other than A/B/C
    for label, mean in (("A", 790), ("B", 760), ("C", 675)):
        if status == label:
            center = mean
            break
    sample = np.random.normal(loc=center, scale=75, size=1)
    return sample[0].round()

def convert_status(status):
    """Translate a status value into an integer class label.

    "A" -> 0, "B" -> 1, "C" -> 2; every other value maps to 3.

    Parameters
    ----------
    status : object
        Status value, compared with ``==`` against "A", "B" and "C".

    Returns
    -------
    int
        The class label in the range 0..3.
    """
    for code, letter in enumerate(("A", "B", "C")):
        if status == letter:
            return code
    return 3

In [None]:
# Build one row per loan: attach the account's "Owner" disposition, then join
# the client table on client_id. Both joins are left joins, so a loan is kept
# even when no matching owner/client row exists.
all_data = pd.merge(
    loans_df,
    disp_df[disp_df.type == "Owner"],
    on='account_id',
    how='left'
)
all_data = pd.merge(
    all_data,
    # The client table's 'fulldate' column is exposed as 'dob' in the merged frame.
    client_df.rename(columns={'fulldate': 'dob'}),
    on='client_id',
    how='left'
)

# loan_id has a one-character prefix: drop it and parse the remainder as a
# number in a single vectorised call (instead of a per-element .apply).
all_data['loannum'] = pd.to_numeric(all_data['loan_id'].str[1:])

# Keep only loans numbered below 7308.
# NOTE(review): the origin of the 7308 cutoff is not documented here - confirm.
# .copy() makes the filtered frame independent of the merged frame, so the
# column assignments below do not raise SettingWithCopyWarning.
all_data = all_data[all_data['loannum'] < 7308].copy()

# Seed right before drawing so the synthetic scores are reproducible.
np.random.seed(9456)
# fico must be derived from the original letter status, so it is generated
# before 'status' is overwritten with its integer encoding.
all_data['fico'] = all_data['status'].apply(gen_fico)
all_data['status'] = all_data['status'].apply(convert_status)

print(all_data)

In [None]:
# Pull the label (status) and the model features into an independent frame.
model_data = all_data[["status", "age", "district_id", "amount", "duration", "fico"]].copy()

# Cast these feature columns to int64 in one vectorised step. astype replaces
# the element-wise DataFrame.map(np.int64), which is slower and only exists
# in pandas >= 2.1.
cols = ['age', 'district_id', 'duration', 'fico']
model_data = model_data.astype(dict.fromkeys(cols, np.int64))

# X: feature matrix; y: single-column DataFrame holding the status label.
X, y = model_data.drop('status', axis=1), model_data[['status']]
print(y)

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7)

# Instantiate the multi-class classifier (softprob objective).
xgb_model = xgb.XGBClassifier(objective="multi:softprob", learning_rate=0.00117)

# Fit the classifier to the training data.
xgb_model.fit(X_train, y_train)

# Predict once and score those predictions. accuracy_score(y_test, y_pred) is
# the same metric xgb_model.score(X_test, y_test) reports, but it reuses
# y_pred instead of running a second prediction pass over X_test.
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Persist the fitted classifier as JSON. The output directory is created first
# so the save does not fail on a fresh checkout where 'model/' is missing.
os.makedirs('model', exist_ok=True)
xgb_model.save_model('model/xgb_final_model.json')