# Student Loan: XGBoost

In this chapter we will apply XGBoost's `XGBClassifier` to our student loan data to predict the `paid_label` target.

## Import Packages

In [None]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection
import xgboost
from xgboost import XGBClassifier
# Cap DataFrame/Series displays at 10 rows so the large frames below render compactly.
pd.set_option('display.max_rows', 10)

## Read-In Data

In [None]:
# Load the full loan-level dataset from a path relative to the working directory.
csv_path = '../data/student_loan.csv'
df_train = pd.read_csv(csv_path)
df_train

Unnamed: 0,load_id,deal_name,loan_age,cosign,income_annual,upb,monthly_payment,fico,origbalance,mos_to_repay,repay_status,mos_to_balln,paid_label
0,765579,2014_b,56,0,113401.60,36011.11,397.91,814,51453.60,0,0,124,0
1,765580,2014_b,56,1,100742.34,101683.38,1172.10,711,130271.33,0,0,124,0
2,765581,2014_b,56,0,46000.24,49249.37,593.57,772,62918.96,0,0,124,0
3,765582,2014_b,56,0,428958.96,36554.85,404.63,849,48238.73,0,0,125,0
4,765583,2014_b,56,0,491649.96,7022.30,1967.46,815,106124.68,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043306,1808885,2019_c,2,0,152885.00,115363.12,1212.22,798,116834.64,0,0,118,0
1043307,1808886,2019_c,2,0,116480.00,77500.70,831.13,826,79566.03,0,0,118,0
1043308,1808887,2019_c,2,0,96800.00,16156.76,232.34,781,16472.50,0,0,82,0
1043309,1808888,2019_c,2,0,78400.14,77197.03,833.57,777,78135.54,0,0,118,0


## Feature Selection

In [None]:
# Model inputs: every column of `df_train` except `load_id`, `deal_name`,
# and the target `paid_label`.
lst_features = [
    'loan_age',
    'cosign',
    'income_annual',
    'upb',
    'monthly_payment',
    'fico',
    'origbalance',
    'mos_to_repay',
    'repay_status',
    'mos_to_balln',
]
df_X = df_train[lst_features]
df_X

Unnamed: 0,loan_age,cosign,income_annual,upb,monthly_payment,fico,origbalance,mos_to_repay,repay_status,mos_to_balln
0,56,0,113401.60,36011.11,397.91,814,51453.60,0,0,124
1,56,1,100742.34,101683.38,1172.10,711,130271.33,0,0,124
2,56,0,46000.24,49249.37,593.57,772,62918.96,0,0,124
3,56,0,428958.96,36554.85,404.63,849,48238.73,0,0,125
4,56,0,491649.96,7022.30,1967.46,815,106124.68,0,0,4
...,...,...,...,...,...,...,...,...,...,...
1043306,2,0,152885.00,115363.12,1212.22,798,116834.64,0,0,118
1043307,2,0,116480.00,77500.70,831.13,826,79566.03,0,0,118
1043308,2,0,96800.00,16156.76,232.34,781,16472.50,0,0,82
1043309,2,0,78400.14,77197.03,833.57,777,78135.54,0,0,118


In [None]:
# Target vector: the 0/1 `paid_label` column (note this is a Series, not a DataFrame).
target_col = 'paid_label'
df_y = df_train[target_col]
df_y

0          0
1          0
2          0
3          0
4          0
          ..
1043306    0
1043307    0
1043308    0
1043309    0
1043310    0
Name: paid_label, Length: 1043311, dtype: int64

## Creating Holdout Sets

In [None]:
# Hold out 25% of the rows for evaluation (scikit-learn's default, spelled out
# here so the split size is visible); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df_X,
    df_y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Replace each split with an independent copy of itself.
# NOTE(review): presumably this avoids pandas chained-assignment warnings if the
# splits are modified later — confirm it is still needed.
X_train, X_test, y_train, y_test = (
    split.copy() for split in (X_train, X_test, y_train, y_test)
)

## Initial Fit

In [None]:
# Baseline model: XGBoost defaults, with log-loss as the evaluation metric.
# `XGBClassifier` is imported in the notebook's top import cell.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

In [None]:
# Predict labels for the holdout rows; the cells below sum `y_pred`,
# treating each prediction as a 0/1 flag.
y_pred = model.predict(X_test)

In [None]:
# Compare the number of positive labels in the holdout set with the number
# of positives the model predicts.
n_actual = y_test.sum()
n_predicted = y_pred.sum()
print('Actual:    ', n_actual)
print('Predicted: ', n_predicted)

Actual:     4227
Predicted:  1647


In [None]:
# Balance-weighted comparison: total `upb` on loans with a positive label
# versus total `upb` on loans the model predicts as positive.
# Each total is computed once with the vectorized `.sum()` instead of looping
# over the Series with Python's built-in `sum`.
upb_test = X_test['upb']
upb_actual = (upb_test * y_test).sum()
upb_predicted = (upb_test * y_pred).sum()
print('Actual:   ', upb_actual)
print('Predicted: ', upb_predicted)
print('Ratio:     ', upb_predicted / upb_actual)

Actual:    166234148.19000015
Predicted:  32604191.500000022
Ratio:      0.19613413883370406


In [None]:
# F1 score of the baseline predictions against the holdout labels
# (with scikit-learn's defaults this scores the positive class, label 1).
sklearn.metrics.f1_score(y_test, y_pred)

0.5110657133129044

## Modifying `scale_pos_weight`

In [None]:
# Refit on the training data, this time weighting the positive class 25x
# via `scale_pos_weight` to counter the class imbalance.
model = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=25,
)
model.fit(X_train, y_train)

In [None]:
# Re-predict on the holdout rows with the re-weighted model; this overwrites
# the baseline `y_pred` from the earlier section.
y_pred = model.predict(X_test)

In [None]:
# Positive-label count in the holdout set versus the count predicted by the
# re-weighted model.
n_actual = y_test.sum()
n_predicted = y_pred.sum()
print('Actual:    ', n_actual)
print('Predicted: ', n_predicted)

Actual:     4227
Predicted:  6905


In [None]:
# Balance-weighted comparison for the re-weighted model: total `upb` on loans
# with a positive label versus total `upb` on loans predicted as positive.
# Each total is computed once with the vectorized `.sum()` instead of looping
# over the Series with Python's built-in `sum`.
upb_test = X_test['upb']
upb_actual = (upb_test * y_test).sum()
upb_predicted = (upb_test * y_pred).sum()
print('Actual:   ', upb_actual)
print('Predicted: ', upb_predicted)
print('Ratio:     ', upb_predicted / upb_actual)

Actual:    166234148.19000015
Predicted:  128185269.83000004
Ratio:      0.7711127420311289


In [None]:
# F1 score of the re-weighted model's predictions against the holdout labels
# (with scikit-learn's defaults this scores the positive class, label 1).
sklearn.metrics.f1_score(y_test, y_pred)

0.3645346748113546