In [1]:
# !pip install d3rlpy

In [2]:
import matplotlib.pyplot as plt
!matplotlib inline

/bin/bash: matplotlib: command not found


In [3]:
import pandas as pd
import d3rlpy
import numpy as np
import pandas as pd

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

In [4]:
def previous_rate_process(data, mean=None):
    data['Previous_Rate_Known'] = data['Previous_Rate'] != -10
    prev_rate = data['Previous_Rate'].copy()
    if mean:
        prev_rate_mean = mean
    else:
        prev_rate_mean = prev_rate[prev_rate != -10].mean()
    prev_rate[prev_rate == -10] = prev_rate_mean
    data['Previous_Rate'] = prev_rate
    return prev_rate_mean

prev_rate_mean = previous_rate_process(train)
previous_rate_process(test, prev_rate_mean)
train.head()

Unnamed: 0,Tier,FICO,Term,Amount,Previous_Rate,Competition_rate,Rate,Cost_Funds,Partner Bin,Car_Type_N,Car_Type_R,Car_Type_U,Accept,Previous_Rate_Known
0,2,725,72,30500.0,5.0,6.09,4.99,1.12,2,0,1,0,0,True
1,1,739,60,25995.0,7.625269,4.79,4.79,1.959,2,0,0,1,0,False
2,1,781,60,39000.0,7.625269,4.25,4.25,1.12,2,1,0,0,0,False
3,2,718,72,30000.0,7.625269,5.39,5.44,1.3363,1,1,0,0,1,False
4,2,703,72,38000.0,7.625269,5.75,6.29,1.1338,1,0,0,1,0,False


In [5]:
train.describe()

Unnamed: 0,Tier,FICO,Term,Amount,Previous_Rate,Competition_rate,Rate,Cost_Funds,Partner Bin,Car_Type_N,Car_Type_R,Car_Type_U,Accept
count,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0,145659.0
mean,1.928429,726.695556,56.822441,25996.81976,7.625269,4.807957,5.623998,1.329209,2.029473,0.571815,0.226865,0.20132,0.220041
std,1.050519,44.727756,11.201037,11125.968255,1.323102,0.586018,1.545418,0.278368,0.911097,0.494817,0.418806,0.400988,0.414276
min,1.0,594.0,36.0,4526.62,3.0,2.99,2.45,1.02,1.0,0.0,0.0,0.0,0.0
25%,1.0,692.0,48.0,17753.45,7.625269,4.39,4.49,1.11,1.0,0.0,0.0,0.0,0.0
50%,2.0,726.0,60.0,25000.0,7.625269,4.79,5.09,1.2625,2.0,1.0,0.0,0.0,0.0
75%,3.0,762.0,60.0,33000.0,7.625269,5.19,6.39,1.4194,3.0,1.0,0.0,0.0,0.0
max,4.0,852.0,72.0,100000.0,24.0,6.45,15.53,2.127,3.0,1.0,1.0,1.0,1.0


In [6]:
from sklearn.linear_model import LogisticRegression

X = train[['Tier', 'FICO', 'Term', 'Amount', 'Previous_Rate', 'Competition_rate',
           'Cost_Funds', 'Partner Bin', 'Car_Type_N', 'Car_Type_R', 'Car_Type_U', 'Rate']].values
y = train['Accept'].values

clf = LogisticRegression(random_state=0, penalty='none', fit_intercept=True, multi_class='ovr', n_jobs = 4).fit(X, y)
probs_train = clf.predict_proba(X)[:,1]



In [7]:
from sklearn.model_selection import train_test_split

train_data, eval_data = train_test_split(train[:10000], test_size=0.15, shuffle=True)

In [8]:
observations = train_data[['Tier', 'FICO', 'Term', 'Amount', 'Previous_Rate', 'Competition_rate',
                           'Cost_Funds', 'Partner Bin', 'Car_Type_N', 'Car_Type_R', 'Car_Type_U']]
actions = train_data['Rate']

terminals = np.zeros_like(actions)
terminals[-1] = 1

In [9]:
observations = observations.values
actions = actions.values
actions = actions.reshape(-1,1)

In [10]:
def prob_accept(action, state):
    if isinstance(action, (int, float)):
        action = np.array(action)
    action = action.reshape(-1,1)
    if len(state.shape)==1:
        state = state.reshape(1,-1)
    x = np.concatenate((state, action),axis=1)
    probs_pred = clf.predict_proba(x)[:,1]
    return probs_pred

prob_accept(actions, observations)

array([0.62314798, 0.48864422, 0.48851737, ..., 0.21507262, 0.72913991,
       0.02747331])

In [11]:
def p_default(state):
    fico = state[:, 1]
    default_prob = fico.copy()
    default_prob[default_prob < 500] = 0.41
    default_prob[default_prob >= 750] = 0.01
    default_prob[default_prob >= 700] = 0.044
    default_prob[default_prob >= 650] = 0.089
    default_prob[default_prob >= 600] = 0.158
    default_prob[default_prob >= 550] = 0.225
    default_prob[default_prob >= 500] = 0.284
    return default_prob
    # return np.random.random(size=fico.shape)

def reward(action, state, risk_free = 0.04, loss_ratio=0.5):

    if len(state.shape)==1:
        state = state.reshape(1,-1)
    p_Accept= prob_accept(action, state)

    Sum_loan = state[:,3]
    Term = state[:,2]/12

    p_return = p_default(state)

    loss_given_default = loss_ratio*Sum_loan
    action_rate = action/100
    reward = (p_Accept*(Sum_loan*p_return*((1+action_rate)**Term-(1+risk_free)**Term)-(1-p_return)*loss_given_default))/Sum_loan

    return reward

def reward_vec(action, state, risk_free = 0.2, loss_ratio=0.5):
    if len(state.shape)==1:
        state = state.reshape(1,-1)

    p_Accept= prob_accept(action, state)

    Sum_loan = state[:,3]
    Term = state[:,2]/12

    p_return = p_default(state)

    loss_given_default = loss_ratio*Sum_loan
    action = action/100
    reward = (p_Accept*(Sum_loan*p_return*((1+action)**Term-(1+risk_free)**Term)-(1-p_return)*loss_given_default))/Sum_loan

    return reward
#what is a risk-free rate??
rewards = reward_vec(actions, observations, risk_free = 0.04, loss_ratio=0.5)

print(rewards)
# action = 0.3
# state  = observations[7,:]
# print(state.shape)
# rewardina = reward(action, state, risk_free = 0.2, loss_ratio=0.5)

[[-0.3081207  -0.2324073  -0.2322023  ... -0.09739541 -0.34719793
  -0.01359111]
 [-0.30730391 -0.22958914 -0.22901206 ... -0.09605458 -0.34402302
  -0.01357186]
 [-0.3070353  -0.22866235 -0.22795624 ... -0.09562471 -0.3429921
  -0.01356569]
 ...
 [-0.30762566 -0.23069929 -0.23027251 ... -0.09657652 -0.3452663
  -0.01357935]
 [-0.30812825 -0.23243333 -0.23223162 ... -0.09740805 -0.34722756
  -0.01359129]
 [-0.30846189 -0.23358451 -0.2335254  ... -0.09797179 -0.34854331
  -0.01359938]]


In [12]:
import gym

class Env_rl_bank(gym.Env):
    def __init__(self, dataset):
        super(Env_rl_bank, self).__init__()
        self.dataset = dataset
        self.curr_state = 0
        self.n_observation = dataset.shape[0]
        self.observation_tensor = dataset[['Tier', 'FICO', 'Term', 'Amount', 'Previous_Rate', 'Competition_rate',
        'Cost_Funds', 'Partner Bin', 'Car_Type_N', 'Car_Type_R','Car_Type_U']].values

        self.action_space = gym.spaces.Box(low=np.array([0]), high=np.array([100]), dtype=np.float16)

        self.observation_space = gym.spaces.Box(low = -10*np.ones(11)[None,:], high =  10e+10*np.ones(11)[None,:],
                                               shape=(1,11), dtype=np.float16)
    def step(self, action):
        #update state
        #return rewarn
        #done is an indicator that the whole dataset is
        # next_s, r, done, _ = env.step(a)
        state  =  self.observation_tensor[self.curr_state]
        if self.curr_state >= self.n_observation-2:
            done = True
        else:

            done = False
        self.curr_state +=1
        next_s = self.observation_tensor[self.curr_state]
        rew = reward_vec(action, state, risk_free = 0.04, loss_ratio=0.5)

        info = f'reward = {rew}, at state is done = {done}'

        return next_s, rew, done, info

    def reset(self):
        self.curr_state = 0
        state  =  self.observation_tensor[self.curr_state]
        return state
#     def sample(self, ...):
#         return


In [13]:
d3rlpy.__version__

'1.1.1'

In [14]:
# prepare dataset
#dataset, env = d3rlpy.datasets.get_d4rl('hopper-medium-v0')

# prepare algorithm
#do not forget  about cql parametres in a paper
cql = d3rlpy.algos.CQL(use_gpu=True, n_steps=5,
                       batch_size = 256,
                        hidden_units = [64, 64, 64, 64], weight_decay =0.0001,
gamma = 0.999, n_critics =2,alpha_threshold=10,conservative_weight=5,)
                       

       
                    

env  = Env_rl_bank(train_data)
terminals = np.random.randint(2, size = actions.shape[0])

dataset = d3rlpy.dataset.MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals,

)
eval_obs = eval_data[['Tier', 'FICO', 'Term', 'Amount', 'Previous_Rate', 'Competition_rate',
        'Cost_Funds', 'Partner Bin', 'Car_Type_N', 'Car_Type_R','Car_Type_U']].values
eval_actions = eval_data['Rate'].values.reshape(-1,1)

eval_rewards = reward_vec(eval_actions, eval_obs, risk_free = 0.04, loss_ratio=0.5)
eval_terminals = np.zeros_like(eval_actions)

eval_dataset = d3rlpy.dataset.MDPDataset(
    observations=eval_obs,
    actions=eval_actions,
    rewards=eval_rewards,
    terminals=eval_terminals,)


print(dataset.get_action_size())
#print(dataset.episodes)
# correct format for observations, actions, rewards, terminals
# observation = [...,...,..,]
# rewards = reward()
# actions =interest rate path
#terminals = 

# First of all, you need to prepare your logged data. 
# In this tutorial, let’s use randomly generated data.
# terminals represents the last step of episodes. 
# If terminals[i] == 1.0, i-th step is the terminal state. 
# Otherwise you need to set zeros for non-terminal states.

# train
cql.fit(
    dataset,
    eval_episodes=dataset,
    n_epochs=20,

    scorers={'environment': d3rlpy.metrics.evaluate_on_environment(env),
            'td_error': d3rlpy.metrics.td_error_scorer,},)



  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  self.high = high.astype(self.dtype)


1
[2m2023-10-22 11:53:40[0m [[32m[1mdebug    [0m] [1mRoundIterator is selected.[0m
[2m2023-10-22 11:53:40[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/CQL_20231022115340[0m
[2m2023-10-22 11:53:40[0m [[32m[1mdebug    [0m] [1mBuilding models...[0m
[2m2023-10-22 11:53:43[0m [[32m[1mdebug    [0m] [1mModels have been built.[0m
[2m2023-10-22 11:53:43[0m [[32m[1minfo     [0m] [1mParameters are saved to d3rlpy_logs/CQL_20231022115340/params.json[0m [36mparams[0m=[35m{'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 0.0001, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_learning_rate': 0.0001, 'alpha_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_threshold': 10, 'bat

Epoch 1/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 11:55:36[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=1 step=33[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004996241945208925, 'time_algorithm_update': 0.05783060102751761, 'temp_loss': 1.6237169468041621, 'temp': 0.9983071594527273, 'alpha_loss': -5.713383327830922, 'alpha': 1.0015706546378857, 'critic_loss': 10.68881232810743, 'actor_loss': -0.2387076055235935, 'time_step': 0.05842078093326453, 'environment': -2340.686983344406, 'td_error': 2.746102146156842}[0m [36mstep[0m=[35m33[0m
[2m2023-10-22 11:55:36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_33.pt[0m


Epoch 2/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 11:57:28[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=2 step=66[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0005211974635268703, 'time_algorithm_update': 0.03426418159947251, 'temp_loss': 1.6371877590815227, 'temp': 0.9950189861384305, 'alpha_loss': -1.9886429960077459, 'alpha': 1.0039760343956226, 'critic_loss': 11.551553653948234, 'actor_loss': 0.3965475062529246, 'time_step': 0.03487339164271499, 'environment': -2340.686983344406, 'td_error': 2.4598990317365392}[0m [36mstep[0m=[35m66[0m
[2m2023-10-22 11:57:28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_66.pt[0m


Epoch 3/20:   0%|          | 0/33 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[2m2023-10-22 11:59:22[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=3 step=99[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00044319123932809543, 'time_algorithm_update': 0.02808962446270567, 'temp_loss': 1.6321363593592788, 'temp': 0.9917215072747433, 'alpha_loss': 2.231393072867032, 'alpha': 1.004785758076292, 'critic_loss': 2.5081376541744578, 'actor_loss': 1.3371190266175703, 'time_step': 0.028604442423040218, 'environment': -1874.6092434976824, 'td_error': 5.523020461788358}[0m [36mstep[0m=[35m99[0m
[2m2023-10-22 11:59:22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_99.pt[0m


Epoch 4/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:01:15[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=4 step=132[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00039578206611402106, 'time_algorithm_update': 0.021527608235677082, 'temp_loss': 1.592938806071426, 'temp': 0.9884913672100414, 'alpha_loss': 9.315725355437309, 'alpha': 1.0018555395530933, 'critic_loss': -5.213120980696245, 'actor_loss': 2.870165037386345, 'time_step': 0.02198583429509943, 'environment': -1874.6092434976824, 'td_error': 15.084121294814512}[0m [36mstep[0m=[35m132[0m
[2m2023-10-22 12:01:15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_132.pt[0m


Epoch 5/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:03:08[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=5 step=165[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003860791524251302, 'time_algorithm_update': 0.021605101498690518, 'temp_loss': 1.52327996673006, 'temp': 0.9853008859085314, 'alpha_loss': 18.889006412390508, 'alpha': 0.9958358623764731, 'critic_loss': -2.429726831840746, 'actor_loss': 5.260635578271114, 'time_step': 0.02205169562137488, 'environment': -1874.6092434976824, 'td_error': 24.937937001131466}[0m [36mstep[0m=[35m165[0m
[2m2023-10-22 12:03:08[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_165.pt[0m


Epoch 6/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:05:01[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=6 step=198[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004002975695060961, 'time_algorithm_update': 0.021437103098089046, 'temp_loss': 1.4339276335456155, 'temp': 0.9822487379565383, 'alpha_loss': 32.75629846977465, 'alpha': 0.9891944328943888, 'critic_loss': -21.661388455015242, 'actor_loss': 8.725455891002309, 'time_step': 0.021897641095248135, 'environment': -1874.6092434976824, 'td_error': 57.01309185949101}[0m [36mstep[0m=[35m198[0m
[2m2023-10-22 12:05:01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_198.pt[0m


Epoch 7/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:06:54[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=7 step=231[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003809350909608783, 'time_algorithm_update': 0.02142929308342211, 'temp_loss': 1.2007782793406285, 'temp': 0.9794514811400211, 'alpha_loss': 45.94932082205108, 'alpha': 0.9822942614555359, 'critic_loss': 45.381129987312086, 'actor_loss': 11.867464499040084, 'time_step': 0.021870230183456883, 'environment': -1874.6092434976824, 'td_error': 79.42596346357463}[0m [36mstep[0m=[35m231[0m
[2m2023-10-22 12:06:54[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_231.pt[0m


Epoch 8/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:08:46[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=8 step=264[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003866860360810251, 'time_algorithm_update': 0.02151966817451246, 'temp_loss': 1.1622975197705356, 'temp': 0.9768798730590127, 'alpha_loss': 49.338066563461766, 'alpha': 0.9760470787684122, 'critic_loss': 16.491232322924066, 'actor_loss': 11.8220346624201, 'time_step': 0.0219673676924272, 'environment': -1874.6092434976824, 'td_error': 104.93138094197766}[0m [36mstep[0m=[35m264[0m
[2m2023-10-22 12:08:46[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_264.pt[0m


Epoch 9/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:10:39[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=9 step=297[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003819321141098485, 'time_algorithm_update': 0.021539738683989555, 'temp_loss': 1.0699425809311145, 'temp': 0.9744376269253817, 'alpha_loss': 67.68974477594548, 'alpha': 0.9701059623198076, 'critic_loss': -37.969658475933656, 'actor_loss': 15.542186014580004, 'time_step': 0.021982633706295128, 'environment': -1874.6092434976824, 'td_error': 146.65724929379633}[0m [36mstep[0m=[35m297[0m
[2m2023-10-22 12:10:39[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_297.pt[0m


Epoch 10/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:12:32[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=10 step=330[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00038440299756599194, 'time_algorithm_update': 0.021460627064560398, 'temp_loss': 0.6150846029772903, 'temp': 0.9722709583513665, 'alpha_loss': 82.41363825942531, 'alpha': 0.9639386137326559, 'critic_loss': -54.66073492801551, 'actor_loss': 20.078582474679656, 'time_step': 0.021905053745616566, 'environment': -1874.6092434976824, 'td_error': 217.59944362973732}[0m [36mstep[0m=[35m330[0m
[2m2023-10-22 12:12:32[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_330.pt[0m


Epoch 11/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:14:26[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=11 step=363[0m [36mepoch[0m=[35m11[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0003883910901618726, 'time_algorithm_update': 0.02154266473018762, 'temp_loss': 0.4119911187074401, 'temp': 0.9709123969078064, 'alpha_loss': 100.77102406819661, 'alpha': 0.9577433503035343, 'critic_loss': -86.17533955429539, 'actor_loss': 25.34544037327622, 'time_step': 0.021990523193821762, 'environment': -1874.6092434976824, 'td_error': 325.03443405384115}[0m [36mstep[0m=[35m363[0m
[2m2023-10-22 12:14:26[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_363.pt[0m


Epoch 12/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:16:34[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=12 step=396[0m [36mepoch[0m=[35m12[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.000431602651422674, 'time_algorithm_update': 0.027480009830359257, 'temp_loss': 0.11017769445298296, 'temp': 0.9701588876319654, 'alpha_loss': 121.6657846624201, 'alpha': 0.9514844706564238, 'critic_loss': -90.16615676879883, 'actor_loss': 30.87291284040971, 'time_step': 0.02798249504782937, 'environment': -1874.6092434976824, 'td_error': 442.908262552799}[0m [36mstep[0m=[35m396[0m
[2m2023-10-22 12:16:34[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_396.pt[0m


Epoch 13/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:18:43[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=13 step=429[0m [36mepoch[0m=[35m13[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004086060957475142, 'time_algorithm_update': 0.03166440761450565, 'temp_loss': -0.13216229545121844, 'temp': 0.9699635884978555, 'alpha_loss': 146.7492439963601, 'alpha': 0.9452715407718312, 'critic_loss': -118.91581841671106, 'actor_loss': 38.15061326460405, 'time_step': 0.032144170818906845, 'environment': -1874.6092434976824, 'td_error': 650.7476795278353}[0m [36mstep[0m=[35m429[0m
[2m2023-10-22 12:18:43[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_429.pt[0m


Epoch 14/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:20:52[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=14 step=462[0m [36mepoch[0m=[35m14[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004115610411672881, 'time_algorithm_update': 0.03212561751856948, 'temp_loss': -0.30598966924078536, 'temp': 0.9704222715262211, 'alpha_loss': 179.71335671164772, 'alpha': 0.9389442512483308, 'critic_loss': -153.7265731349136, 'actor_loss': 48.11664904970111, 'time_step': 0.03260627659884366, 'environment': -1874.6092434976824, 'td_error': 994.103305361667}[0m [36mstep[0m=[35m462[0m
[2m2023-10-22 12:20:52[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_462.pt[0m


Epoch 15/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:23:01[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=15 step=495[0m [36mepoch[0m=[35m15[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004231785282944188, 'time_algorithm_update': 0.03222853487188166, 'temp_loss': -0.7074340256777677, 'temp': 0.9716423713799679, 'alpha_loss': 219.27998213334516, 'alpha': 0.9324591466874788, 'critic_loss': -177.7603514700225, 'actor_loss': 60.0228298071659, 'time_step': 0.03272714759364272, 'environment': -1874.6092434976824, 'td_error': 1389.1026779294057}[0m [36mstep[0m=[35m495[0m
[2m2023-10-22 12:23:01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_495.pt[0m


Epoch 16/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:24:54[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=16 step=528[0m [36mepoch[0m=[35m16[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004634784929680102, 'time_algorithm_update': 0.031099124388261276, 'temp_loss': -0.9518701759251681, 'temp': 0.9737807765151515, 'alpha_loss': 261.31396438136244, 'alpha': 0.9259273138913241, 'critic_loss': -217.0195890484434, 'actor_loss': 73.54899227257931, 'time_step': 0.0316428415703051, 'environment': -1874.6092434976824, 'td_error': 2130.369864562529}[0m [36mstep[0m=[35m528[0m
[2m2023-10-22 12:24:54[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_528.pt[0m


Epoch 17/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:26:48[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=17 step=561[0m [36mepoch[0m=[35m17[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004762013753255208, 'time_algorithm_update': 0.031874223188920456, 'temp_loss': -1.174212117989858, 'temp': 0.9767458818175576, 'alpha_loss': 319.4331267385772, 'alpha': 0.9193204388473973, 'critic_loss': -253.7155401056463, 'actor_loss': 92.00234291770242, 'time_step': 0.032428040648951675, 'environment': -1874.6092434976824, 'td_error': 3158.7772475582924}[0m [36mstep[0m=[35m561[0m
[2m2023-10-22 12:26:48[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_561.pt[0m


Epoch 18/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:28:41[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=18 step=594[0m [36mepoch[0m=[35m18[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00043946323972759825, 'time_algorithm_update': 0.027440331198952415, 'temp_loss': -1.4111106305411367, 'temp': 0.9803627043059377, 'alpha_loss': 383.92847234552556, 'alpha': 0.9126209183172747, 'critic_loss': -291.0142563328599, 'actor_loss': 113.43180939645478, 'time_step': 0.027950770927198006, 'environment': -1874.6092434976824, 'td_error': 4656.6735249392195}[0m [36mstep[0m=[35m594[0m
[2m2023-10-22 12:28:41[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_594.pt[0m


Epoch 19/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:30:35[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=19 step=627[0m [36mepoch[0m=[35m19[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004314870545358369, 'time_algorithm_update': 0.028003974394364792, 'temp_loss': -1.7222281513792095, 'temp': 0.9846776001381151, 'alpha_loss': 461.1324990012429, 'alpha': 0.905869747653152, 'critic_loss': -325.61476967551494, 'actor_loss': 139.36203211004084, 'time_step': 0.02850619951883952, 'environment': -1874.6092434976824, 'td_error': 6751.576025436997}[0m [36mstep[0m=[35m627[0m
[2m2023-10-22 12:30:35[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_627.pt[0m


Epoch 20/20:   0%|          | 0/33 [00:00<?, ?it/s]

[2m2023-10-22 12:32:28[0m [[32m[1minfo     [0m] [1mCQL_20231022115340: epoch=20 step=660[0m [36mepoch[0m=[35m20[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0004469625877611565, 'time_algorithm_update': 0.029152516162756718, 'temp_loss': -1.9584094972321482, 'temp': 0.9895264452153986, 'alpha_loss': 548.5615983442826, 'alpha': 0.8990937814568029, 'critic_loss': -337.4928329930161, 'actor_loss': 169.1609797622218, 'time_step': 0.029675881067911785, 'environment': -1874.6092434976824, 'td_error': 9328.10997401086}[0m [36mstep[0m=[35m660[0m
[2m2023-10-22 12:32:28[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/CQL_20231022115340/model_660.pt[0m


[(1,
  {'time_sample_batch': 0.0004996241945208925,
   'time_algorithm_update': 0.05783060102751761,
   'temp_loss': 1.6237169468041621,
   'temp': 0.9983071594527273,
   'alpha_loss': -5.713383327830922,
   'alpha': 1.0015706546378857,
   'critic_loss': 10.68881232810743,
   'actor_loss': -0.2387076055235935,
   'time_step': 0.05842078093326453,
   'environment': -2340.686983344406,
   'td_error': 2.746102146156842}),
 (2,
  {'time_sample_batch': 0.0005211974635268703,
   'time_algorithm_update': 0.03426418159947251,
   'temp_loss': 1.6371877590815227,
   'temp': 0.9950189861384305,
   'alpha_loss': -1.9886429960077459,
   'alpha': 1.0039760343956226,
   'critic_loss': 11.551553653948234,
   'actor_loss': 0.3965475062529246,
   'time_step': 0.03487339164271499,
   'environment': -2340.686983344406,
   'td_error': 2.4598990317365392}),
 (3,
  {'time_sample_batch': 0.00044319123932809543,
   'time_algorithm_update': 0.02808962446270567,
   'temp_loss': 1.6321363593592788,
   'temp': 0.9

# Inference

In [18]:
from sklearn.model_selection import train_test_split
test_data = pd.read_csv('test_data.csv')

In [32]:
# use policy  and collect reward??
# How to check???

test_actions = test_data['Rate'].values
test_states = test_data[['Tier', 'FICO', 'Term', 'Amount', 'Previous_Rate', 'Competition_rate',
        'Cost_Funds', 'Partner Bin', 'Car_Type_N', 'Car_Type_R','Car_Type_U']].values
hist_rewards_test = reward_vec(test_actions, test_states, risk_free = 0.04, loss_ratio=0.5)
env  = Env_rl_bank(test_data)
state = env.reset()
rewards_cql = []
for step_i in range(len(test_data)):
    actions = cql.predict(state)
    next_state, rew, done, info = env.step(test_actions)
    rewards_cql.append(rew)
    state = next_state




  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")
  self.high = high.astype(self.dtype)


AssertionError: Input must have batch dimension.

In [29]:
cum_sum_rew_cql = np.cumsum(rewards_cql)
cum_sum_rew_hist = np.cumsum(hist_rewards_test)

In [30]:
cum_sum_rew_hist, cum_sum_rew_cql

(array([-1.47296613e-03, -1.14274973e-01, -1.19916873e-01, ...,
        -2.77118592e+03, -2.77119159e+03, -2.77133017e+03]),
 array([], dtype=float64))