In [None]:
import time
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import cv2
from DDQN_Agent import DDQNAgent

In [None]:
from Geosteering_gym import Geosteering

In [None]:
def train_func(runs, bd, val, lookahead, gamma, writer):
    """Train a DDQN agent on the Geosteering environment.

    Every 2000 episodes, two consecutive episodes are rendered and their
    frames appended to ``Training.mp4``. The agent's models are saved each
    time the 100-episode moving-average score reaches a new best.

    Parameters
    ----------
    runs : int
        Number of training episodes.
    bd, val :
        Only used to label the TensorBoard run and the saved model name.
    lookahead :
        Only used in the saved model name; the state size is derived from
        ``env.look_ahead``, not from this argument.
    gamma : float
        Discount factor forwarded to ``DDQNAgent``.
    writer : bool
        When truthy, loss and reward scalars are logged to TensorBoard.

    Returns
    -------
    None
    """
    # Guard kept from the original: the function is a no-op unless the
    # defining module runs as the main script (always true in a notebook).
    if __name__ != '__main__':
        return

    start = time.time()
    env = Geosteering(render_=True, eval=False)
    n_games = runs
    eps_min = 0.01
    eps_dec = 0.9995
    record_period = 2000   # a new recording window opens every this many episodes
    record_length = 2      # number of consecutive episodes written to the video
    loss_period = 5000     # learn steps between averaged-loss log points

    # Keep the caller's flag and the writer object in separate names.
    tb_writer = None
    if writer:
        tb_writer = SummaryWriter(comment="_BD=%s_Val=%s_v1" % (bd, val))

    video = None
    try:
        scores = np.zeros([n_games])
        avg_score = np.zeros([n_games])
        states = (2 * env.look_ahead + 8,)
        bd_actions = 2 * env.bd_step + 2

        bd_agent = DDQNAgent(gamma=gamma, lr=0.0005, n_actions=bd_actions, n_states=states,
                             batch_size=32, mem_size=25000, replace=1000,
                             saved_dir='trained network/',
                             env_name='Geosteering_bitdepth_BD=%s_Val=%s_LookAhead=%s_GYM1' % (bd, val, lookahead))

        best_score = -np.inf
        # 5 fps, 1280x720 -- assumes env.canvas frames match this size (TODO confirm).
        video = cv2.VideoWriter('Training.mp4',
                                cv2.VideoWriter_fourcc(*'MP4V'),
                                5, (1280, 720))
        record_start = 0  # first episode index of the current recording window

        for i in tqdm(range(n_games)):
            done = False
            observation = env.reset()
            score = 0
            if i % record_period == 0:
                env.render_ = True
            recording = record_start <= i < record_start + record_length

            while not done:
                action = bd_agent.choose_action(observation)
                # While env.exit == 0 the last action index (2*bd_step + 1) is
                # replaced by the middle index bd_step.
                # NOTE(review): presumably "exit" is not yet allowed -- confirm.
                if env.exit == 0 and action == env.bd_step * 2 + 1:
                    action = env.bd_step
                observation_, reward, done, info = env.step(action)

                if recording:
                    video.write(env.canvas)
                score += reward
                bd_agent.store_transition(observation, action,
                                          reward, observation_, done)
                bd_agent.learn()

                if tb_writer is not None and bd_agent.learn_idx % loss_period == loss_period - 1:
                    loss_plot = bd_agent.running_loss / loss_period
                    bd_agent.running_loss = 0
                    tb_writer.add_scalar("loss", loss_plot, bd_agent.learn_idx)
                observation = observation_

            # Last episode of the window: schedule the next one, stop rendering.
            if i == record_start + record_length - 1:
                record_start += record_period
                env.render_ = False

            # Multiplicative decay, clamped so epsilon never drops below eps_min.
            bd_agent.epsilon = max(bd_agent.epsilon * eps_dec, eps_min)

            scores[i] = score
            if i >= 100:
                avg_score[i] = np.mean(scores[i - 99:i + 1])

                if avg_score[i] > best_score:
                    bd_agent.save_models()
                    best_score = avg_score[i]

            if tb_writer is not None:
                tb_writer.add_scalar("reward_100", avg_score[i], i)
                tb_writer.add_scalar("reward", scores[i], i)

        elapsed_time = time.time() - start
        print('======== Finished with elapsed time of %.2f' % elapsed_time, 'seconds ========')
    finally:
        # Finalize the video file and flush TensorBoard even if training fails.
        if video is not None:
            video.release()
        if tb_writer is not None:
            tb_writer.close()

In [None]:
# Sweep over every (val, bd) combination; each one launches a full training
# run of 10000 episodes with TensorBoard logging turned off.
val_options = [2]
bd_options = [2]
for val, bd in [(v, b) for v in val_options for b in bd_options]:
    train_func(runs=10000, bd=bd, val=val, lookahead=1, gamma=0.99, writer=False)