In [1]:
import os, sys
import warnings
import random
import time

import gym

import numpy as np
from skimage import transform, util, color
from collections import deque
import matplotlib.pyplot as plt

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [3]:
env = gym.make('CartPole-v0')
env = env.unwrapped
env.seed(1)

[1]

In [4]:
## ENVIRONMENT Hyperparameters
state_size = 4
action_size = env.action_space.n

## TRAINING Hyperparameters
max_episodes = 10000
learning_rate = 0.01
gamma = 0.95 # Discount rate

In [5]:
def discount_and_normalize(episode_rewards):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0.0
    for i in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[i]
        discounted_episode_rewards[i] = cumulative
    
    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    discounted_episode_rewards = (discounted_episode_rewards - mean) / (std)
    
    return discounted_episode_rewards

In [6]:
class PGN(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4, 10)
        self.fc2 = nn.Linear(10, 2)
        self.fc3 = nn.Linear(2, 2)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        x = F.log_softmax(x, dim=-1)
        return x

class NNet():
    def __init__(self):
        self.nnet = PGN().cuda()
        
    def train(self, states, actions, discounted_episode_rewards):
        optimizer = optim.Adam(self.nnet.parameters(), lr=learning_rate)
        self.nnet.train()
        states = torch.FloatTensor(states).contiguous().cuda()
        actions = torch.LongTensor(actions).contiguous().cuda()
        discounted_episode_rewards = torch.FloatTensor(discounted_episode_rewards).contiguous().cuda()
        
        states, actions = Variable(states), Variable(actions) ### !!! I WAS BREAKING THE COMPTUATION GRAPH
        pred = torch.exp(self.nnet(states)) ### DON'T WRAP PREDICTIONS IN A VARIABLE
#       #Variable(torch.max(actions.long(), 1)[1]),

        loss = self.loss(pred, actions, discounted_episode_rewards)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
#         a = list(self.nnet.parameters())[0].clone()
#         loss.backward()
#         optimizer.step()
#         b = list(self.nnet.parameters())[0].clone()
#         print(list(self.nnet.parameters())[0].grad)
        return loss
        
    def predict(self, state):
        state = torch.FloatTensor(state).contiguous().cuda()
        with torch.no_grad():
            state = Variable(state)
        self.nnet.eval()
        x = self.nnet(state)
        return torch.exp(x).data.cpu().numpy()
    
    def loss(self, pred, actions, discounted_episode_rewards):
        loss = F.cross_entropy(pred, actions, reduce=False)
        mean_loss = torch.mean(loss*discounted_episode_rewards)
        return mean_loss
'''
    def loss(self, states, actions, discounted_episode_rewards):
        neg_log_probs = F.cross_entropy(states, actions)
        return torch.mean(neg_log_probs * discounted_episode_rewards)
'''

'\n    def loss(self, states, actions, discounted_episode_rewards):\n        neg_log_probs = F.cross_entropy(states, actions)\n        return torch.mean(neg_log_probs * discounted_episode_rewards)\n'

In [7]:
nnet = NNet()

In [8]:
allRewards = []
total_rewards = 0
maximumRewardRecorded = 0
loss = 0
episode_states, episode_actions, episode_rewards = [],[],[]

for episode in range(max_episodes):
    episode_rewards_sum = 0
    state = env.reset()
    env.render()
    
    while True:
        pi = nnet.predict(state.reshape([1,4]))
#         print(pi)
        action = np.random.choice(range(pi.shape[1]), p=pi.ravel())
        new_state, reward, done, info = env.step(action)
        
        episode_states.append(state)
#         actions_ = np.zeros(action_size)
#         actions_[action] = 1
        episode_actions.append(action)
        episode_rewards.append(reward)
        if done:
            episode_rewards_sum = np.sum(episode_rewards)
            allRewards.append(episode_rewards_sum)
            total_rewards = np.sum(allRewards)
            mean_reward = np.divide(total_rewards, episode+1)
            maximumRewardRecorded = np.amax(allRewards)
            
            print("==========================================")
            print("Episode: ", episode)
            print(f"Loss: {loss}")
            print("Reward: ", episode_rewards_sum)
            print("Mean Reward", mean_reward)
            print("Max reward so far: ", maximumRewardRecorded)
            ###
#             print(np.vstack(np.array(episode_states)).shape)
#             print(np.vstack(np.array(episode_actions)).shape)
#             print(np.array(episode_actions).shape)
            discounted_episode_rewards = discount_and_normalize(episode_rewards)

            loss = nnet.train(np.vstack(np.array(episode_states)), np.array(episode_actions), discounted_episode_rewards)
            episode_states, episode_actions, episode_rewards = [],[],[]
            
            break
        state = new_state

Episode:  0
Loss: 0
Reward:  22.0
Mean Reward 22.0
Max reward so far:  22.0
Episode:  1
Loss: 0.01358126476407051
Reward:  25.0
Mean Reward 23.5
Max reward so far:  25.0
Episode:  2
Loss: -0.002757689915597439
Reward:  10.0
Mean Reward 19.0
Max reward so far:  25.0
Episode:  3
Loss: 0.022729426622390747
Reward:  11.0
Mean Reward 17.0
Max reward so far:  25.0
Episode:  4
Loss: -0.0614120252430439
Reward:  13.0
Mean Reward 16.2
Max reward so far:  25.0
Episode:  5
Loss: 0.04941382631659508
Reward:  12.0
Mean Reward 15.5
Max reward so far:  25.0
Episode:  6
Loss: -0.02353670634329319
Reward:  20.0
Mean Reward 16.142857142857142
Max reward so far:  25.0
Episode:  7
Loss: 0.006193923763930798
Reward:  10.0
Mean Reward 15.375
Max reward so far:  25.0
Episode:  8
Loss: -0.05598437786102295
Reward:  10.0
Mean Reward 14.777777777777779
Max reward so far:  25.0
Episode:  9
Loss: -0.001620602561160922
Reward:  12.0
Mean Reward 14.5
Max reward so far:  25.0
Episode:  10
Loss: 0.047970160841941833


Episode:  58
Loss: -0.004958071280270815
Reward:  68.0
Mean Reward 19.93220338983051
Max reward so far:  68.0
Episode:  59
Loss: -0.009539627470076084
Reward:  8.0
Mean Reward 19.733333333333334
Max reward so far:  68.0
Episode:  60
Loss: -0.022858411073684692
Reward:  27.0
Mean Reward 19.852459016393443
Max reward so far:  68.0
Episode:  61
Loss: 0.01916121132671833
Reward:  37.0
Mean Reward 20.129032258064516
Max reward so far:  68.0
Episode:  62
Loss: 0.010198920033872128
Reward:  13.0
Mean Reward 20.015873015873016
Max reward so far:  68.0
Episode:  63
Loss: -0.01630672998726368
Reward:  23.0
Mean Reward 20.0625
Max reward so far:  68.0
Episode:  64
Loss: 0.00840594433248043
Reward:  13.0
Mean Reward 19.953846153846154
Max reward so far:  68.0
Episode:  65
Loss: -0.003934025764465332
Reward:  27.0
Mean Reward 20.060606060606062
Max reward so far:  68.0
Episode:  66
Loss: 0.017958054319024086
Reward:  39.0
Mean Reward 20.34328358208955
Max reward so far:  68.0
Episode:  67
Loss: -0.

Episode:  113
Loss: 0.01903255470097065
Reward:  91.0
Mean Reward 28.24561403508772
Max reward so far:  137.0
Episode:  114
Loss: -0.008772052824497223
Reward:  43.0
Mean Reward 28.37391304347826
Max reward so far:  137.0
Episode:  115
Loss: 0.011818278580904007
Reward:  32.0
Mean Reward 28.405172413793103
Max reward so far:  137.0
Episode:  116
Loss: 0.0010452140122652054
Reward:  48.0
Mean Reward 28.572649572649574
Max reward so far:  137.0
Episode:  117
Loss: -0.006088194902986288
Reward:  71.0
Mean Reward 28.93220338983051
Max reward so far:  137.0
Episode:  118
Loss: 0.006199568044394255
Reward:  31.0
Mean Reward 28.949579831932773
Max reward so far:  137.0
Episode:  119
Loss: 0.011231048963963985
Reward:  64.0
Mean Reward 29.241666666666667
Max reward so far:  137.0
Episode:  120
Loss: -0.019121669232845306
Reward:  27.0
Mean Reward 29.223140495867767
Max reward so far:  137.0
Episode:  121
Loss: 0.00860515795648098
Reward:  21.0
Mean Reward 29.15573770491803
Max reward so far:  

Episode:  169
Loss: -0.0106104277074337
Reward:  85.0
Mean Reward 31.929411764705883
Max reward so far:  175.0
Episode:  170
Loss: -0.0013521832879632711
Reward:  25.0
Mean Reward 31.88888888888889
Max reward so far:  175.0
Episode:  171
Loss: -0.06617569923400879
Reward:  98.0
Mean Reward 32.27325581395349
Max reward so far:  175.0
Episode:  172
Loss: -0.009952486492693424
Reward:  46.0
Mean Reward 32.35260115606936
Max reward so far:  175.0
Episode:  173
Loss: -0.021930063143372536
Reward:  37.0
Mean Reward 32.37931034482759
Max reward so far:  175.0
Episode:  174
Loss: -0.032818976789712906
Reward:  28.0
Mean Reward 32.354285714285716
Max reward so far:  175.0
Episode:  175
Loss: 0.021002139896154404
Reward:  41.0
Mean Reward 32.40340909090909
Max reward so far:  175.0
Episode:  176
Loss: -0.010033699683845043
Reward:  65.0
Mean Reward 32.58757062146893
Max reward so far:  175.0
Episode:  177
Loss: -0.01905211992561817
Reward:  61.0
Mean Reward 32.747191011235955
Max reward so far: 

Episode:  224
Loss: -0.005548620130866766
Reward:  172.0
Mean Reward 53.87111111111111
Max reward so far:  297.0
Episode:  225
Loss: -0.03157515823841095
Reward:  27.0
Mean Reward 53.75221238938053
Max reward so far:  297.0
Episode:  226
Loss: -0.08197806775569916
Reward:  194.0
Mean Reward 54.370044052863435
Max reward so far:  297.0
Episode:  227
Loss: 0.004935957491397858
Reward:  33.0
Mean Reward 54.276315789473685
Max reward so far:  297.0
Episode:  228
Loss: -0.07000289857387543
Reward:  130.0
Mean Reward 54.60698689956332
Max reward so far:  297.0
Episode:  229
Loss: -0.011256309226155281
Reward:  128.0
Mean Reward 54.926086956521736
Max reward so far:  297.0
Episode:  230
Loss: -0.007489695679396391
Reward:  116.0
Mean Reward 55.19047619047619
Max reward so far:  297.0
Episode:  231
Loss: -0.03507419675588608
Reward:  234.0
Mean Reward 55.96120689655172
Max reward so far:  297.0
Episode:  232
Loss: -0.0031164914835244417
Reward:  183.0
Mean Reward 56.506437768240346
Max reward 

Episode:  277
Loss: -0.006930183619260788
Reward:  383.0
Mean Reward 82.92805755395683
Max reward so far:  490.0
Episode:  278
Loss: -0.02967851422727108
Reward:  235.0
Mean Reward 83.47311827956989
Max reward so far:  490.0
Episode:  279
Loss: 0.001090522506274283
Reward:  452.0
Mean Reward 84.78928571428571
Max reward so far:  490.0
Episode:  280
Loss: -0.00801617931574583
Reward:  130.0
Mean Reward 84.95017793594306
Max reward so far:  490.0
Episode:  281
Loss: 0.013517796993255615
Reward:  277.0
Mean Reward 85.63120567375887
Max reward so far:  490.0
Episode:  282
Loss: -0.029111115261912346
Reward:  251.0
Mean Reward 86.2155477031802
Max reward so far:  490.0
Episode:  283
Loss: -0.004127329681068659
Reward:  183.0
Mean Reward 86.55633802816901
Max reward so far:  490.0
Episode:  284
Loss: -0.01815207116305828
Reward:  688.0
Mean Reward 88.66666666666667
Max reward so far:  688.0
Episode:  285
Loss: -0.006575309205800295
Reward:  287.0
Mean Reward 89.36013986013987
Max reward so f

Episode:  330
Loss: -0.01127453614026308
Reward:  3215.0
Mean Reward 222.51057401812687
Max reward so far:  6502.0
Episode:  331
Loss: -0.009121300652623177
Reward:  26371.0
Mean Reward 301.2710843373494
Max reward so far:  26371.0
Episode:  332
Loss: -0.0025385981425642967
Reward:  488.0
Mean Reward 301.83183183183183
Max reward so far:  26371.0
Episode:  333
Loss: -0.010141734033823013
Reward:  868.0
Mean Reward 303.52694610778445
Max reward so far:  26371.0
Episode:  334
Loss: -0.007405403070151806
Reward:  228.0
Mean Reward 303.30149253731344
Max reward so far:  26371.0
Episode:  335
Loss: -0.01504573319107294
Reward:  366.0
Mean Reward 303.48809523809524
Max reward so far:  26371.0
Episode:  336
Loss: 0.0010189785389229655
Reward:  307.0
Mean Reward 303.4985163204748
Max reward so far:  26371.0
Episode:  337
Loss: -0.013880766928195953
Reward:  273.0
Mean Reward 303.4082840236686
Max reward so far:  26371.0
Episode:  338
Loss: -0.01253447774797678
Reward:  210.0
Mean Reward 303.13

Loss: 0.0009427641634829342
Reward:  72.0
Mean Reward 297.3289817232376
Max reward so far:  26371.0
Episode:  383
Loss: -0.010502598248422146
Reward:  94.0
Mean Reward 296.7994791666667
Max reward so far:  26371.0
Episode:  384
Loss: -0.002135056769475341
Reward:  118.0
Mean Reward 296.33506493506496
Max reward so far:  26371.0
Episode:  385
Loss: 0.0015462354058399796
Reward:  97.0
Mean Reward 295.81865284974094
Max reward so far:  26371.0
Episode:  386
Loss: 0.026671631261706352
Reward:  230.0
Mean Reward 295.64857881136953
Max reward so far:  26371.0
Episode:  387
Loss: 0.0005565806641243398
Reward:  149.0
Mean Reward 295.27061855670104
Max reward so far:  26371.0
Episode:  388
Loss: 0.006988097447901964
Reward:  324.0
Mean Reward 295.34447300771205
Max reward so far:  26371.0
Episode:  389
Loss: -0.01116881798952818
Reward:  125.0
Mean Reward 294.9076923076923
Max reward so far:  26371.0
Episode:  390
Loss: -0.01666419208049774
Reward:  177.0
Mean Reward 294.6061381074169
Max rewar

Episode:  435
Loss: -0.03216893970966339
Reward:  255.0
Mean Reward 293.25688073394497
Max reward so far:  26371.0
Episode:  436
Loss: -0.021641623228788376
Reward:  168.0
Mean Reward 292.97025171624716
Max reward so far:  26371.0
Episode:  437
Loss: -0.038102369755506516
Reward:  144.0
Mean Reward 292.63013698630135
Max reward so far:  26371.0
Episode:  438
Loss: -0.04677071049809456
Reward:  244.0
Mean Reward 292.51936218678816
Max reward so far:  26371.0
Episode:  439
Loss: -0.0336996428668499
Reward:  203.0
Mean Reward 292.3159090909091
Max reward so far:  26371.0
Episode:  440
Loss: -0.034407954663038254
Reward:  160.0
Mean Reward 292.015873015873
Max reward so far:  26371.0
Episode:  441
Loss: -0.06000373512506485
Reward:  160.0
Mean Reward 291.71719457013575
Max reward so far:  26371.0
Episode:  442
Loss: -0.05276457220315933
Reward:  153.0
Mean Reward 291.40406320541763
Max reward so far:  26371.0
Episode:  443
Loss: -0.04107041284441948
Reward:  139.0
Mean Reward 291.060810810

Episode:  488
Loss: 0.009788600727915764
Reward:  470.0
Mean Reward 290.27811860940693
Max reward so far:  26371.0
Episode:  489
Loss: -0.012782618403434753
Reward:  336.0
Mean Reward 290.37142857142857
Max reward so far:  26371.0
Episode:  490
Loss: -0.025053853169083595
Reward:  360.0
Mean Reward 290.51323828920573
Max reward so far:  26371.0
Episode:  491
Loss: -0.01344847958534956
Reward:  1035.0
Mean Reward 292.0264227642276
Max reward so far:  26371.0
Episode:  492
Loss: -0.01707065850496292
Reward:  597.0
Mean Reward 292.6450304259635
Max reward so far:  26371.0
Episode:  493
Loss: -0.02076641283929348
Reward:  492.0
Mean Reward 293.0485829959514
Max reward so far:  26371.0
Episode:  494
Loss: -0.02145175263285637
Reward:  319.0
Mean Reward 293.1010101010101
Max reward so far:  26371.0
Episode:  495
Loss: -0.04251907393336296
Reward:  992.0
Mean Reward 294.5100806451613
Max reward so far:  26371.0
Episode:  496
Loss: -0.02108650654554367
Reward:  701.0
Mean Reward 295.3279678068

Episode:  541
Loss: -0.053028613328933716
Reward:  155.0
Mean Reward 345.3874538745387
Max reward so far:  26371.0
Episode:  542
Loss: -0.0566723607480526
Reward:  207.0
Mean Reward 345.1325966850829
Max reward so far:  26371.0
Episode:  543
Loss: -0.0439375601708889
Reward:  131.0
Mean Reward 344.7389705882353
Max reward so far:  26371.0
Episode:  544
Loss: -0.05390481650829315
Reward:  127.0
Mean Reward 344.3394495412844
Max reward so far:  26371.0
Episode:  545
Loss: -0.04896227642893791
Reward:  171.0
Mean Reward 344.02197802197804
Max reward so far:  26371.0
Episode:  546
Loss: -0.05185457319021225
Reward:  115.0
Mean Reward 343.6032906764168
Max reward so far:  26371.0
Episode:  547
Loss: -0.06726395338773727
Reward:  120.0
Mean Reward 343.19525547445255
Max reward so far:  26371.0
Episode:  548
Loss: -0.0520189143717289
Reward:  122.0
Mean Reward 342.79234972677597
Max reward so far:  26371.0
Episode:  549
Loss: -0.05905977636575699
Reward:  142.0
Mean Reward 342.42727272727274


Episode:  595
Loss: -0.0017291574040427804
Reward:  300.0
Mean Reward 330.61073825503354
Max reward so far:  26371.0
Episode:  596
Loss: -0.002632005140185356
Reward:  423.0
Mean Reward 330.7654941373534
Max reward so far:  26371.0
Episode:  597
Loss: -0.002741786651313305
Reward:  215.0
Mean Reward 330.571906354515
Max reward so far:  26371.0
Episode:  598
Loss: 0.009275214746594429
Reward:  217.0
Mean Reward 330.3823038397329
Max reward so far:  26371.0
Episode:  599
Loss: 0.0014896896900609136
Reward:  168.0
Mean Reward 330.1116666666667
Max reward so far:  26371.0
Episode:  600
Loss: -0.058018434792757034
Reward:  118.0
Mean Reward 329.7587354409318
Max reward so far:  26371.0
Episode:  601
Loss: -0.07141109555959702
Reward:  159.0
Mean Reward 329.4750830564784
Max reward so far:  26371.0
Episode:  602
Loss: -0.08023910969495773
Reward:  254.0
Mean Reward 329.34991708126034
Max reward so far:  26371.0
Episode:  603
Loss: -0.012937693856656551
Reward:  158.0
Mean Reward 329.06622516

Episode:  648
Loss: -0.001938891364261508
Reward:  154.0
Mean Reward 321.3605546995378
Max reward so far:  26371.0
Episode:  649
Loss: 0.001956469612196088
Reward:  191.0
Mean Reward 321.16
Max reward so far:  26371.0
Episode:  650
Loss: -0.007872438058257103
Reward:  174.0
Mean Reward 320.93394777265746
Max reward so far:  26371.0
Episode:  651
Loss: -0.014397933147847652
Reward:  302.0
Mean Reward 320.9049079754601
Max reward so far:  26371.0
Episode:  652
Loss: 0.011205606162548065
Reward:  264.0
Mean Reward 320.8177641653905
Max reward so far:  26371.0
Episode:  653
Loss: 0.0014242383185774088
Reward:  295.0
Mean Reward 320.7782874617737
Max reward so far:  26371.0
Episode:  654
Loss: -0.006620047613978386
Reward:  239.0
Mean Reward 320.6534351145038
Max reward so far:  26371.0
Episode:  655
Loss: -0.021799730136990547
Reward:  136.0
Mean Reward 320.3719512195122
Max reward so far:  26371.0
Episode:  656
Loss: 0.008376635611057281
Reward:  123.0
Mean Reward 320.07153729071536
Max r

Episode:  700
Loss: -0.01826331950724125
Reward:  2613.0
Mean Reward 319.62054208273895
Max reward so far:  26371.0
Episode:  701
Loss: -0.009715431369841099
Reward:  167.0
Mean Reward 319.4031339031339
Max reward so far:  26371.0
Episode:  702
Loss: -0.008180647157132626
Reward:  196.0
Mean Reward 319.2275960170697
Max reward so far:  26371.0
Episode:  703
Loss: -0.03623812645673752
Reward:  172.0
Mean Reward 319.01846590909093
Max reward so far:  26371.0
Episode:  704
Loss: -0.026339899748563766
Reward:  157.0
Mean Reward 318.7886524822695
Max reward so far:  26371.0
Episode:  705
Loss: -0.02381052076816559
Reward:  162.0
Mean Reward 318.5665722379603
Max reward so far:  26371.0
Episode:  706
Loss: -0.0333397276699543
Reward:  159.0
Mean Reward 318.34087694483736
Max reward so far:  26371.0
Episode:  707
Loss: -0.02185109816491604
Reward:  209.0
Mean Reward 318.1864406779661
Max reward so far:  26371.0
Episode:  708
Loss: -0.022920681163668633
Reward:  166.0
Mean Reward 317.971791255

Episode:  752
Loss: -0.010251990519464016
Reward:  261.0
Mean Reward 405.47011952191235
Max reward so far:  26371.0
Episode:  753
Loss: -0.03903556242585182
Reward:  187.0
Mean Reward 405.18037135278513
Max reward so far:  26371.0
Episode:  754
Loss: -0.02272029034793377
Reward:  183.0
Mean Reward 404.88609271523177
Max reward so far:  26371.0
Episode:  755
Loss: -0.03111361153423786
Reward:  213.0
Mean Reward 404.63227513227514
Max reward so far:  26371.0
Episode:  756
Loss: -0.01883489452302456
Reward:  198.0
Mean Reward 404.35931307793925
Max reward so far:  26371.0
Episode:  757
Loss: -0.03870425745844841
Reward:  145.0
Mean Reward 404.01715039577834
Max reward so far:  26371.0
Episode:  758
Loss: -0.013783099129796028
Reward:  247.0
Mean Reward 403.8102766798419
Max reward so far:  26371.0
Episode:  759
Loss: -0.017403850331902504
Reward:  324.0
Mean Reward 403.70526315789476
Max reward so far:  26371.0
Episode:  760
Loss: -0.0015081303426995873
Reward:  259.0
Mean Reward 403.5151

Episode:  807
Loss: -0.041467152535915375
Reward:  105.0
Mean Reward 391.0
Max reward so far:  26371.0
Episode:  808
Loss: -0.0787762925028801
Reward:  139.0
Mean Reward 390.6885043263288
Max reward so far:  26371.0
Episode:  809
Loss: -0.05253950506448746
Reward:  169.0
Mean Reward 390.4148148148148
Max reward so far:  26371.0
Episode:  810
Loss: -0.04352441802620888
Reward:  85.0
Mean Reward 390.03822441430333
Max reward so far:  26371.0
Episode:  811
Loss: -0.04762963578104973
Reward:  117.0
Mean Reward 389.70197044334975
Max reward so far:  26371.0
Episode:  812
Loss: -0.047683171927928925
Reward:  97.0
Mean Reward 389.3419434194342
Max reward so far:  26371.0
Episode:  813
Loss: -0.051921047270298004
Reward:  173.0
Mean Reward 389.07616707616705
Max reward so far:  26371.0
Episode:  814
Loss: -0.0654880478978157
Reward:  114.0
Mean Reward 388.73865030674847
Max reward so far:  26371.0
Episode:  815
Loss: -0.06611485034227371
Reward:  96.0
Mean Reward 388.3799019607843
Max reward s

Episode:  861
Loss: -0.04343213140964508
Reward:  371.0
Mean Reward 376.9756380510441
Max reward so far:  26371.0
Episode:  862
Loss: -0.03900858014822006
Reward:  412.0
Mean Reward 377.0162224797219
Max reward so far:  26371.0
Episode:  863
Loss: -0.04466129094362259
Reward:  200.0
Mean Reward 376.8113425925926
Max reward so far:  26371.0
Episode:  864
Loss: -0.050870295614004135
Reward:  230.0
Mean Reward 376.64161849710985
Max reward so far:  26371.0
Episode:  865
Loss: -0.045640017837285995
Reward:  323.0
Mean Reward 376.5796766743649
Max reward so far:  26371.0
Episode:  866
Loss: -0.04670344665646553
Reward:  285.0
Mean Reward 376.47404844290656
Max reward so far:  26371.0
Episode:  867
Loss: -0.05148163065314293
Reward:  229.0
Mean Reward 376.3041474654378
Max reward so far:  26371.0
Episode:  868
Loss: -0.04997444897890091
Reward:  195.0
Mean Reward 376.0955120828539
Max reward so far:  26371.0
Episode:  869
Loss: -0.04443328455090523
Reward:  181.0
Mean Reward 375.871264367816

Loss: -0.0752701461315155
Reward:  149.0
Mean Reward 370.3825136612022
Max reward so far:  26371.0
Episode:  915
Loss: -0.07026412338018417
Reward:  181.0
Mean Reward 370.17576419213975
Max reward so far:  26371.0
Episode:  916
Loss: -0.073110431432724
Reward:  170.0
Mean Reward 369.9574700109051
Max reward so far:  26371.0
Episode:  917
Loss: -0.07357048988342285
Reward:  171.0
Mean Reward 369.74074074074076
Max reward so far:  26371.0
Episode:  918
Loss: -0.07138711959123611
Reward:  157.0
Mean Reward 369.5092491838955
Max reward so far:  26371.0
Episode:  919
Loss: -0.0778302326798439
Reward:  133.0
Mean Reward 369.25217391304346
Max reward so far:  26371.0
Episode:  920
Loss: -0.08457376807928085
Reward:  140.0
Mean Reward 369.00325732899023
Max reward so far:  26371.0
Episode:  921
Loss: -0.08608877658843994
Reward:  174.0
Mean Reward 368.79175704989154
Max reward so far:  26371.0
Episode:  922
Loss: -0.08828503638505936
Reward:  157.0
Mean Reward 368.56229685807153
Max reward so 

Episode:  969
Loss: -0.07261215150356293
Reward:  141.0
Mean Reward 360.21958762886595
Max reward so far:  26371.0
Episode:  970
Loss: -0.08167921751737595
Reward:  177.0
Mean Reward 360.03089598352216
Max reward so far:  26371.0
Episode:  971
Loss: -0.08490487188100815
Reward:  183.0
Mean Reward 359.8487654320988
Max reward so far:  26371.0
Episode:  972
Loss: -0.07920881360769272
Reward:  145.0
Mean Reward 359.6279547790339
Max reward so far:  26371.0
Episode:  973
Loss: -0.09074068814516068
Reward:  188.0
Mean Reward 359.4517453798768
Max reward so far:  26371.0
Episode:  974
Loss: -0.08419723808765411
Reward:  159.0
Mean Reward 359.24615384615385
Max reward so far:  26371.0
Episode:  975
Loss: -0.10703631490468979
Reward:  235.0
Mean Reward 359.1188524590164
Max reward so far:  26371.0
Episode:  976
Loss: -0.06103651598095894
Reward:  216.0
Mean Reward 358.9723643807574
Max reward so far:  26371.0
Episode:  977
Loss: -0.04407772794365883
Reward:  178.0
Mean Reward 358.7873210633947

Episode:  1021
Loss: -0.04870959371328354
Reward:  250.0
Mean Reward 353.3825831702544
Max reward so far:  26371.0
Episode:  1022
Loss: -0.02841530367732048
Reward:  188.0
Mean Reward 353.2209188660802
Max reward so far:  26371.0
Episode:  1023
Loss: -0.07779958099126816
Reward:  268.0
Mean Reward 353.1376953125
Max reward so far:  26371.0
Episode:  1024
Loss: -0.07738585770130157
Reward:  307.0
Mean Reward 353.09268292682924
Max reward so far:  26371.0
Episode:  1025
Loss: -0.05118979513645172
Reward:  376.0
Mean Reward 353.11500974658867
Max reward so far:  26371.0
Episode:  1026
Loss: -0.0540560707449913
Reward:  10855.0
Mean Reward 363.3407984420643
Max reward so far:  26371.0
Episode:  1027
Loss: -0.010490866377949715
Reward:  20315.0
Mean Reward 382.7490272373541
Max reward so far:  26371.0


KeyboardInterrupt: 

input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
print(input.shape, target.shape)

In [9]:
for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    print("*****************")
    print("Episode ", episode)
    
    while True:
        pi = nnet.predict(state)
        action = np.argmax(pi)
        new_state, reward, done, info = env.step(action)
        
        if done:
            print("Reward: ", reward)
            print("Info: ", info)
            break
        state = new_state
env.close()

*****************
Episode  0


KeyboardInterrupt: 

In [None]:
state = env.reset()
print(state)