In [1]:
# Spring 2022, IOC 5259 Reinforcement Learning
# HW1-partII: REINFORCE and baseline

import gym
from itertools import count
from collections import namedtuple
import numpy as np
from numpy import sqrt 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import torch.optim.lr_scheduler as Scheduler


# Define a useful tuple (optional)
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

        
class Policy(nn.Module):
    """
        Implement both policy network and the value network in one model
        - Note that here we let the actor and value networks share the first layer
        - Feel free to change the architecture (e.g. number of hidden layers and the width of each hidden layer) as you like
        - Feel free to add any member variables/functions whenever needed
        TODO:
            1. Initialize the network (including the shared layer(s), the action layer(s), and the value layer(s)
            2. Random weight initialization of each layer
    """
    def __init__(self):
        super(Policy, self).__init__()
        
        # Extract the dimensionality of state and action spaces
        self.discrete = isinstance(env.action_space, gym.spaces.Discrete)
        self.observation_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n if self.discrete else env.action_space.shape[0]
        self.hidden_size = 128
#         print(self.observation_dim )
#         print(self.action_dim )
        ########## YOUR CODE HERE (5~10 lines) ##########
        self.s_layer1= nn.Linear(4, self.hidden_size)
        torch.nn.init.xavier_uniform_(self.s_layer1.weight)
                
        self.s_layer2 = nn.Linear(self.hidden_size, self.hidden_size)
        torch.nn.init.xavier_uniform_(self.s_layer2.weight)
        
        self.a_layer = nn.Linear(self.hidden_size, 2)
        torch.nn.init.xavier_uniform_(self.a_layer.weight)

        
        self.v_layer = nn.Linear(self.hidden_size, 1)
        torch.nn.init.xavier_uniform_(self.v_layer.weight)

        ########## END OF YOUR CODE ##########
        
        # action & reward memory
        self.saved_actions = []
        self.rewards = []

    def forward(self, state):
        """
            Forward pass of both policy and value networks
            - The input is the state, and the outputs are the corresponding 
              action probability distirbution and the state value
            TODO:
                1. Implement the forward pass for both the action and the state value
        """
        
        ########## YOUR CODE HERE (3~5 lines) ##########
        x =  self.s_layer1(state)
        x = F.relu(x)
        x =  self.s_layer2(x)
        x = F.relu(x)
        y=self.a_layer(x)
        action_prob = F.softmax(y, dim=-1)  #choses what action you want to take and return probability of each action
        state_value = self.v_layer(x)
         
        # return values are  tuple, action_prob:  the probability of each action over the action space
                                          #  state_value: the value from state s_t 

        ########## END OF YOUR CODE ##########

        return action_prob, state_value




    def select_action(self, state):
        """
            Select the action given the current state
            - The input is the state, and the output is the action to apply 
            (based on the learned stochastic policy)
            TODO:
                1. Implement the forward pass for both the action and the state value
        """
        
        ########## YOUR CODE HERE (3~5 lines) ##########
        state = torch.from_numpy(state).float().squeeze(0)
        action_prob, state_value = self.forward(state)
        m = Categorical(action_prob)  #create a distribution from action_prob
        action = m.sample()    #sample the action

        ########## END OF YOUR CODE ##########
        
        # save to action buffer
        self.saved_actions.append(SavedAction(m.log_prob(action), state_value))

        return action.item()    


    def calculate_loss(self, gamma=0.99):
        """
            Calculate the loss (= policy loss + value loss) to perform backprop later
            TODO:
                1. Calculate rewards-to-go required by REINFORCE with the help of self.rewards
                2. Calculate the policy loss using the policy gradient
                3. Calculate the value loss using either MSE loss or smooth L1 loss
        """
        
        # Initialize the lists and variables
        R = 0
        saved_actions = self.saved_actions
        policy_losses = [] 
        value_losses = [] 
        returns = []

        ########## YOUR CODE HERE (8-15 lines) ##########
        eps = np.finfo(np.float32).eps.item()

        for r in reversed(self.rewards):              # calculate discounted value
            R = r + gamma * R
            returns.insert(0, R)
        
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)  #standardlization for faster converge / eps: To increase the stability when calculating
        
        
        for (log_prob, state_value),R in zip(saved_actions , returns):
            policy_losses.append(R* -log_prob )
            
            value_losses.append( F.mse_loss(state_value, torch.tensor ([R]) ) ) # calcilate loss using MSE
            
        loss = torch.stack(policy_losses).sum()+torch.stack(value_losses).sum()  #sum up both policy_losses and value_losses

        ########## END OF YOUR CODE ##########
        
        return loss

    def clear_memory(self):
        # reset rewards and action buffer
        del self.rewards[:]
        del self.saved_actions[:]


def train(lr=0.01):
    '''
        Train the model using SGD (via backpropagation)
        TODO: In each episode, 
        1. run the policy till the end of the episode and keep the sampled trajectory
        2. update both the policy and the value network at the end of episode
    '''    
    
    # Instantiate the policy model and the optimizer
    model = Policy()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # Learning rate scheduler (optional)
    scheduler = Scheduler.StepLR(optimizer, step_size=100, gamma=0.9)
    
    # EWMA reward for tracking the learning progress
    ewma_reward = 0
    
    # run inifinitely many episodes
    for i_episode in count(1):
        # reset environment and episode reward
        state = env.reset()
        ep_reward = 0
        t = 0
        # Uncomment the following line to use learning rate scheduler
        #scheduler.step()
        
        # For each episode, only run 9999 steps so that we don't 
        # infinite loop while learning
        
        ########## YOUR CODE HERE (10-15 lines) ##########
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = model.select_action(state)       # select action
            s_state, reward, done, _ = env.step(action)

            model.rewards.append(reward)
            ep_reward += reward
            if done:
                break
            state = s_state

        # Backpropagation
        optimizer.zero_grad()     # to reset gradients
        policy_loss = model.calculate_loss()
        policy_loss.backward()
        optimizer.step()
        model.clear_memory()
        
        ########## END OF YOUR CODE ##########
            
        # update EWMA reward and log the results
        ewma_reward = 0.05 * ep_reward + (1 - 0.05) * ewma_reward
        print('Episode {}\tlength: {}\treward: {}\t ewma reward: {}'.format(i_episode, t, ep_reward, ewma_reward))

        # check if we have "solved" the cart pole problem
        if ewma_reward > env.spec.reward_threshold:
            torch.save(model.state_dict(), './preTrained/CartPole_{}.pth'.format(lr))
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(ewma_reward, t))
            break


def test(name, n_episodes=10):
    '''
        Test the learned model (no change needed)
    '''      
    model = Policy()
    
    model.load_state_dict(torch.load('./preTrained/{}'.format(name)))
    
    render = True
    max_episode_len = 10000
    
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        running_reward = 0
        for t in range(max_episode_len+1):
            action = model.select_action(state)
            state, reward, done, _ = env.step(action)
            running_reward += reward
            if render:
                 env.render()
            if done:
                break
        print('Episode {}\tReward: {}'.format(i_episode, running_reward))
    env.close()
    

if __name__ == '__main__':
    # For reproducibility, fix the random seed
    random_seed = 20  
    lr = 0.01
    env = gym.make("CartPole-v0")
    env.seed(random_seed)  
    torch.manual_seed(random_seed)  
    train(lr)
    test('CartPole_0.01.pth')




  logger.warn(


Episode 1	length: 10	reward: 10.0	 ewma reward: 0.5
Episode 2	length: 10	reward: 10.0	 ewma reward: 0.975
Episode 3	length: 30	reward: 30.0	 ewma reward: 2.42625
Episode 4	length: 14	reward: 14.0	 ewma reward: 3.0049375
Episode 5	length: 9	reward: 9.0	 ewma reward: 3.304690625
Episode 6	length: 14	reward: 14.0	 ewma reward: 3.83945609375
Episode 7	length: 15	reward: 15.0	 ewma reward: 4.3974832890624995
Episode 8	length: 11	reward: 11.0	 ewma reward: 4.7276091246093745
Episode 9	length: 14	reward: 14.0	 ewma reward: 5.191228668378906
Episode 10	length: 14	reward: 14.0	 ewma reward: 5.63166723495996
Episode 11	length: 19	reward: 19.0	 ewma reward: 6.300083873211962
Episode 12	length: 23	reward: 23.0	 ewma reward: 7.135079679551364
Episode 13	length: 19	reward: 19.0	 ewma reward: 7.728325695573796
Episode 14	length: 22	reward: 22.0	 ewma reward: 8.441909410795105
Episode 15	length: 31	reward: 31.0	 ewma reward: 9.56981394025535
Episode 16	length: 22	reward: 22.0	 ewma reward: 10.19132324

Episode 127	length: 70	reward: 70.0	 ewma reward: 42.12981548280884
Episode 128	length: 50	reward: 50.0	 ewma reward: 42.52332470866839
Episode 129	length: 32	reward: 32.0	 ewma reward: 41.99715847323497
Episode 130	length: 96	reward: 96.0	 ewma reward: 44.697300549573214
Episode 131	length: 12	reward: 12.0	 ewma reward: 43.06243552209455
Episode 132	length: 106	reward: 106.0	 ewma reward: 46.20931374598982
Episode 133	length: 83	reward: 83.0	 ewma reward: 48.048848058690325
Episode 134	length: 65	reward: 65.0	 ewma reward: 48.89640565575581
Episode 135	length: 89	reward: 89.0	 ewma reward: 50.901585372968015
Episode 136	length: 53	reward: 53.0	 ewma reward: 51.00650610431961
Episode 137	length: 46	reward: 46.0	 ewma reward: 50.756180799103625
Episode 138	length: 18	reward: 18.0	 ewma reward: 49.11837175914844
Episode 139	length: 28	reward: 28.0	 ewma reward: 48.06245317119101
Episode 140	length: 27	reward: 27.0	 ewma reward: 47.00933051263146
Episode 141	length: 18	reward: 18.0	 ewma 

Episode 247	length: 174	reward: 174.0	 ewma reward: 94.91645064938234
Episode 248	length: 150	reward: 150.0	 ewma reward: 97.67062811691322
Episode 249	length: 191	reward: 191.0	 ewma reward: 102.33709671106756
Episode 250	length: 179	reward: 179.0	 ewma reward: 106.17024187551418
Episode 251	length: 140	reward: 140.0	 ewma reward: 107.86172978173846
Episode 252	length: 134	reward: 134.0	 ewma reward: 109.16864329265154
Episode 253	length: 13	reward: 13.0	 ewma reward: 104.36021112801896
Episode 254	length: 110	reward: 110.0	 ewma reward: 104.642200571618
Episode 255	length: 147	reward: 147.0	 ewma reward: 106.7600905430371
Episode 256	length: 98	reward: 98.0	 ewma reward: 106.32208601588525
Episode 257	length: 108	reward: 108.0	 ewma reward: 106.40598171509099
Episode 258	length: 200	reward: 200.0	 ewma reward: 111.08568262933643
Episode 259	length: 200	reward: 200.0	 ewma reward: 115.5313984978696
Episode 260	length: 131	reward: 131.0	 ewma reward: 116.30482857297612
Episode 261	leng

Episode 364	length: 200	reward: 200.0	 ewma reward: 165.35232466853273
Episode 365	length: 200	reward: 200.0	 ewma reward: 167.08470843510608
Episode 366	length: 165	reward: 165.0	 ewma reward: 166.98047301335077
Episode 367	length: 160	reward: 160.0	 ewma reward: 166.63144936268324
Episode 368	length: 200	reward: 200.0	 ewma reward: 168.29987689454907
Episode 369	length: 167	reward: 167.0	 ewma reward: 168.2348830498216
Episode 370	length: 127	reward: 127.0	 ewma reward: 166.17313889733052
Episode 371	length: 169	reward: 169.0	 ewma reward: 166.31448195246398
Episode 372	length: 100	reward: 100.0	 ewma reward: 162.99875785484076
Episode 373	length: 55	reward: 55.0	 ewma reward: 157.5988199620987
Episode 374	length: 126	reward: 126.0	 ewma reward: 156.01887896399379
Episode 375	length: 133	reward: 133.0	 ewma reward: 154.8679350157941
Episode 376	length: 185	reward: 185.0	 ewma reward: 156.3745382650044
Episode 377	length: 155	reward: 155.0	 ewma reward: 156.30581135175416
Episode 378	

Episode 483	length: 91	reward: 91.0	 ewma reward: 124.19330376280864
Episode 484	length: 107	reward: 107.0	 ewma reward: 123.3336385746682
Episode 485	length: 134	reward: 134.0	 ewma reward: 123.86695664593479
Episode 486	length: 110	reward: 110.0	 ewma reward: 123.17360881363804
Episode 487	length: 92	reward: 92.0	 ewma reward: 121.61492837295613
Episode 488	length: 20	reward: 20.0	 ewma reward: 116.53418195430832
Episode 489	length: 89	reward: 89.0	 ewma reward: 115.1574728565929
Episode 490	length: 82	reward: 82.0	 ewma reward: 113.49959921376325
Episode 491	length: 72	reward: 72.0	 ewma reward: 111.42461925307508
Episode 492	length: 80	reward: 80.0	 ewma reward: 109.85338829042132
Episode 493	length: 20	reward: 20.0	 ewma reward: 105.36071887590025
Episode 494	length: 88	reward: 88.0	 ewma reward: 104.49268293210524
Episode 495	length: 37	reward: 37.0	 ewma reward: 101.11804878549997
Episode 496	length: 14	reward: 14.0	 ewma reward: 96.76214634622497
Episode 497	length: 71	reward: 

Episode 602	length: 126	reward: 126.0	 ewma reward: 129.89343409304507
Episode 603	length: 112	reward: 112.0	 ewma reward: 128.9987623883928
Episode 604	length: 120	reward: 120.0	 ewma reward: 128.54882426897316
Episode 605	length: 100	reward: 100.0	 ewma reward: 127.1213830555245
Episode 606	length: 29	reward: 29.0	 ewma reward: 122.21531390274828
Episode 607	length: 107	reward: 107.0	 ewma reward: 121.45454820761086
Episode 608	length: 106	reward: 106.0	 ewma reward: 120.6818207972303
Episode 609	length: 112	reward: 112.0	 ewma reward: 120.24772975736877
Episode 610	length: 27	reward: 27.0	 ewma reward: 115.58534326950033
Episode 611	length: 113	reward: 113.0	 ewma reward: 115.45607610602531
Episode 612	length: 100	reward: 100.0	 ewma reward: 114.68327230072404
Episode 613	length: 103	reward: 103.0	 ewma reward: 114.09910868568784
Episode 614	length: 107	reward: 107.0	 ewma reward: 113.74415325140343
Episode 615	length: 104	reward: 104.0	 ewma reward: 113.25694558883326
Episode 616	l

Episode 722	length: 81	reward: 81.0	 ewma reward: 75.89410883680738
Episode 723	length: 83	reward: 83.0	 ewma reward: 76.249403394967
Episode 724	length: 75	reward: 75.0	 ewma reward: 76.18693322521865
Episode 725	length: 63	reward: 63.0	 ewma reward: 75.52758656395773
Episode 726	length: 60	reward: 60.0	 ewma reward: 74.75120723575984
Episode 727	length: 71	reward: 71.0	 ewma reward: 74.56364687397183
Episode 728	length: 66	reward: 66.0	 ewma reward: 74.13546453027324
Episode 729	length: 66	reward: 66.0	 ewma reward: 73.72869130375958
Episode 730	length: 64	reward: 64.0	 ewma reward: 73.2422567385716
Episode 731	length: 70	reward: 70.0	 ewma reward: 73.08014390164301
Episode 732	length: 71	reward: 71.0	 ewma reward: 72.97613670656085
Episode 733	length: 71	reward: 71.0	 ewma reward: 72.8773298712328
Episode 734	length: 71	reward: 71.0	 ewma reward: 72.78346337767115
Episode 735	length: 70	reward: 70.0	 ewma reward: 72.6442902087876
Episode 736	length: 63	reward: 63.0	 ewma reward: 72.

Episode 843	length: 56	reward: 56.0	 ewma reward: 86.3701254965434
Episode 844	length: 67	reward: 67.0	 ewma reward: 85.40161922171622
Episode 845	length: 50	reward: 50.0	 ewma reward: 83.6315382606304
Episode 846	length: 55	reward: 55.0	 ewma reward: 82.19996134759889
Episode 847	length: 56	reward: 56.0	 ewma reward: 80.88996328021894
Episode 848	length: 59	reward: 59.0	 ewma reward: 79.79546511620799
Episode 849	length: 42	reward: 42.0	 ewma reward: 77.90569186039758
Episode 850	length: 53	reward: 53.0	 ewma reward: 76.6604072673777
Episode 851	length: 38	reward: 38.0	 ewma reward: 74.72738690400882
Episode 852	length: 40	reward: 40.0	 ewma reward: 72.99101755880838
Episode 853	length: 41	reward: 41.0	 ewma reward: 71.39146668086795
Episode 854	length: 33	reward: 33.0	 ewma reward: 69.47189334682456
Episode 855	length: 51	reward: 51.0	 ewma reward: 68.54829867948332
Episode 856	length: 35	reward: 35.0	 ewma reward: 66.87088374550916
Episode 857	length: 38	reward: 38.0	 ewma reward: 6

Episode 964	length: 50	reward: 50.0	 ewma reward: 52.920257330207754
Episode 965	length: 36	reward: 36.0	 ewma reward: 52.07424446369736
Episode 966	length: 46	reward: 46.0	 ewma reward: 51.77053224051249
Episode 967	length: 44	reward: 44.0	 ewma reward: 51.382005628486866
Episode 968	length: 43	reward: 43.0	 ewma reward: 50.96290534706252
Episode 969	length: 43	reward: 43.0	 ewma reward: 50.56476007970939
Episode 970	length: 68	reward: 68.0	 ewma reward: 51.436522075723914
Episode 971	length: 57	reward: 57.0	 ewma reward: 51.714695971937715
Episode 972	length: 60	reward: 60.0	 ewma reward: 52.12896117334083
Episode 973	length: 57	reward: 57.0	 ewma reward: 52.37251311467379
Episode 974	length: 43	reward: 43.0	 ewma reward: 51.90388745894009
Episode 975	length: 41	reward: 41.0	 ewma reward: 51.35869308599308
Episode 976	length: 43	reward: 43.0	 ewma reward: 50.940758431693425
Episode 977	length: 63	reward: 63.0	 ewma reward: 51.54372051010875
Episode 978	length: 59	reward: 59.0	 ewma r

Episode 1086	length: 42	reward: 42.0	 ewma reward: 52.903308844578454
Episode 1087	length: 41	reward: 41.0	 ewma reward: 52.30814340234953
Episode 1088	length: 57	reward: 57.0	 ewma reward: 52.54273623223205
Episode 1089	length: 42	reward: 42.0	 ewma reward: 52.01559942062045
Episode 1090	length: 49	reward: 49.0	 ewma reward: 51.864819449589426
Episode 1091	length: 59	reward: 59.0	 ewma reward: 52.22157847710996
Episode 1092	length: 65	reward: 65.0	 ewma reward: 52.860499553254456
Episode 1093	length: 114	reward: 114.0	 ewma reward: 55.91747457559173
Episode 1094	length: 38	reward: 38.0	 ewma reward: 55.02160084681214
Episode 1095	length: 46	reward: 46.0	 ewma reward: 54.57052080447153
Episode 1096	length: 37	reward: 37.0	 ewma reward: 53.69199476424795
Episode 1097	length: 21	reward: 21.0	 ewma reward: 52.05739502603555
Episode 1098	length: 52	reward: 52.0	 ewma reward: 52.05452527473377
Episode 1099	length: 45	reward: 45.0	 ewma reward: 51.70179901099708
Episode 1100	length: 30	rewar

Episode 1206	length: 86	reward: 86.0	 ewma reward: 95.99032064087555
Episode 1207	length: 126	reward: 126.0	 ewma reward: 97.49080460883177
Episode 1208	length: 200	reward: 200.0	 ewma reward: 102.61626437839017
Episode 1209	length: 71	reward: 71.0	 ewma reward: 101.03545115947065
Episode 1210	length: 134	reward: 134.0	 ewma reward: 102.68367860149712
Episode 1211	length: 122	reward: 122.0	 ewma reward: 103.64949467142226
Episode 1212	length: 91	reward: 91.0	 ewma reward: 103.01701993785115
Episode 1213	length: 33	reward: 33.0	 ewma reward: 99.5161689409586
Episode 1214	length: 153	reward: 153.0	 ewma reward: 102.19036049391066
Episode 1215	length: 71	reward: 71.0	 ewma reward: 100.63084246921512
Episode 1216	length: 200	reward: 200.0	 ewma reward: 105.59930034575436
Episode 1217	length: 200	reward: 200.0	 ewma reward: 110.31933532846664
Episode 1218	length: 200	reward: 200.0	 ewma reward: 114.8033685620433
Episode 1219	length: 95	reward: 95.0	 ewma reward: 113.81320013394114
Episode 1

Episode 1322	length: 200	reward: 200.0	 ewma reward: 135.74613186960545
Episode 1323	length: 200	reward: 200.0	 ewma reward: 138.95882527612517
Episode 1324	length: 200	reward: 200.0	 ewma reward: 142.0108840123189
Episode 1325	length: 149	reward: 149.0	 ewma reward: 142.36033981170294
Episode 1326	length: 156	reward: 156.0	 ewma reward: 143.0423228211178
Episode 1327	length: 200	reward: 200.0	 ewma reward: 145.8902066800619
Episode 1328	length: 200	reward: 200.0	 ewma reward: 148.59569634605882
Episode 1329	length: 187	reward: 187.0	 ewma reward: 150.51591152875585
Episode 1330	length: 200	reward: 200.0	 ewma reward: 152.99011595231806
Episode 1331	length: 200	reward: 200.0	 ewma reward: 155.34061015470215
Episode 1332	length: 178	reward: 178.0	 ewma reward: 156.47357964696704
Episode 1333	length: 165	reward: 165.0	 ewma reward: 156.89990066461868
Episode 1334	length: 200	reward: 200.0	 ewma reward: 159.05490563138775
Episode 1335	length: 200	reward: 200.0	 ewma reward: 161.1021603498

Episode 1438	length: 151	reward: 151.0	 ewma reward: 104.90644883414791
Episode 1439	length: 180	reward: 180.0	 ewma reward: 108.66112639244051
Episode 1440	length: 183	reward: 183.0	 ewma reward: 112.37807007281849
Episode 1441	length: 169	reward: 169.0	 ewma reward: 115.20916656917755
Episode 1442	length: 24	reward: 24.0	 ewma reward: 110.64870824071868
Episode 1443	length: 175	reward: 175.0	 ewma reward: 113.86627282868274
Episode 1444	length: 20	reward: 20.0	 ewma reward: 109.1729591872486
Episode 1445	length: 156	reward: 156.0	 ewma reward: 111.51431122788615
Episode 1446	length: 184	reward: 184.0	 ewma reward: 115.13859566649184
Episode 1447	length: 141	reward: 141.0	 ewma reward: 116.43166588316724
Episode 1448	length: 15	reward: 15.0	 ewma reward: 111.36008258900887
Episode 1449	length: 44	reward: 44.0	 ewma reward: 107.99207845955843
Episode 1450	length: 13	reward: 13.0	 ewma reward: 103.2424745365805
Episode 1451	length: 18	reward: 18.0	 ewma reward: 98.98035080975149
Episode

Episode 1557	length: 44	reward: 44.0	 ewma reward: 40.44101674012137
Episode 1558	length: 25	reward: 25.0	 ewma reward: 39.6689659031153
Episode 1559	length: 16	reward: 16.0	 ewma reward: 38.48551760795953
Episode 1560	length: 37	reward: 37.0	 ewma reward: 38.41124172756155
Episode 1561	length: 32	reward: 32.0	 ewma reward: 38.09067964118348
Episode 1562	length: 31	reward: 31.0	 ewma reward: 37.7361456591243
Episode 1563	length: 33	reward: 33.0	 ewma reward: 37.499338376168076
Episode 1564	length: 55	reward: 55.0	 ewma reward: 38.37437145735967
Episode 1565	length: 30	reward: 30.0	 ewma reward: 37.955652884491684
Episode 1566	length: 56	reward: 56.0	 ewma reward: 38.8578702402671
Episode 1567	length: 41	reward: 41.0	 ewma reward: 38.96497672825374
Episode 1568	length: 65	reward: 65.0	 ewma reward: 40.26672789184105
Episode 1569	length: 70	reward: 70.0	 ewma reward: 41.753391497249
Episode 1570	length: 12	reward: 12.0	 ewma reward: 40.26572192238655
Episode 1571	length: 68	reward: 68.0	

Episode 1676	length: 75	reward: 75.0	 ewma reward: 77.87737929734355
Episode 1677	length: 97	reward: 97.0	 ewma reward: 78.83351033247637
Episode 1678	length: 100	reward: 100.0	 ewma reward: 79.89183481585255
Episode 1679	length: 51	reward: 51.0	 ewma reward: 78.44724307505992
Episode 1680	length: 48	reward: 48.0	 ewma reward: 76.92488092130692
Episode 1681	length: 78	reward: 78.0	 ewma reward: 76.97863687524158
Episode 1682	length: 84	reward: 84.0	 ewma reward: 77.32970503147949
Episode 1683	length: 50	reward: 50.0	 ewma reward: 75.96321977990551
Episode 1684	length: 51	reward: 51.0	 ewma reward: 74.71505879091023
Episode 1685	length: 46	reward: 46.0	 ewma reward: 73.27930585136471
Episode 1686	length: 106	reward: 106.0	 ewma reward: 74.91534055879647
Episode 1687	length: 51	reward: 51.0	 ewma reward: 73.71957353085665
Episode 1688	length: 69	reward: 69.0	 ewma reward: 73.48359485431382
Episode 1689	length: 61	reward: 61.0	 ewma reward: 72.85941511159812
Episode 1690	length: 67	reward

Episode 1794	length: 45	reward: 45.0	 ewma reward: 50.9982569385909
Episode 1795	length: 49	reward: 49.0	 ewma reward: 50.89834409166136
Episode 1796	length: 48	reward: 48.0	 ewma reward: 50.753426887078284
Episode 1797	length: 45	reward: 45.0	 ewma reward: 50.46575554272437
Episode 1798	length: 50	reward: 50.0	 ewma reward: 50.44246776558815
Episode 1799	length: 78	reward: 78.0	 ewma reward: 51.820344377308736
Episode 1800	length: 45	reward: 45.0	 ewma reward: 51.4793271584433
Episode 1801	length: 45	reward: 45.0	 ewma reward: 51.15536080052113
Episode 1802	length: 36	reward: 36.0	 ewma reward: 50.397592760495066
Episode 1803	length: 50	reward: 50.0	 ewma reward: 50.37771312247031
Episode 1804	length: 69	reward: 69.0	 ewma reward: 51.3088274663468
Episode 1805	length: 56	reward: 56.0	 ewma reward: 51.543386093029454
Episode 1806	length: 98	reward: 98.0	 ewma reward: 53.866216788377976
Episode 1807	length: 59	reward: 59.0	 ewma reward: 54.12290594895908
Episode 1808	length: 69	reward: 

Episode 1910	length: 200	reward: 200.0	 ewma reward: 171.21205461649427
Episode 1911	length: 200	reward: 200.0	 ewma reward: 172.65145188566956
Episode 1912	length: 200	reward: 200.0	 ewma reward: 174.01887929138607
Episode 1913	length: 200	reward: 200.0	 ewma reward: 175.31793532681675
Episode 1914	length: 200	reward: 200.0	 ewma reward: 176.5520385604759
Episode 1915	length: 200	reward: 200.0	 ewma reward: 177.7244366324521
Episode 1916	length: 200	reward: 200.0	 ewma reward: 178.8382148008295
Episode 1917	length: 200	reward: 200.0	 ewma reward: 179.896304060788
Episode 1918	length: 200	reward: 200.0	 ewma reward: 180.9014888577486
Episode 1919	length: 200	reward: 200.0	 ewma reward: 181.85641441486118
Episode 1920	length: 200	reward: 200.0	 ewma reward: 182.7635936941181
Episode 1921	length: 200	reward: 200.0	 ewma reward: 183.6254140094122
Episode 1922	length: 200	reward: 200.0	 ewma reward: 184.4441433089416
Episode 1923	length: 66	reward: 66.0	 ewma reward: 178.52193614349451
Epi

Episode 2025	length: 191	reward: 191.0	 ewma reward: 161.6904991826717
Episode 2026	length: 200	reward: 200.0	 ewma reward: 163.60597422353808
Episode 2027	length: 198	reward: 198.0	 ewma reward: 165.3256755123612
Episode 2028	length: 200	reward: 200.0	 ewma reward: 167.05939173674312
Episode 2029	length: 200	reward: 200.0	 ewma reward: 168.70642214990596
Episode 2030	length: 200	reward: 200.0	 ewma reward: 170.27110104241066
Episode 2031	length: 200	reward: 200.0	 ewma reward: 171.7575459902901
Episode 2032	length: 200	reward: 200.0	 ewma reward: 173.1696686907756
Episode 2033	length: 200	reward: 200.0	 ewma reward: 174.51118525623681
Episode 2034	length: 200	reward: 200.0	 ewma reward: 175.78562599342496
Episode 2035	length: 200	reward: 200.0	 ewma reward: 176.9963446937537
Episode 2036	length: 200	reward: 200.0	 ewma reward: 178.14652745906602
Episode 2037	length: 172	reward: 172.0	 ewma reward: 177.8392010861127
Episode 2038	length: 200	reward: 200.0	 ewma reward: 178.9472410318070

Episode 2141	length: 122	reward: 122.0	 ewma reward: 91.82164479315584
Episode 2142	length: 13	reward: 13.0	 ewma reward: 87.88056255349805
Episode 2143	length: 14	reward: 14.0	 ewma reward: 84.18653442582315
Episode 2144	length: 132	reward: 132.0	 ewma reward: 86.57720770453199
Episode 2145	length: 14	reward: 14.0	 ewma reward: 82.94834731930538
Episode 2146	length: 25	reward: 25.0	 ewma reward: 80.05092995334012
Episode 2147	length: 23	reward: 23.0	 ewma reward: 77.19838345567311
Episode 2148	length: 13	reward: 13.0	 ewma reward: 73.98846428288945
Episode 2149	length: 14	reward: 14.0	 ewma reward: 70.98904106874498
Episode 2150	length: 17	reward: 17.0	 ewma reward: 68.28958901530773
Episode 2151	length: 133	reward: 133.0	 ewma reward: 71.52510956454235
Episode 2152	length: 132	reward: 132.0	 ewma reward: 74.54885408631522
Episode 2153	length: 14	reward: 14.0	 ewma reward: 71.52141138199946
Episode 2154	length: 120	reward: 120.0	 ewma reward: 73.94534081289947
Episode 2155	length: 124

Episode 2257	length: 200	reward: 200.0	 ewma reward: 194.97701712843843
Episode 2258	length: 200	reward: 200.0	 ewma reward: 195.2281662720165
Solved! Running reward is now 195.2281662720165 and the last episode runs to 200 time steps!
Episode 1	Reward: 200.0
Episode 2	Reward: 200.0
Episode 3	Reward: 18.0
Episode 4	Reward: 200.0
Episode 5	Reward: 17.0
Episode 6	Reward: 200.0
Episode 7	Reward: 200.0
Episode 8	Reward: 200.0
Episode 9	Reward: 200.0
Episode 10	Reward: 200.0
