In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from PIL import Image

import torchvision
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


In [2]:
# Build the CartPole-v1 environment once; Random_games() below reads this
# notebook-level `env` as a global.
env = gym.make("CartPole-v1")

In [3]:
def Random_games(n_episodes=10, max_steps=500, environment=None):
    """Play episodes with uniformly random actions and print every transition.

    Each step renders the environment, samples an action from its action
    space, applies it, and prints the step index together with the returned
    observation, reward, done flag, info dict and the action taken. An
    episode stops early as soon as the environment reports ``done``.

    Args:
        n_episodes: Number of episodes to play. Defaults to 10.
        max_steps: Upper bound on steps per episode. Defaults to 500.
        environment: Object exposing ``reset()``, ``render()``,
            ``action_space.sample()`` and a ``step(action)`` that returns a
            4-tuple ``(next_state, reward, done, info)``. When ``None``, the
            notebook-level ``env`` is used, so a bare ``Random_games()`` call
            behaves as before.

    Returns:
        None. All results are reported through ``print``.
    """
    # Resolve the global lazily so callers that pass their own environment
    # do not need a notebook-level `env` to exist.
    sim = env if environment is None else environment

    for episode in range(n_episodes):
        sim.reset()
        for t in range(max_steps):

            # Draw the current frame.
            sim.render()

            # Choose a random action (0 = push left, 1 = push right).
            action = sim.action_space.sample()

            # Apply the action and receive the resulting observation.
            next_state, reward, done, info = sim.step(action)

            print(f'''t : {episode}-{t}\nnext_state : {next_state}\nreward : {reward}\ndone : {done}\ninfo : {info}\naction : {action}''')
            print()
            if done:
                break

### 에피소드 종료
에피소드는 다음 중 하나를 만족하면 종료된다.
1. 막대 각도가 ±12°를 초과
2. 카트 위치가 ±2.4를 초과(카트 중앙이 디스플레이 가장자리에 도달)
3. 에피소드 길이가 500을 초과(v0의 경우 200)

### 각 항목들의 의미(공식문서)  

#### next_state
 길이 4의 배열 [카트 위치, 카트 속도, 막대 각도(rad), 막대 각속도]를 의미

#### reward
매 step 마다 1의 보상

#### done
종료 조건을 만족하여 에피소드가 끝났음(game over)을 의미

`done = bool(`  
`            x < -self.x_threshold`  
`            or x > self.x_threshold`  
`            or theta < -self.theta_threshold_radians`  
`            or theta > self.theta_threshold_radians`  
`        )`  

#### action
1은 cart를 오른쪽으로, 0은 왼쪽으로 움직임을 의미

    | Num | Action                 |
    |-----|------------------------|
    | 0   | Push cart to the left  |
    | 1   | Push cart to the right |
    
    

In [4]:
# Run the random-action demo; every step's transition is printed below.
Random_games()

t : 0-0
next_state : [ 0.02741178  0.16979925 -0.01789352 -0.34326407]
reward : 1.0
done : False
info : {}
action : 1

t : 0-1
next_state : [ 0.03080776  0.36517113 -0.0247588  -0.6415354 ]
reward : 1.0
done : False
info : {}
action : 1

t : 0-2
next_state : [ 0.03811118  0.17040291 -0.03758951 -0.35675094]
reward : 1.0
done : False
info : {}
action : 0

t : 0-3
next_state : [ 0.04151924 -0.02416501 -0.04472453 -0.07615392]
reward : 1.0
done : False
info : {}
action : 0

t : 0-4
next_state : [ 0.04103594  0.17156862 -0.04624761 -0.3826054 ]
reward : 1.0
done : False
info : {}
action : 1

t : 0-5
next_state : [ 0.04446732  0.36731568 -0.05389972 -0.68950397]
reward : 1.0
done : False
info : {}
action : 1

t : 0-6
next_state : [ 0.05181363  0.56314254 -0.06768979 -0.9986566 ]
reward : 1.0
done : False
info : {}
action : 1

t : 0-7
next_state : [ 0.06307648  0.3689876  -0.08766293 -0.7279767 ]
reward : 1.0
done : False
info : {}
action : 0

t : 0-8
next_state : [ 0.07045623  0.565205   -0

t : 3-12
next_state : [ 0.0911726  0.5612933 -0.111091  -0.8759094]
reward : 1.0
done : False
info : {}
action : 1

t : 3-13
next_state : [ 0.10239846  0.3678425  -0.1286092  -0.6201155 ]
reward : 1.0
done : False
info : {}
action : 0

t : 3-14
next_state : [ 0.10975531  0.5645038  -0.1410115  -0.9503781 ]
reward : 1.0
done : False
info : {}
action : 1

t : 3-15
next_state : [ 0.12104539  0.3715325  -0.16001907 -0.7051144 ]
reward : 1.0
done : False
info : {}
action : 0

t : 3-16
next_state : [ 0.12847604  0.56846726 -0.17412135 -1.0435877 ]
reward : 1.0
done : False
info : {}
action : 1

t : 3-17
next_state : [ 0.13984539  0.7654196  -0.19499311 -1.3854824 ]
reward : 1.0
done : False
info : {}
action : 1

t : 3-18
next_state : [ 0.15515378  0.573189   -0.22270276 -1.1595663 ]
reward : 1.0
done : True
info : {}
action : 0

t : 4-0
next_state : [-0.0444962  -0.16960007 -0.03689789  0.23838653]
reward : 1.0
done : False
info : {}
action : 0

t : 4-1
next_state : [-0.0478882   0.02602905 

t : 6-22
next_state : [-0.03952244  0.17526628  0.19901237  0.31254235]
reward : 1.0
done : False
info : {}
action : 1

t : 6-23
next_state : [-0.03601712 -0.02205187  0.20526321  0.6608019 ]
reward : 1.0
done : False
info : {}
action : 0

t : 6-24
next_state : [-0.03645815  0.1697122   0.21847925  0.4391187 ]
reward : 1.0
done : True
info : {}
action : 1

t : 7-0
next_state : [ 0.00105308  0.1542101  -0.00138507 -0.32871974]
reward : 1.0
done : False
info : {}
action : 1

t : 7-1
next_state : [ 0.00413728 -0.0408921  -0.00795947 -0.03647393]
reward : 1.0
done : False
info : {}
action : 0

t : 7-2
next_state : [ 0.00331944  0.15434308 -0.00868895 -0.33165747]
reward : 1.0
done : False
info : {}
action : 1

t : 7-3
next_state : [ 0.0064063  -0.04065412 -0.01532209 -0.04172724]
reward : 1.0
done : False
info : {}
action : 0

t : 7-4
next_state : [ 0.00559322 -0.23555304 -0.01615664  0.2460823 ]
reward : 1.0
done : False
info : {}
action : 0

t : 7-5
next_state : [ 0.00088216 -0.43044055 

t : 9-36
next_state : [ 0.12299116  0.26434946 -0.20465903 -0.8661512 ]
reward : 1.0
done : False
info : {}
action : 0

t : 9-37
next_state : [ 0.12827815  0.46158043 -0.22198205 -1.215574  ]
reward : 1.0
done : True
info : {}
action : 1



[실제 영상](https://www.youtube.com/watch?v=5Q14EjnOJZc)