In [None]:
import gym

# Common imports
import numpy as np
import random
import os
import collections

# to make this notebook's output stable across runs
SEED = 42
np.random.seed(SEED)
random.seed(SEED)


env = gym.make('CartPole-v1')
# Gym action spaces carry their own RNG, separate from NumPy's global one;
# seed it too so env.action_space.sample() is repeatable across runs.
env.action_space.seed(SEED)

# Best random sampling




In [15]:
#obs = []
def get_random_actions(env, seed):
    """Play one episode with actions sampled from ``env.action_space``.

    The environment is seeded with ``seed`` and reset, then stepped with
    sampled actions until ``env.step`` reports termination.

    Returns:
        The list of actions taken, in order (including the terminating one).
    """
    env.seed(seed)
    env.reset()

    taken = []
    done = False
    while not done:
        choice = env.action_space.sample()
        # Only the termination flag (third item) of the step result is needed.
        _, _, done, _ = env.step(choice)
        taken.append(choice)

    return taken


def explore_actions(env, change_node, actions, seed=None):
    """Replay a prefix of ``actions`` and finish the episode with random actions.

    The environment is re-seeded and reset, ``actions[:-change_node]`` is
    replayed, and from there on actions are sampled from ``env.action_space``
    until the environment reports termination.

    Args:
        env: Environment exposing ``seed``, ``reset``, ``action_space.sample``
            and ``step`` returning a 4-tuple whose third item is the
            termination flag.
        change_node: Number of trailing actions to drop before exploring;
            expected to be >= 1.
        actions: Previously recorded action sequence.
        seed: Seed to re-seed the environment with before the replay. When
            omitted, the notebook-global ``seed`` is used, which is what this
            function relied on before the parameter existed.

    Returns:
        The actions taken in the new episode: the replayed prefix followed by
        the randomly sampled continuation.
    """
    if seed is None:
        # Backward-compatible fallback for callers that still rely on the
        # module-level ``seed`` variable.
        seed = globals()["seed"]

    env.seed(seed)
    env.reset()

    new_actions = []
    # Initialised up front so an empty prefix falls through to a purely
    # random rollout instead of raising UnboundLocalError.
    terminated = False
    for action in actions[:-change_node]:
        _, _, terminated, _ = env.step(action)
        new_actions.append(action)
        if terminated:
            break

    # Continue from the end of the prefix with sampled actions.
    while not terminated:
        action = env.action_space.sample()
        _, _, terminated, _ = env.step(action)
        new_actions.append(action)

    return new_actions


def find_best_actions(env, actions, seed=None, n_rounds=30, max_steps=500):
    """Greedily lengthen an episode by re-rolling its tail.

    In each round, every cut point ``change_node`` in ``1 .. len(actions) - 1``
    is tried with ``explore_actions``; the longest result that beats the
    current sequence becomes the starting point of the next round.

    Args:
        env: Environment passed through to ``explore_actions``.
        actions: Initial action sequence to improve.
        seed: Seed forwarded to ``explore_actions`` (``None`` keeps its
            fallback to the notebook-global ``seed``).
        n_rounds: Maximum number of improvement rounds.
        max_steps: Episode length at which the search stops, since no longer
            episode is being looked for.

    Returns:
        The longest action sequence found, or ``actions`` itself when no
        improvement was found.
    """
    for _ in range(n_rounds):
        if len(actions) >= max_steps:
            break

        best_actions = []
        for change_node in range(1, len(actions)):
            new_actions = explore_actions(env, change_node, actions, seed=seed)
            if len(new_actions) > max(len(actions), len(best_actions)):
                best_actions = new_actions
                if len(best_actions) >= max_steps:
                    # Target reached: skip the remaining cut points and rounds.
                    return best_actions

        if best_actions:
            actions = best_actions

    return actions


def get_obs(env, actions_result):
    """Replay stored action sequences and collect the observations acted on.

    Args:
        env: Environment exposing ``seed``, ``reset`` and ``step``.
        actions_result: Mapping from environment seed to the action sequence
            recorded for that seed.

    Returns:
        A list with one array per seed. Each array holds the observation that
        was current *before* each replayed action, so entry ``i`` pairs with
        action ``i``.
    """
    trajectories = []
    for episode_seed, episode_actions in actions_result.items():
        env.seed(episode_seed)
        current = env.reset()

        visited = []
        for act in episode_actions:
            # Record the state first, then advance the environment.
            visited.append(current)
            current, _, done, _ = env.step(act)
            if done:
                break

        trajectories.append(np.array(visited))

    return trajectories



In [16]:
# seed = 10
# actions = get_random_actions(env, seed)
# best_actions = find_best_actions(env, actions)

In [17]:
# Best action sequence found for each environment seed, keyed by that seed.
res = {}
seeds = [0, 1, 5, 10, 21, 42, 47, 63, 84, 100, 121, 144]
# NOTE: the loop variable must stay named `seed` -- explore_actions() reads
# the notebook-global `seed` when it re-seeds the environment for a replay.
for seed in seeds:
    # Start from one random episode, then try to lengthen it.
    actions = get_random_actions(env, seed)
    best_actions = find_best_actions(env, actions)
    print(f"[{seed}] Actions length improvement:",len(actions), "->", len(best_actions))
    res[seed] = best_actions

[0] Actions length improvement: 10 -> 240
[1] Actions length improvement: 12 -> 338
[5] Actions length improvement: 17 -> 152
[10] Actions length improvement: 25 -> 500
[21] Actions length improvement: 16 -> 500
[42] Actions length improvement: 21 -> 463
[47] Actions length improvement: 19 -> 388
[63] Actions length improvement: 17 -> 185
[84] Actions length improvement: 19 -> 500
[100] Actions length improvement: 16 -> 213
[121] Actions length improvement: 18 -> 500
[144] Actions length improvement: 14 -> 500


In [18]:
# Replay every stored action sequence and stack the per-seed observation
# arrays row-wise into one array.
obs = get_obs(env, res)
array_obs = np.concatenate(obs, axis=0)
array_obs.shape

(4479, 4)

In [19]:
# Flatten the per-seed action lists, in the same seed order as `res`, into one array.
array_actions = np.concatenate([np.array(seq) for seq in res.values()])

# Plot data

In [None]:
import matplotlib.pyplot as plt

# Scatter observation column 2 (labelled theta) against column 3 (labelled
# omega), coloured by the action recorded for each observation.
color = array_actions
theta = array_obs[:, 2]
speed = array_obs[:, 3]

fig, ax = plt.subplots(figsize=(7, 3), dpi=200)
points = ax.scatter(theta, speed, c=color, s=1)
# Reference lines at theta = +/-0.032 and omega = 0, spanning the data range.
ax.vlines([0.032, -0.032], ymin=min(speed), ymax=max(speed))
ax.hlines([0], xmin=min(theta), xmax=max(theta))
ax.set_xlabel(r'$\theta$')
ax.set_ylabel('omega')
fig.colorbar(points, ax=ax);

# Train LightGBM

In [21]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Fit a classifier that maps an observation to the action recorded for it.
lbc = LGBMClassifier(n_estimators=100).fit(array_obs, array_actions)

# Accuracy on the very data the model was fitted on (no hold-out split here).
train_predictions = lbc.predict(array_obs)
accuracy_score(array_actions, train_predictions)

0.8070997990622907

In [22]:
# Importance scores of the fitted classifier, one per observation column.
lbc.feature_importances_

array([791, 597, 805, 807])

# Test model on environment
It sucked when I saved *next_obs* and not the actual obs; `get_obs` now stores the observation seen before each action.

In [23]:
# Use the fitted classifier as the policy on a range of seeds and record how
# many steps each episode lasts.
steps = []
for seed in range(1, 200, 5):
    env.seed(seed)
    ob = env.reset()

    step = 0
    terminated = False
    while not terminated:
        # predict() takes a 2-D batch, so the observation is reshaped to one row.
        action = lbc.predict(ob.reshape(1, -1))[0]
        ob, reward, terminated, info = env.step(action)
        step += 1

    steps.append(step)
    print("Seed:", seed, '->',step)

1 : 500
6 : 500
11 : 500
16 : 500
21 : 500
26 : 500
31 : 500
36 : 500
41 : 500
46 : 500
51 : 500
56 : 500
61 : 500
66 : 500
71 : 500
76 : 500
81 : 500
86 : 500
91 : 500
96 : 500
101 : 500
106 : 500
111 : 500
116 : 500
121 : 500
126 : 500
131 : 500
136 : 500
141 : 500
146 : 500
151 : 500
156 : 500
161 : 500
166 : 500
171 : 500
176 : 500
181 : 500
186 : 500
191 : 500
196 : 500


In [24]:
# Median, mean, minimum and maximum of the episode lengths collected in `steps`.
np.median(steps), np.mean(steps), np.min(steps), np.max(steps)

(500.0, 500.0, 500, 500)