In [243]:
import random
import gym
import numpy as np
from collections import deque


from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
#from scores.score_logger import ScoreLogger
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

ENV_NAME = "CartPole-v1"

GAMMA = 0.95
LEARNING_RATE = 0.001

MEMORY_SIZE = 100_000
RANDOM_RUNS = 10_000
BATCH_SIZE = 500

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.05
EXPLORATION_DECAY = 0.98

In [260]:
class LightQSolver:

    def __init__(self, observation_space, action_space):
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.best_memory = deque(maxlen=MEMORY_SIZE)

        self.model = MultiOutputRegressor(LGBMRegressor(n_estimators=100, n_jobs=-1))
        self.isFit = False

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            # print("Random")
            return random.randrange(self.action_space)
        if self.isFit == True:
            # print("Predict")
            q_values = self.model.predict(state)
        else:
            q_values = np.zeros(self.action_space).reshape(1, -1)
        return np.argmax(q_values[0])

    def experience_replay(self, steps):
        if len(self.memory) >= RANDOM_RUNS and len(steps) % 1000:
                print("Best memory", len(self.memory))
                self.best_memory.remember(np.array(self.memory)[np.argsort(steps)[-100:]])

        if len(self.best_memory) < BATCH_SIZE:
            return
            
        batch = random.sample(self.best_memory, BATCH_SIZE)
        print(batch)
        X = np.empty((BATCH_SIZE, 4))
        y = np.empty((BATCH_SIZE,2))
        for i, (state, action, reward, state_next, terminal) in enumerate(batch):
            # print(reward)
            q_update = reward
            if not terminal:
                if self.isFit:
                    q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
                    # print(self.model.predict(state_next))
                else:
                    q_update = reward
            if self.isFit:
                q_values = self.model.predict(state)
            else:
                q_values = np.zeros(self.action_space).reshape(1, -1)
                
            q_values[0][action] = q_update

            X[i] = state[0]
            y[i] = q_values[0]
            
        # print(reward)
        print(" Training ".center(80, '*'))
        # print(X)
        # print(y)
        self.model.fit(X, y)
        self.isFit = True
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
    
class XGBQSolver(LightQSolver):
    def __init__(self, observation_space, action_space):
        super().__init__(observation_space, action_space)
        self.model = XGBRegressor(n_estimators=100)

In [278]:
best_params = []
best_index = []
all_data = []
max_len = 10
for i in range(100):
    r = random.randint(0, 100)
    best_params.append(r)
    all_data.append(r)
    if len(best_params) > max_len:
       
        best_params =  sorted(best_params)[-10:]
        best_index = np.argsort(best_params)[-10:]

best_params, best_index

([87, 88, 89, 89, 92, 92, 95, 96, 98, 100],
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64))

In [276]:
# reimplement main
sorted(best_params)

[12, 14, 29, 34, 36, 41, 48, 72, 80, 93]

In [None]:
steps = []
for i in range(1000):
    env.reset()
    action = 0
    prev_action = 0
    step = 0
    while True:
        state_next, reward, terminal, info = env.step(action)
        step+=1
        #print(state_next, action)
        if action == 0:
            action=1
        else:
            action=0

        # check position
        if state_next[0] < -0.02 and state_next [2]>0.1 and state_next[1]<-0.9:
            action=1
        elif state_next[0] > 0.02 and state_next [2]<-0.1:
            action=0
        
        if terminal:
            print(state_next, action)
            break

    steps.append(step)

np.mean(steps), np.sort(steps)[-10:]

In [None]:
steps_rand = []
X = {}
y = []
for i in range(10000):
    env.reset()
    action = 0
    step = 0
    X[i] = []

    while True:
        state_next, reward, terminal, info = env.step(action)
        step+=1
        action = random.randrange(2)
        
        if terminal:
            break

        X[i].append(list(state_next))
    steps_rand.append(step)
    
np.mean(steps_rand), np.max(steps_rand), np.sort(steps_rand)[-100:]

In [None]:
[X[i] for marg in list(np.argsort(steps_rand)[-100:])]

## XGB MultiOutput regressor

In [None]:
# xgboost test
import argparse
from typing import Dict, Tuple, List

import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

def plot_predt(y: np.ndarray, y_predt: np.ndarray, name: str) -> None:
    s = 25
    plt.scatter(y[:, 0], y[:, 1], c="navy", s=s, edgecolor="black", label="data")
    plt.scatter(
        y_predt[:, 0], y_predt[:, 1], c="cornflowerblue", s=s, edgecolor="black", label='prediction'
    )
    plt.xlim([-1, 2])
    plt.ylim([-1, 2])
    plt.legend()
    plt.show()


def gen_circle() -> Tuple[np.ndarray, np.ndarray]:
    "Generate a sample dataset that y is a 2 dim circle."
    rng = np.random.RandomState(1994)
    X = 200 * rng.rand(10000, 1) - 100
    y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
    y[::5, :] += 0.5 - rng.rand(10000//5, 2)
    y = y - y.min()
    y = y / y.max()
    return X, y


def rmse_model(plot_result: bool):
    """Draw a circle with 2-dim coordinate as target variables."""
    X, y = gen_circle()
    # Train a regressor on it
    reg = xgb.XGBRegressor(tree_method="hist", n_estimators=128)

    batch_size = 1000
    first_fit = True
    for b in range(len(X)//batch_size): #batch size
        Xp, yp = X[b*batch_size:(b+1)*batch_size],  y[b*batch_size:(b+1)*batch_size]
        if first_fit:
            reg.fit(Xp, yp)
            first_fit=False
        else:
            reg = reg.fit(Xp, yp, xgb_model=reg.get_booster())
            y_predt = reg.predict(Xp)
            print(mean_absolute_error(yp, y_predt))
            # plt.plot(Xp)
            # plt.show()
            # plot_predt(yp, y_predt, 'multi')

    #plot_predt(yp, yp, "multi")
    y_predt = reg.predict(X)
    print(mean_absolute_error(y, y_predt))
    if plot_result:
        plot_predt(y, y_predt, "multi")


def rmse_model_batch(plot_result: bool):
    """Draw a circle with 2-dim coordinate as target variables."""
    X, y = gen_circle()
    # Train a regressor on it
    reg = xgb.XGBRegressor(tree_method="hist", n_estimators=64)
    reg.fit(X, y,)# eval_set=[(X, y)])

    y_predt = reg.predict(X)
    print(mean_absolute_error(y, y_predt))

    if plot_result:
        plot_predt(y, y_predt, "multi")

In [None]:
rmse_model(True)
rmse_model_batch(True)