In [None]:
import numpy as np
import pandas as pd
import enum
import sys
from pylab import plt
%matplotlib inline

import gym
import gym.spaces
from gym.utils import seeding
from gym.envs.registration import EnvSpec

import talib

sys.path.append("..") 
from data_provider import  read_prices, read_data
from tp_config import *

In [None]:
# --- Data selection ---------------------------------------------------------
ticket = 'BTC-USDT'
tf = '1h'
#tf = '5m'
from_date = '2020-01-01'

prices = read_data(ticket, tf, from_date)

# --- Indicator settings -----------------------------------------------------
lead_win = 12
lag_win = 26
# Quantile levels used as bin edges; 4 edges split every feature into 5 bins.
QUANTILE_LEVELS = [0.2, 0.45, 0.55, 0.8]


def _quantile_bins(values, levels=QUANTILE_LEVELS):
    """Count, for every element, how many sample quantiles it strictly exceeds.

    :param values: 1-D array-like of feature values.
    :param levels: quantile levels whose values act as bin edges.
    :return: integer array with entries in ``0 .. len(levels)``.
    """
    values = np.asarray(values)
    edges = np.quantile(values, levels)
    # Broadcast (N, 1) against (len(levels),) to compare every value with
    # every edge at once instead of looping in Python.
    return (values[:, None] > edges).sum(axis=1)


fast_ma = talib.MA(prices["C"].values, lead_win)
slow_ma = talib.MA(prices["C"].values, lag_win)
volume = prices["V"].values
volume_ma = talib.MA(volume, lag_win)

# Drop the first `lag_win` bars from every series so they stay aligned
# (presumably the moving-average warm-up period - confirm against talib.MA).
fast_ma = fast_ma[lag_win:]
slow_ma = slow_ma[lag_win:]
volume = volume[lag_win:]
volume_ma = volume_ma[lag_win:]
c_price = prices["C"].values[lag_win:]
times = prices["T"].values[lag_win:]

# Relative spread between fast and slow MA, shifted so bins run from -2 to 2.
ma_bins = _quantile_bins(fast_ma / slow_ma - 1) - 2
# Volume relative to its moving average, bins run from 0 to 4.
volume_bins = _quantile_bins(volume / volume_ma - 1)

df_states = pd.DataFrame({
    'T': times,
    'price': c_price,
    'ma': ma_bins,
    'volume': volume_bins,
})

# Trim the raw price table the same way so it has one row per state row.
prices = prices.iloc[lag_win:].reset_index(drop=True)

In [None]:
@enum.unique
class Actions(enum.Enum):
    """Discrete actions accepted by the trading environment."""

    Skip = 0   # leave the current position untouched
    Buy = 1    # open a position when none is open
    Close = 2  # close the position when one is open


class State:
    """Cursor over a table of discretised market states plus position bookkeeping.

    ``states`` is a DataFrame that must provide the columns ``price``, ``ma``
    and ``volume``; rows are addressed by position, never by index label.
    """

    # The data-preparation cell bins each feature with four quantile edges,
    # which yields five distinct bin values per feature.
    _MA_BINS = 5
    _VOLUME_BINS = 5
    _POSITION_STATES = 2

    def __init__(self, states, bars_count, commission_perc, reward_on_close=True):
        """
        :param states: DataFrame with ``price``, ``ma`` and ``volume`` columns
        :param bars_count: positive int, minimum number of bars before the cursor
        :param commission_perc: commission rate multiplied directly by the price
        :param reward_on_close: if True the price move is rewarded when the
            position is closed, otherwise bar by bar while it is open
        """
        assert isinstance(bars_count, int)
        assert bars_count > 0

        self.bars_count = bars_count
        self.commission_perc = commission_perc
        self.reward_on_close = reward_on_close
        self.reset(states, bars_count)

    def reset(self, states, offset):
        """Point the cursor at row ``offset`` of ``states`` and clear the position."""
        assert offset >= self.bars_count - 1
        self.equity = 0
        self.market_position = False
        self.open_price = 0.0
        self._states = states
        self._offset = offset

    @property
    def space_type(self):
        """Gym space declared for observations (ma bin, volume bin, position flag)."""
        return gym.spaces.Box(low=-2, high=4, shape = [3], dtype = np.int32)

    @property
    def n(self):
        """Number of distinct encoded states: ma bins x volume bins x position flag.

        The previous value ``4*4*2`` under-counted: four quantile edges give
        five bins per feature (ma in -2..2, volume in 0..4).
        """
        return self._MA_BINS * self._VOLUME_BINS * self._POSITION_STATES

    def encode(self):
        """Return the current state as the string ``"<ma> <volume> <in_position>"``."""
        row = self._states.iloc[self._offset]
        res = [row['ma'], row['volume'], self.market_position]
        return ' '.join(map(str, res))

    def step(self, action):
        """
        Apply ``action`` on the current bar, then advance the cursor by one bar.

        :param action: member of Actions
        :return: (reward, done); ``done`` is True once the cursor reaches the
            last row of the state table
        """
        assert isinstance(action, Actions)
        reward = 0.0
        close = self._cur_close()
        if action == Actions.Buy and not self.market_position:
            self.market_position = True
            self.open_price = close
            # Commission is charged on entry ...
            reward -= self.open_price * self.commission_perc
        elif action == Actions.Close and self.market_position:
            self.market_position = False
            # ... and again on exit.
            reward -= close * self.commission_perc
            if self.reward_on_close:
                reward += close - self.open_price
            self.open_price = 0.0

        self._offset += 1
        prev_close = close
        close = self._cur_close()
        done = self._offset >= self._states.shape[0] - 1

        # Bar-by-bar reward mode: credit the price change while in position.
        if self.market_position and not self.reward_on_close:
            reward += close - prev_close

        return reward, done

    def _cur_close(self):
        """
        Return the close price of the bar under the cursor.

        The lookup is positional (``.iloc``), consistent with ``encode``, so it
        also works when the state table does not carry a default 0..N-1 index.
        """
        return self._states['price'].iloc[self._offset]

class StocksEnv(gym.Env):
    """Gym environment that walks a State cursor over a table of market states.

    Observations are the strings produced by ``State.encode``; actions are the
    integer values of the ``Actions`` enum.
    """
    metadata = {'render.modes': ['human']}
    spec = EnvSpec("StocksEnv-v0")

    def __init__(self, bars_count, commission, prices, states, random_ofs_on_reset, reward_on_close=True):
        """
        :param bars_count: minimum number of bars before the starting offset
        :param commission: commission rate handed to State
        :param prices: price table; must have as many rows as ``states``
        :param states: DataFrame of discretised states consumed by State
        :param random_ofs_on_reset: start each episode at a random offset if True
        :param reward_on_close: forwarded to State (reward on close vs. per bar)
        """
        assert prices.shape[0] == states.shape[0]
        self._prices = prices
        self._states = states
        self.action_space = gym.spaces.Discrete(n=len(Actions))
        # Forward reward_on_close; it used to be accepted here but silently
        # dropped, so State always ran with its default.
        self._state = State(states, bars_count, commission, reward_on_close=reward_on_close)
        self.observation_space = self._state.space_type
        self.n_states = self._state.n
        self.random_ofs_on_reset = random_ofs_on_reset
        self.seed()

    def reset(self):
        """Choose the starting offset, reset the state and return its encoding."""
        bars = self._state.bars_count
        if self.random_ofs_on_reset:
            # Draw from [bars, n_rows - 2]: State.step reads the bar *after*
            # the current one, so an episode must never start on the last row.
            offset = self.np_random.choice(
                self._states.shape[0] - bars - 1) + bars
        else:
            offset = bars
        self._state.reset(self._states, offset)
        return self._state.encode()

    def step(self, action_idx):
        """Apply the action with integer value ``action_idx``.

        :return: (observation, reward, done, info) where ``info["offset"]`` is
            the cursor position after the step
        """
        action = Actions(action_idx)
        reward, done = self._state.step(action)
        obs = self._state.encode()
        info = {
            "offset": self._state._offset
        }
        return obs, reward, done, info

    def render(self, mode='human', close=False):
        """No-op; this environment has nothing to render."""
        pass

    def close(self):
        """No-op; this environment holds no resources."""
        pass

    def seed(self, seed=None):
        """(Re)create the environment's random generator and return the seeds used."""
        self.np_random, seed1 = seeding.np_random(seed)
        seed2 = seeding.hash_seed(seed1 + 1) % 2 ** 31
        return [seed1, seed2]

In [160]:
#!/usr/bin/env python3
import gym
import collections
from tensorboardX import SummaryWriter

# Discount factor applied to the successor state's value in Agent.calc_action_value.
GAMMA = 0.9
# Number of evaluation episodes averaged per iteration of the training loop.
TEST_EPISODES = 20


class Agent:
    """Tabular agent that estimates rewards and transitions from experience
    and derives state values from those estimates."""

    def __init__(self, env):
        """Bind the exploration environment, reset it and create empty tables."""
        self.env = env
        self.state = self.env.reset()
        # (state, action, next_state) -> most recently observed reward
        self.rewards = collections.defaultdict(float)
        # (state, action) -> Counter of observed next states
        self.transits = collections.defaultdict(collections.Counter)
        # state -> current value estimate
        self.values = collections.defaultdict(float)

    def play_n_random_steps(self, count):
        """Take ``count`` sampled actions in ``self.env`` and record what happened."""
        for _step in range(count):
            chosen = self.env.action_space.sample()
            next_state, reward, finished, _info = self.env.step(chosen)
            origin = self.state
            self.rewards[(origin, chosen, next_state)] = reward
            self.transits[(origin, chosen)][next_state] += 1
            if finished:
                self.state = self.env.reset()
            else:
                self.state = next_state

    def calc_action_value(self, state, action):
        """Return the expected discounted value of taking ``action`` in ``state``.

        Next states are weighted by their observed frequency; a pair that was
        never observed evaluates to 0.0.
        """
        successors = self.transits[(state, action)]
        n_visits = sum(successors.values())
        expected = 0.0
        for next_state, hits in successors.items():
            immediate = self.rewards[(state, action, next_state)]
            outcome = immediate + GAMMA * self.values[next_state]
            expected += (hits / n_visits) * outcome
        return expected

    def select_action(self, state):
        """Return the action with the highest value; the lowest index wins ties."""
        return max(
            range(self.env.action_space.n),
            key=lambda action: self.calc_action_value(state, action),
            default=None,
        )

    def play_episode(self, env):
        """Play one greedy episode in ``env``, updating the tables on the way.

        :return: sum of the rewards collected during the episode
        """
        episode_return = 0.0
        current = env.reset()
        finished = False
        while not finished:
            chosen = self.select_action(current)
            following, reward, finished, _info = env.step(chosen)
            self.rewards[(current, chosen, following)] = reward
            self.transits[(current, chosen)][following] += 1
            episode_return += reward
            current = following
        return episode_return

    def value_iteration(self):
        """Refresh ``self.values`` for the state of every key in ``self.transits``."""
        # Snapshot the keys first: calc_action_value adds entries to
        # self.transits for (state, action) pairs it has not seen yet.
        seen_states = [key[0] for key in self.transits]
        for state in seen_states:
            self.values[state] = max(
                self.calc_action_value(state, action)
                for action in range(self.env.action_space.n)
            )

In [163]:
# Settings shared by the exploration and the evaluation environment.
bars_count = 1
commission = 0.00075

# Exploration environment: the agent samples actions here to fill its tables.
env = StocksEnv(
    bars_count=bars_count,
    commission=commission,
    prices=prices,
    states=df_states,
    random_ofs_on_reset=False,
)
agent = Agent(env)

# Separate environment instance used only for greedy evaluation episodes.
test_env = StocksEnv(
    bars_count=bars_count,
    commission=commission,
    prices=prices,
    states=df_states,
    random_ofs_on_reset=False,
)

writer = SummaryWriter(comment="-v-iteration")

# Training-loop counters.
iter_no, best_reward = 0, 0.0

In [None]:
# Mean evaluation reward at which training stops.
REWARD_TARGET = 800

# The loop has no iteration cap and is normally stopped by reaching the target
# or by interrupting the kernel; try/finally guarantees the TensorBoard writer
# is closed in either case instead of only on the "solved" path.
try:
    while True:
        iter_no += 1
        # Explore with sampled actions, then refresh the value table.
        agent.play_n_random_steps(100)
        agent.value_iteration()

        # Evaluate the greedy policy; `reward` is a running total inside the
        # loop and becomes the mean after the division below.
        reward = 0.0
        for _ in range(TEST_EPISODES):
            reward += agent.play_episode(test_env)
            print(reward)
        reward /= TEST_EPISODES
        writer.add_scalar("reward", reward, iter_no)
        if reward > best_reward:
            print("Best reward updated %.3f -> %.3f" % (
                best_reward, reward))
            best_reward = reward
        if reward > REWARD_TARGET:
            print("Solved in %d iterations!" % iter_no)
            break
finally:
    writer.close()

In [None]:
# Reset the environment; as the cell's last expression, the value returned by
# reset() is displayed.
env.reset()

In [None]:
# Rebind `env` to Gym's FrozenLake environment for the Q-learning cells below.
# NOTE: this replaces the StocksEnv instance previously bound to `env`.
env = gym.make('FrozenLake-v0')
env.reset()

In [None]:
# Draw one sample from the observation space to inspect what observations look like.
env.observation_space.sample()

In [None]:
# Q-table: one row per observation, one column per action, all zeros to start.
Q = np.zeros((env.observation_space.n, env.action_space.n))

# Learning parameters: `lr` scales each Q update, `y` discounts the best
# next-state value.
lr, y = 0.8, 0.95
num_episodes = 2000

# Total reward per episode, appended to by the training loop.
rList = list()

In [None]:
# Take a single step to inspect the (observation, reward, done, info) tuple.
# The action is sampled here because `a` is only assigned later, inside the
# Q-learning loop, so `env.step(a)` raises NameError on a fresh kernel.
env.step(env.action_space.sample())


In [None]:
for i in range(num_episodes):
    # Start a new episode from the environment's initial observation.
    s = env.reset()
    rAll = 0
    d = False
    # At most 99 steps per episode; `j` ends up as the number of steps taken.
    for j in range(1, 100):
        # Greedy choice over the Q row plus Gaussian noise that shrinks as
        # the episode index grows.
        noise = np.random.randn(1, env.action_space.n) * (1. / (i + 1))
        a = np.argmax(Q[s, :] + noise)
        # Apply the action and observe the outcome.
        s1, r, d, _ = env.step(a)
        # Move Q[s, a] towards reward + discounted best next-state value.
        target = r + y * np.max(Q[s1, :])
        Q[s, a] = Q[s, a] + lr * (target - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)

In [None]:
# Display the total reward collected in each training episode.
rList