# 1. Import Dependencies

In [None]:
#!pip install -q stable-baselines3[extra]

In [None]:
import pickle
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
from stable_baselines3 import PPO
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 3. Building an Environment

In [None]:
# Load the pickled observations and gold labels for the first model.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('data/X_DM.pickle', 'rb') as X_file:
    model1_X = pickle.load(X_file)
with open('data/y_DM.pickle', 'rb') as y_file:
    model1_y = pickle.load(y_file)

# Only the first half of the samples is used for model 1; it is split 2:1
# into train and test portions with a fixed seed.
half = len(model1_X) // 2
train1_X, test1_X, train1_y, test1_y = train_test_split(
    model1_X[:half],
    model1_y[:half],
    test_size=1/3,
    random_state=42,
)

In [None]:
class ActionEnv(Env):
    """One-step environment that rewards choosing a sample's gold action.

    Every episode is a single decision: ``reset`` serves the next sample as
    the observation, ``step`` pays +1 if the chosen action equals that
    sample's gold label and -1 otherwise, and the episode ends immediately.

    The position in the data set is tracked per instance, so loops elsewhere
    in the notebook cannot push the environment out of range.

    Args:
        features: 2-D array with one observation per row. Defaults to the
            module-level ``train1_X``.
        labels: Gold action for each row of ``features``. Defaults to the
            module-level ``train1_y``.

    Raises:
        ValueError: If the data is empty or the two arrays differ in length.
    """

    # 15 pentomino pieces + "do nothing" + "lang team"
    N_ACTIONS = 15 + 1 + 1

    def __init__(self, features=None, labels=None):
        super().__init__()

        # Defaults are resolved at construction time (not at class-definition
        # time) so the environment uses whatever the data cell last produced.
        self._features = train1_X if features is None else features
        self._labels = train1_y if labels is None else labels
        if len(self._features) == 0:
            raise ValueError("ActionEnv needs at least one sample")
        if len(self._features) != len(self._labels):
            raise ValueError(
                "features and labels must have the same length "
                f"({len(self._features)} != {len(self._labels)})"
            )

        # One discrete action per candidate (see N_ACTIONS).
        self.action_space = Discrete(self.N_ACTIONS)

        # Observations are declared as flat vectors with entries in [0, 1],
        # one entry per feature column.
        self.observation_space = Box(low=0, high=1, shape=(self._features.shape[1],))

        # Index of the sample that the next reset() will serve.
        self._cursor = 0
        self.state = self._features[self._cursor]
        self.p_gold = self._labels[self._cursor]

    def step(self, action):
        """Score ``action`` against the current gold label and end the episode.

        The action does not influence the observation; the current sample is
        returned unchanged.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``reward`` is
            1 for a correct action and -1 otherwise, ``done`` is always
            ``True`` and ``info`` is an empty dict.
        """
        reward = 1 if action == self.p_gold else -1
        return self.state, reward, True, {}

    def render(self, mode="human"):
        """No visualisation is implemented."""
        pass

    def reset(self):
        """Serve the next sample, wrapping around at the end of the data.

        Returns:
            The observation for the new episode.
        """
        self.state = self._features[self._cursor]
        self.p_gold = self._labels[self._cursor]
        # The modulo keeps the cursor valid however many episodes are run.
        self._cursor = (self._cursor + 1) % len(self._features)
        return self.state

# 5. Train Model

In [None]:
# Reset the notebook-level index `i` to 0 before the environment is built.
i = 0

# Train a PPO agent with an MLP policy for `half` environment steps.
model1 = PPO(
    "MlpPolicy",
    ActionEnv(),
    verbose=1,
)
model1.learn(total_timesteps=half)

# Evaluation

In [None]:
# Reset the notebook-level index `i` to 0 before evaluating / reloading.
i = 0
# Optional: uncomment to restore a previously saved policy instead of
# using the one trained above.
#model1 = PPO.load('RL_action_model', ActionEnv())

In [None]:
# Accuracy of model1 on the held-out split.
# The rows are iterated directly instead of via `for i in range(...)`, so the
# notebook-level name `i` is not overwritten by this loop.
correct = 0
for obs, p_gold in tqdm(zip(test1_X, test1_y), total=len(test1_X)):
    # deterministic=True scores the policy's preferred action; the default
    # samples from the action distribution, which makes the reported
    # accuracy vary between runs.
    pred, _ = model1.predict(obs, deterministic=True)
    if pred == p_gold:
        correct += 1
accuracy = correct / len(test1_X)
print(accuracy)

In [None]:
# model1.save('RL_action_model')

# Model 2

In [None]:
# Targets for the second model, built from the second half of the data:
# 1 = model1 picked the wrong action for this sample, 0 = model1 was right.
# The loop does not use `i` as its variable, so the notebook-level index `i`
# is not left pointing past the end of the training arrays.
model2_y = np.ones((half,))
second_half = zip(model1_X[half:2 * half], model1_y[half:2 * half])
for row, (obs, p_gold) in enumerate(second_half):
    # deterministic=True makes each label a reproducible function of the
    # observation instead of depending on a sampled action.
    pred, _ = model1.predict(obs, deterministic=True)
    if pred == p_gold:
        model2_y[row] = 0

In [None]:
# model2_y holds exactly `half` labels, so the features are sliced to the same
# [half, 2*half) window. An open-ended `[half:]` slice has one extra row when
# the data set has an odd number of samples, and train_test_split rejects
# inputs of different lengths.
train2_X, test2_X, train2_y, test2_y = train_test_split(
    model1_X[half:2 * half],
    model2_y,
    test_size=1/3,
    random_state=42,
)

In [None]:
class UncertaintyEnv(Env):
    """One-step environment that rewards predicting a sample's binary label.

    Every episode is a single decision: ``reset`` serves the next sample as
    the observation, ``step`` pays +1 if the chosen action (0 or 1) equals
    that sample's label and -1 otherwise, and the episode ends immediately.
    With the default data the label is 0 where model1's prediction matched
    the gold action and 1 where it did not.

    The position in the data set is tracked per instance, so loops elsewhere
    in the notebook cannot push the environment out of range.

    Args:
        features: 2-D array with one observation per row. Defaults to the
            module-level ``train2_X``.
        labels: Binary target for each row of ``features``. Defaults to the
            module-level ``train2_y``.

    Raises:
        ValueError: If the data is empty or the two arrays differ in length.
    """

    def __init__(self, features=None, labels=None):
        super().__init__()

        # Defaults are resolved at construction time (not at class-definition
        # time) so the environment uses whatever the data cell last produced.
        self._features = train2_X if features is None else features
        self._labels = train2_y if labels is None else labels
        if len(self._features) == 0:
            raise ValueError("UncertaintyEnv needs at least one sample")
        if len(self._features) != len(self._labels):
            raise ValueError(
                "features and labels must have the same length "
                f"({len(self._features)} != {len(self._labels)})"
            )

        # Two actions: 0 or 1, compared directly against the label.
        self.action_space = Discrete(2)

        # Observations are declared as flat vectors with entries in [0, 1],
        # one entry per feature column.
        self.observation_space = Box(low=0, high=1, shape=(self._features.shape[1],))

        # Index of the sample that the next reset() will serve.
        self._cursor = 0
        self.state = self._features[self._cursor]
        self.p_gold = self._labels[self._cursor]

    def step(self, action):
        """Score ``action`` against the current label and end the episode.

        The action does not influence the observation; the current sample is
        returned unchanged.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``reward`` is
            1 for a correct action and -1 otherwise, ``done`` is always
            ``True`` and ``info`` is an empty dict.
        """
        reward = 1 if action == self.p_gold else -1
        return self.state, reward, True, {}

    def render(self, mode="human"):
        """No visualisation is implemented."""
        pass

    def reset(self):
        """Serve the next sample, wrapping around at the end of the data.

        Returns:
            The observation for the new episode.
        """
        self.state = self._features[self._cursor]
        self.p_gold = self._labels[self._cursor]
        # The modulo keeps the cursor valid however many episodes are run.
        self._cursor = (self._cursor + 1) % len(self._features)
        return self.state

In [None]:
# Reset the notebook-level index `i` to 0 before the environment is built.
i = 0

# Train a second PPO agent with an MLP policy for `half` environment steps.
model2 = PPO(
    "MlpPolicy",
    UncertaintyEnv(),
    verbose=1,
)
model2.learn(total_timesteps=half)

In [None]:
# Reset the notebook-level index `i` to 0 before evaluating / reloading.
i = 0
# Optional: uncomment to restore a previously saved policy instead of
# using the one trained above.
#model2 = PPO.load('RL_uncertainty_model', UncertaintyEnv())

In [None]:
# Accuracy of model2 on its held-out split.
# The rows are iterated directly instead of via `for i in range(...)`, so the
# notebook-level name `i` is not overwritten by this loop.
correct = 0
for obs, p_gold in tqdm(zip(test2_X, test2_y), total=len(test2_X)):
    # deterministic=True scores the policy's preferred action; the default
    # samples from the action distribution, which makes the reported
    # accuracy vary between runs.
    pred, _ = model2.predict(obs, deterministic=True)
    if pred == p_gold:
        correct += 1
accuracy2 = correct / len(test2_X)
print(accuracy2)

In [None]:
# Save the trained model2 under the name 'RL_uncertainty_model' (the same
# name the commented-out PPO.load call above expects).
model2.save('RL_uncertainty_model')