# 1. Import Dependencies

In [1]:
#!pip install -q stable-baselines3[extra]

In [1]:
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
from stable_baselines3 import PPO
import pandas as pd
from sklearn.model_selection import train_test_split

# 3. Building an Environment

In [2]:
# --- Load the raw model inputs and the gold labels -------------------------
X_df = pd.read_json('data/X_DM.json', orient='index')
# The label file is transposed after loading so that each row is one label entry.
y_df = pd.DataFrame(pd.read_json('data/y_DM.json').values.T)

# Only the two confidence columns are used as features.
df_new = X_df.drop(columns=['Instruction', 'Coordinate'])

# Each confidence cell holds a list; expand the lists into separate columns.
instr_df = pd.DataFrame(df_new['Instruction Confidence'].to_list())
coord_df = pd.DataFrame(df_new['Coordinate Confidence'].to_list())
# 'Coordinate Confidence' has two components, each of which is itself a list.
coord_df0 = pd.DataFrame(coord_df[0].to_list())
coord_df1 = pd.DataFrame(coord_df[1].to_list())

# --- Column names: 3 instruction columns + 2 x no_puzzlepieces columns -----
no_puzzlepieces = 15
a = [f'Instr{k}' for k in range(3)]
b = [f'LV{k}' for k in range(no_puzzlepieces)]
c = [f'G{k}' for k in range(no_puzzlepieces)]
col_names = [*a, *b, *c]

final_df_noisy = pd.concat([instr_df, coord_df0, coord_df1], axis=1)
final_df_noisy.columns = col_names

# --- Plain NumPy arrays for features and labels -----------------------------
model1_X = final_df_noisy.values
model1_y = y_df.values

# Model 1 only sees the first 100000 rows; the rows after that are used
# further down in the notebook for model 2.
train1_X, test1_X, train1_y, test1_y = train_test_split(
    model1_X[:100000],
    model1_y[:100000],
    test_size=1/3,
    random_state=42,
)

In [3]:
class ActionEnv(Env):
    """Single-step environment for choosing one of 17 actions from one data row.

    Every episode shows one row of ``train1_X`` as the observation and ends
    after a single ``step``.  The reward is +1 when the chosen action equals
    the matching ``train1_y`` entry and -1 otherwise.  Rows are served in
    order, wrapping around to the first row after the last one.

    The position in the training data is stored on the instance
    (``self._cursor``) instead of in the notebook-global ``i``.  This lets the
    environment be constructed on a fresh kernel and keeps it independent of
    cells that reuse ``i`` as a loop variable.
    """

    def __init__(self):
        # 15 pentomino pieces + "do nothing" + "lang team"
        no_pieces = 15 + 1 + 1

        # ACTIONS
        # For now exactly one option is picked per episode, so the number of
        # actions equals the number of options.  (Planned for later: choose
        # among absolute and moving coordinates, plus a boolean expressing
        # uncertainty.)
        self.action_space = Discrete(no_pieces)

        # OBSERVATIONS
        # One value per feature column of train1_X, declared to lie in [0, 1].
        self.observation_space = Box(low=0, high=1, shape=(train1_X.shape[1],))

        # START STATE
        # Each episode has a single state taken straight from the training
        # data.  (Idea noted for later: generate random scenarios instead of
        # replaying recorded rows.)
        self._cursor = 0
        self.state = train1_X[self._cursor]
        self.p_gold = train1_y[self._cursor]

    def step(self, action):
        """Score ``action`` against the current gold label and end the episode.

        The action does not change the state: there is only the start state.

        Returns:
            The usual ``(observation, reward, done, info)`` tuple, where the
            observation is the unchanged current state, the reward is 1 for a
            match with ``self.p_gold`` and -1 otherwise, ``done`` is always
            ``True`` and ``info`` is an empty dict.
        """
        reward = 1 if action == self.p_gold else -1

        # An episode is exactly one decision, so it is over after every step.
        done = True
        info = {}

        return self.state, reward, done, info

    def render(self):
        """Visualisation is not implemented."""
        pass

    def reset(self):
        """Load the row at the cursor as the new episode and advance the cursor.

        Returns:
            The observation (one row of ``train1_X``) for the new episode.
        """
        self.state = train1_X[self._cursor]
        self.p_gold = train1_y[self._cursor]
        # Move on to the next row, starting over once the data is exhausted.
        self._cursor = (self._cursor + 1) % len(train1_X)

        return self.state

# 5. Train Model

In [5]:

# Start from a known value of the notebook-global `i` (same as the Model 2
# training cell does), so this cell does not rely on whatever value other
# cells happened to leave in it -- or on it existing at all on a fresh kernel.
i = 0
model1 = PPO("MlpPolicy", ActionEnv(), verbose=1)
model1.learn(total_timesteps=200000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.98    |
| time/              |          |
|    fps             | 1407     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -0.72       |
| time/                   |             |
|    fps                  | 987         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.101924926 |
|    clip_fraction        | 0.233       |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x2a93af0e850>

In [15]:
#model1.save('RL_action_model')

# Evaluation

In [6]:
# Replace the in-memory model with the one stored under 'RL_action_model'.
# The cell that writes this file is commented out in this notebook, so the
# file has to exist already.
i = 0
action_env = ActionEnv()
model1 = PPO.load('RL_action_model', env=action_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [7]:
# Accuracy of model 1 on its held-out split.
# The rows are iterated directly instead of through an index named `i`, so
# the notebook-global `i` (used elsewhere as a row cursor) is left untouched.
# NOTE(review): predict() is called with its defaults, which in
# stable-baselines3 means deterministic=False (actions are sampled), so the
# printed accuracy can differ between runs -- confirm this is intended.
correct = 0
for obs, p_gold in zip(test1_X, test1_y):
    pred, _ = model1.predict(obs)
    if pred == p_gold:
        correct += 1
accuracy = correct / len(test1_X)
print(accuracy)

0.8139737205255895


# Model 2

In [9]:
# Targets for model 2, computed on the rows model 1 was not trained on
# (everything from row 100000 onwards): 1 where model 1's prediction is
# wrong, 0 where it matches the gold label.
# The label array is sized from the data instead of a fixed 100000, so it
# always lines up with `model1_X[100000:]`, which the next cell splits.
# The loop does not use `i`, leaving the notebook-global row cursor untouched.
holdout_X = model1_X[100000:]
holdout_y = model1_y[100000:]
model2_y = np.ones((len(holdout_X),))
for row, (obs, p_gold) in enumerate(zip(holdout_X, holdout_y)):
    pred, _ = model1.predict(obs)
    if pred == p_gold:
        model2_y[row] = 0

In [10]:
# Train/test split for model 2: features are the rows from 100000 onwards,
# targets are the labels in model2_y; same ratio and seed as for model 1.
train2_X, test2_X, train2_y, test2_y = train_test_split(
    model1_X[100000:],
    model2_y,
    test_size=1/3,
    random_state=42,
)

In [11]:
class UncertaintyEnv(Env):
    """Single-step environment with a binary action per data row.

    Every episode shows one row of ``train2_X`` as the observation and ends
    after a single ``step``.  The reward is +1 when the chosen action (0 or 1)
    equals the matching ``train2_y`` entry and -1 otherwise.  Rows are served
    in order, wrapping around to the first row after the last one.

    The position in the training data is stored on the instance
    (``self._cursor``) instead of in the notebook-global ``i``.  This lets the
    environment be constructed on a fresh kernel and keeps it independent of
    cells that reuse ``i`` as a loop variable.
    """

    def __init__(self):
        # ACTIONS: two options, compared directly against the 0/1 target.
        self.action_space = Discrete(2)

        # OBSERVATIONS
        # One value per feature column of train2_X, declared to lie in [0, 1].
        self.observation_space = Box(low=0, high=1, shape=(train2_X.shape[1],))

        # START STATE
        # Each episode has a single state taken straight from the training
        # data.  (Idea noted for later: generate random scenarios instead of
        # replaying recorded rows.)
        self._cursor = 0
        self.state = train2_X[self._cursor]
        self.p_gold = train2_y[self._cursor]

    def step(self, action):
        """Score ``action`` against the current target and end the episode.

        The action does not change the state: there is only the start state.

        Returns:
            The usual ``(observation, reward, done, info)`` tuple, where the
            observation is the unchanged current state, the reward is 1 for a
            match with ``self.p_gold`` and -1 otherwise, ``done`` is always
            ``True`` and ``info`` is an empty dict.
        """
        reward = 1 if action == self.p_gold else -1

        # An episode is exactly one decision, so it is over after every step.
        done = True
        info = {}

        return self.state, reward, done, info

    def render(self):
        """Visualisation is not implemented."""
        pass

    def reset(self):
        """Load the row at the cursor as the new episode and advance the cursor.

        Returns:
            The observation (one row of ``train2_X``) for the new episode.
        """
        self.state = train2_X[self._cursor]
        self.p_gold = train2_y[self._cursor]
        # Move on to the next row, starting over once the data is exhausted.
        self._cursor = (self._cursor + 1) % len(train2_X)

        return self.state

In [13]:
# Train model 2 from scratch on UncertaintyEnv, starting at the first row.
i = 0
uncertainty_env = UncertaintyEnv()
model2 = PPO("MlpPolicy", uncertainty_env, verbose=1)
model2.learn(total_timesteps=200000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.14     |
| time/              |          |
|    fps             | 1501     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 0.28        |
| time/                   |             |
|    fps                  | 1053        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.044250324 |
|    clip_fraction        | 0.568       |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x2a93a6ed400>

In [12]:
# Replace the in-memory model with the one stored under 'RL_uncertainty_model'.
# The cell that writes this file is commented out in this notebook, so the
# file has to exist already.
i = 0
uncertainty_env = UncertaintyEnv()
model2 = PPO.load('RL_uncertainty_model', env=uncertainty_env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [13]:
# Accuracy of model 2 on its held-out split.
# The rows are iterated directly instead of through an index named `i`, so
# the notebook-global `i` (used elsewhere as a row cursor) is left untouched.
# NOTE(review): predict() is called with its defaults, which in
# stable-baselines3 means deterministic=False (actions are sampled), so the
# printed accuracy can differ between runs -- confirm this is intended.
correct = 0
for obs, p_gold in zip(test2_X, test2_y):
    pred, _ = model2.predict(obs)
    if pred == p_gold:
        correct += 1
accuracy2 = correct / len(test2_X)
print(accuracy2)

0.9454610907781844


In [16]:
#model2.save('RL_uncertainty_model')