# 1. Import Dependencies

In [3]:
!pip install -q stable-baselines3[extra]

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-api-core 2.10.1 requires protobuf<5.0.0dev,>=3.20.1, but you have protobuf 3.19.5 which is incompatible.


In [4]:
import pickle
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
from stable_baselines3 import PPO
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 1. Action Model

The action model predicts either a puzzle piece ID, to do nothing or to only forward the language group's output.

In [5]:
with open('data/X_DM.pickle', 'rb') as X_file:
    model1_X = pickle.load(X_file)
with open('data/y_DM.pickle', 'rb') as y_file:
    model1_y = pickle.load(y_file)

half = int(len(model1_X)/2)
train1_X, test1_X, train1_y, test1_y = train_test_split(model1_X[:half], model1_y[:half], test_size=1/3, random_state=42)

## 1.1 Environment Definition


In [6]:
class ActionEnv(Env):
    def __init__(self):
        
        # ACTIONS
        '''an action can be to pick up one out of no_pieces pieces, do nothing or to forward the lang team output'''
        no_pieces = 15 + 1 + 1 
        self.action_space = Discrete(no_pieces)
        
        # OBSERVATIONS 
        '''currently we only observe the start state'''
        self.observation_space = Box(low=0, high=1, shape=(train1_X.shape[1],))

        # POSSIBLE STATES
        ''''here the start state is defined. since we used our synthetic data also for DL and testing the 
        rule based approach, we read in the data instead of creating them on the fly in every timestep.
        since init is called only once, this we only do here to initiate .state and .p_gold. the start state 
        needs to be '''
        self.state  = train1_X[i] # the output of previous groups at a given time
        self.p_gold = train1_y[i] # the correct action (the name is a bit confusing, because inititally we wanted to train the model to choose the highest probability)
 
        
    def step(self, action):
        '''Our actions do not cause a state transition, because we only have one state, the start state. 
        We only want to reward the correct action and punish a wrong one.
        '''
        if action == self.p_gold:
            reward = 1 
        else:
            reward = -1
        
        # Check if sequence is done
        '''since we currently play single-round games the game is over after chosing an action and receiving the reward'''
        done = True
        info = {}
        
        # Return step information
        return self.state, reward, done, info

    def render(self):
        ''' if we'd play a game that makes sense to visuzalize like cartpole this would be done here'''
        pass
    
    def reset(self):
        '''here the start state is really defined.'''
        global i
        self.state = train1_X[i]
        self.p_gold = train1_y[i]
        i += 1
        if i == len(train1_X):
            i = 0

        return self.state

## 1.2 Training

In [7]:
# i = 0
# model1 = PPO.load('RL_action_model', env=None)

In [8]:
i = 0
model1 = PPO("MlpPolicy", ActionEnv(), verbose=1)
model1.learn(total_timesteps=2/3*half)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.82    |
| time/              |          |
|    fps             | 1080     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -0.8        |
| time/                   |             |
|    fps                  | 874         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.012261825 |
|    clip_fraction        | 0.0901      |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x173745f5c70>

In [9]:
model1.save('RL_action_model')

## 1.3 Evaluation

In [10]:
i = 0

correct = 0
for i in tqdm(range(len(test1_X))):
    obs = test1_X[i]
    p_gold = test1_y[i]
    pred, _ = model1.predict(obs)
    if pred==p_gold:
        correct += 1
accuracy = correct/len(test1_X)
print(accuracy)

100%|██████████| 333334/333334 [02:31<00:00, 2203.14it/s]

0.9723700552598895





# 2 Uncertainty Model

This model predicts whether the action model is more likely predicting a correct or incorrect action. 

In [11]:
model2_y = np.ones((half,))
for i in range(half, 2*half):
    obs = model1_X[i]
    p_gold = model1_y[i]
    pred, _ = model1.predict(obs)
    if pred==p_gold:
        model2_y[i-half] = 0
        
train2_X, test2_X, train2_y, test2_y = train_test_split(model1_X[half:], model2_y, test_size=1/3, random_state=42)

## 2.1 Environment Definition

In [12]:
class UncertaintyEnv(Env):
    def __init__(self):
        # ACTIONS
        '''an action can be to predict either uncertainty (1) or certainty (0)'''
        self.action_space = Discrete(2)

        # OBSERVATION
        '''currently we only observe the start state'''

        self.observation_space = Box(low=0, high=1, shape=(train2_X.shape[1],))

        # POSSIBLE STATES
        ''''here the start state is defined. since we used our synthetic data also for DL and testing the 
        rule based approach, we read in the data instead of creating them on the fly in every timestep.
        since init is called only once, this we only do here to initiate .state and .p_gold. the start state 
        needs to be '''
        self.state  = train2_X[i]
        self.p_gold = train2_y[i]


    def step(self, action):
        '''Our actions do not cause a state transition, because we only have one state, the start state. 
        We only want to reward the correct action and punish a wrong one.
        '''        
        if action == self.p_gold:
            reward = 1
        else:
            reward = -1

        # Check if sequence is done
        '''since we currently play single-round games the game is over after chosing an action and receiving the reward'''

        done = True
        info = {}

        # Return step information
        return self.state, reward, done, info

    def render(self):
        ''' if we'd play a game that makes sense to visuzalize like cartpole this would be done here'''

        pass

    def reset(self):
        '''here the start state is really defined.'''
        global i
        
        self.state = train2_X[i]
        self.p_gold = train2_y[i]
        i += 1
        if i == len(train2_X):
            i = 0

        return self.state

## 2.2 Training

In [13]:
# i = 0
# model2 = PPO.load('RL_uncertainty_model', env=None)

In [14]:
i = 0
model2 = PPO("MlpPolicy", UncertaintyEnv(), verbose=1)
model2.learn(total_timesteps=2/3*half)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.02    |
| time/              |          |
|    fps             | 1676     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 0.12        |
| time/                   |             |
|    fps                  | 1097        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.043866485 |
|    clip_fraction        | 0.861       |
|    clip_range           | 0.2         |
|    entropy_loss   

<stable_baselines3.ppo.ppo.PPO at 0x1737d36d5b0>

In [17]:
model2.save('RL_uncertainty_model')

## 2.3 Evaluation

In [16]:
i = 0

correct = 0
for i in tqdm(range(len(test2_X))):
    obs = test2_X[i]
    p_gold = test2_y[i]
    pred, _ = model2.predict(obs)
    if pred==p_gold:
        correct += 1
accuracy2 = correct/len(test2_X)
print(accuracy2)

100%|██████████| 333334/333334 [02:11<00:00, 2527.83it/s]

0.972004055991888



