In [1]:
!nvidia-smi

Mon May 23 17:32:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:02:00.0  On |                  N/A |
|  0%   39C    P8     5W / 190W |    456MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install pygame



In [3]:
!pip install --upgrade gym



In [4]:
!pip install ray==1.9



# Imports:

In [1]:
import gym
import numpy as np 
import os
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler
from functools import partial

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch import linalg as LA

from src.visualizations import *
from src.entropies import entanglement_entropy, classical_entropy
from src.utils import one_hot, uniform_linear_layer

## Environment setup:

In [6]:
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

activation_function = 'sigmoid'
global_seed = None

max_steps = 60
non_random_chance = 0.99
window = 100
target_win_ratio = 0.70
min_steps_num = 6

## Helpers:

In [7]:
class Agent(nn.Module):
    def __init__(self, observation_space_size, n_hidden_layers, activation_function):
        super(Agent, self).__init__()
        self.observation_space_size = observation_space_size
        self.hidden_size = 2*self.observation_space_size

        self.l1 = nn.Linear(in_features=2*self.observation_space_size, out_features=self.hidden_size)
        self.hidden_layers = [
            nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size) \
                for i in range(n_hidden_layers)
        ]
        self.l2 = nn.Linear(in_features=self.hidden_size, out_features=32) 
        self.activation = None
        if activation_function=='lrelu':
            self.activation = torch.leaky_relu
        if activation_function=='sigmoid':
            self.activation = torch.sigmoid
        if activation_function=='tanh':
            self.activation = torch.tanh

        uniform_linear_layer(self.l1)
        for l in self.hidden_layers:
            uniform_linear_layer(l)

        uniform_linear_layer(self.l2)
    
    def forward(self, state):
        obs_emb = one_hot([int(2*state)], 2*self.observation_space_size)
        # first layer:
        out1 = self.activation(self.l1(obs_emb))
        
        # hidden layers:
        for l in self.hidden_layers:
            out1 = self.activation(l(out1))
        
        # output layers:
        out2 = self.activation(self.l2(out1))

        return out2.view((-1)) 


class Trainer:
    def __init__(self, n_hidden_layers, lake, learning_rate, non_random_chance, random_scaling, gamma, activation_function):
        self.holes_indexes = np.array([5,7,11,12])

        self.lake = lake
        self.agent = Agent(self.lake.observation_space.n, n_hidden_layers, activation_function)
        self.optimizer = optim.Adam(params=self.agent.parameters(), lr=learning_rate)
        
        self.epsilon = non_random_chance
        self.epsilon_growth_rate = random_scaling
        self.gamma = gamma
        
        self.epsilon_list = []
        self.success = []
        self.jList = []
        self.reward_list = []

        self.compute_entropy = False
        self.entropies = []
        self.cl_entropies = []
        self.entropies_episodes = [0]
        
        self.print = False

    
    def train(self, epoch, max_steps, window, target_win_ratio):
        # entropies_episodes = [0] * (epoch+1)
        #pbar = tqdm(range(epoch))
        for i in range(epoch):
            #pbar.set_description(f'Success rate: {sum(self.success[-window:])/window:.2%} | Random chance: {self.epsilon:.2%}')
            
            s = self.lake.reset() #stan na jeziorze 0-16, dla resetu 0
            j = 0
            self.entropies_episodes.append(0)
            while j < max_steps:
                j += 1
                # perform chosen action
                a = self.choose_action(s)
                s1, r, d, _ = self.lake.step(int(a))
                if d == True and r == 0: r = -1
                elif d== True: r == 1
                elif r==0: r = -0.01

                # if self.print==False:
                #     print(self.agent(s)[a])
                #     self.print=True

                # calculate target and loss
                target_q = r + self.gamma * torch.max(self.calc_probabilities(s1).detach()) 

                loss = F.smooth_l1_loss(self.calc_probability(s, a), target_q) 
                # update model to optimize Q
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
                # update state
                s = s1
                if(self.compute_entropy):
                    self.entropies.append(entanglement_entropy(self.calc_statevector(s))) 
                    self.cl_entropies.append(classical_entropy(self.calc_statevector(s))) 
                    self.entropies_episodes[i] += 1
                
                if d == True: break
            
            # append results onto report lists
            if d == True and r > 0:
                self.success.append(1)
            else:
                self.success.append(0)

            self.reward_list.append(r)
            self.jList.append(j)

            if self.epsilon < 1.:
                self.epsilon *= self.epsilon_growth_rate
            self.epsilon_list.append(self.epsilon)

            tune.report(
                if_trained = sum(self.success[-window:])/window>target_win_ratio, 
                win_ratio = sum(self.success[-window:])/window,
                episode_reward_mean = sum(self.reward_list[-window:])/window,
                steps = len(self.success)
                )

            if i%10==0 and i>100:
                if sum(self.success[-window:])/window>target_win_ratio:
                    #print("Network trained before epoch limit on {i} epoch".format(i=i))
                    break

        #print("last 100 epoches success rate: " + str(sum(self.success[-100:])/100) + "%")

    def choose_action(self, s):
        self.calc_probabilities(s)
        if np.random.rand(1) > self.epsilon : 
            action = torch.argmax(self.calc_probabilities(s)) #wybor najwiekszej wartosci z tablicy
        else:
            action = torch.tensor(np.random.randint(0, 4))
        return action
    
    def calc_statevector(self, s):
        return torch.complex(self.agent(s)[0::2], self.agent(s)[1::2])

    def calc_probability(self, s, a): #liczenie prawdopodobieństwa obsadzenia kubitu (0-3) z danego stanu planszy (0-15)
        statevector = torch.complex(self.agent(s)[0::2], self.agent(s)[1::2])
        probabilities = (statevector.abs()**2)
        probabilities = probabilities/probabilities.sum() #normowanie
        prob_indexes = [
            [0,1,2,3,4,5,6,7],
            [0,1,2,3,8,9,10,11],
            [0,1,4,5,8,9,12,13],
            [0,2,4,6,8,10,12,14]
        ]
        return probabilities[prob_indexes[a]].sum()

    def calc_probabilities(self, s): #liczenie prawdopodobieństw każdego z kubitów z danego stanu planszy (0-15) do tensora o kształcie (4)
        raw_wavefunction = torch.complex(self.agent(s)[0::2], self.agent(s)[1::2])
        probabilities = (raw_wavefunction.abs()**2)
        probabilities = probabilities/probabilities.sum() #normowanie
        probs_of_qubits = torch.tensor([
            probabilities[[0,1,2,3,4,5,6,7]].sum(),
            probabilities[[0,1,2,3,8,9,10,11]].sum(),
            probabilities[[0,1,4,5,8,9,12,13]].sum(),
            probabilities[[0,2,4,6,8,10,12,14]].sum()
            ])
        return probs_of_qubits

        
    def Q(self):
        Q = []
        for x in range(self.lake.observation_space.n):
            Qstate = self.agent(x).detach()
            Qstate /= LA.norm(Qstate)
            Q.append(Qstate)   
        Q_out = torch.Tensor(self.lake.observation_space.n, self.lake.action_space.n)
        torch.cat(Q, out=Q_out)
        return Q_out
    
    def Qstate(self, state):
        Qstate = self.agent(state).detach()
        Qstate /= LA.norm(Qstate)
        return Qstate
    
    def Qstrategy(self):
        return [torch.argmax(self.calc_probabilities(state)).item() for state in range(self.lake.observation_space.n)]

# Ray wrapper:

In [8]:
def perform_experiment(config):
    # torch.manual_seed(global_seed)
    # np.random.seed(global_seed)

    results_path = os.path.join('.', 'visuals')
    
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    lake = gym.make('FrozenLake-v1', is_slippery=False)
    lake.reset()

    fl = Trainer(config["n_hidden_layers"], lake, config["lr"], non_random_chance, config["random_scaling"], config["gamma"], activation_function)

    fl.train(epochs, max_steps, window, target_win_ratio)

    plot_success_steps_history(fl.jList, fl.success)

    strategy = np.array(fl.Qstrategy()).reshape((4,4))
    strategy_save_path = os.path.join(results_path, "trained_strategy.jpg")
    plot_strategy(strategy, fl.holes_indexes, strategy_save_path)

    moving_average_history_save_path = os.path.join(results_path, "training_history_moving_average.jpg")
    plot_rolling_window_history(fl.jList, fl.reward_list, fl.success, np.array(fl.epsilon_list), target_win_ratio, min_steps_num, moving_average_history_save_path, window=window)
    history_save_path = os.path.join(results_path, "training_history.jpg")
    plot_history(fl.jList, fl.reward_list, fl.success, np.array(fl.epsilon_list), target_win_ratio, min_steps_num, history_save_path)

    with open(os.path.join(results_path, "hyperparameters.txt"), "w+") as f:
        f.write(f'gamma;{config["gamma"]}\n')
        f.write(f'epochs;{epochs}\n')
        f.write(f'max_steps;{max_steps}\n')
        f.write(f'learning_rate;{config["lr"]}\n')
        f.write(f'non_random_chance;{non_random_chance}\n')
        f.write(f'random_scaling;{config["random_scaling"]}\n')
        f.write(f'window;{window}\n')
        f.write(f'target_win_ratio;{target_win_ratio}\n')
        f.write(f'min_steps_num;{min_steps_num}\n')
        f.write(f'n_hidden_layers;{config["n_hidden_layers"]}\n')
        f.write(f'activation_function;{activation_function}\n')
        f.write(f'global_seed;{global_seed}\n')

# Experiments:

## Setup:

In [9]:
epochs = 30000

config = {
    "n_hidden_layers": tune.choice([1,2]),
    "lr": tune.loguniform(1e-5, 4e-1),
    "random_scaling": tune.loguniform(0.99999, 0.9995),
    "gamma": tune.loguniform(2., 0.1)

}
scheduler = ASHAScheduler(
    time_attr='steps',
    metric='win_ratio',
    mode='max',
    max_t=epochs+1,
    grace_period=15000,
    reduction_factor=2
    )


reporter = JupyterNotebookReporter(
    overwrite=True,
    metric_columns=["if_trained", "win_ratio", "episode_reward_mean", 'steps'],
    parameter_columns=['steps'])

## Run:

In [16]:
import warnings


warnings.filterwarnings("ignore")

parallel_runs = 6
#resources_usage = 0.8
print(f'Resources used: CPU {6/parallel_runs}, GPU {1/parallel_runs} for each of {parallel_runs} parallel runs.')

Resources used: CPU 1.0, GPU 0.16666666666666666 for each of 6 parallel runs.


In [17]:
result = tune.run(
    partial(perform_experiment),
    resources_per_trial={
        "cpu": 6/parallel_runs, 
        "gpu": 1/parallel_runs},
    config=config,
    num_samples=150,
    scheduler=scheduler,
    progress_reporter=reporter,
    name='full_run',
    log_to_file=True,
    verbose=2)

Trial name,status,loc,steps,if_trained,win_ratio,episode_reward_mean,steps.1
DEFAULT_08e8c1d1,TERMINATED,127.0.0.1:11456,,False,0.0,-0.01,30000
DEFAULT_08e84cf0,TERMINATED,127.0.0.1:17124,,False,0.0,-0.01,30000
DEFAULT_08e7b11f,TERMINATED,127.0.0.1:12152,,False,0.0,-0.0298,30000
DEFAULT_08e67972,TERMINATED,127.0.0.1:17064,,False,0.0,-0.01,30000
DEFAULT_08e62b89,TERMINATED,127.0.0.1:1756,,False,0.0,-0.01,30000
DEFAULT_08e6c75c,TERMINATED,127.0.0.1:3588,,False,0.0,-0.01,30000
DEFAULT_08e5dd9c,TERMINATED,127.0.0.1:14876,,False,0.0,-0.01,30000
DEFAULT_08e76334,TERMINATED,127.0.0.1:2120,,False,0.0,-0.9604,30000
DEFAULT_08e51ad2,TERMINATED,127.0.0.1:5332,,False,0.0,-0.01,30000
DEFAULT_08e4a5f2,TERMINATED,127.0.0.1:15236,,False,0.0,-0.01,30000


2022-05-26 04:24:03,508	INFO tune.py:626 -- Total run time: 21479.86 seconds (21479.32 seconds for the tuning loop).


In [15]:
res_df = result.dataframe()
res_df = res_df[['if_trained', 'win_ratio', 'episode_reward_mean', 'steps','config/gamma',
       'config/lr', 'config/n_hidden_layers', 'config/random_scaling']]

res_df.sort_values(['win_ratio'], ascending=False)[:20]

Unnamed: 0,if_trained,win_ratio,episode_reward_mean,steps,config/gamma,config/lr,config/n_hidden_layers,config/random_scaling
35,True,0.71,0.42,5081,0.929049,0.008325,1,0.999778
66,False,0.41,-0.18,30000,0.267731,3.6e-05,1,0.99962
83,False,0.4,-0.2,30000,0.285017,0.001,1,0.999745
139,False,0.36,-0.2602,30000,0.641106,1.3e-05,1,0.99963
74,False,0.28,-0.44,30000,0.316274,0.00048,1,0.999928
69,False,0.16,-0.68,30000,0.156376,6e-05,1,0.999633
68,False,0.15,-0.7,30000,0.193105,0.000463,1,0.999901
127,False,0.11,-0.78,30000,0.241511,0.014428,2,0.99971
109,False,0.11,-0.78,30000,0.253586,2.8e-05,1,0.99979
147,False,0.09,-0.82,30000,0.156308,0.018064,2,0.999751


## Save results

In [None]:
base_results_df_path = os.path.join("..", "results", "auto_hp_tuning")
trial_num = 0
df_saved = False

while not df_saved:
  results_df_path = os.path.join(base_results_df_path, "results_"+str(trial_num)+".csv")
  if os.path.exists(results_df_path):
    trial_num+=1
  else:
    res_df.to_csv(results_df_path)
    df_saved = True