In [1]:
import time
from math import sqrt
import matplotlib.pyplot as plt
import random
from env6 import Env
import os
import numpy as np
import torch
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader
from copy import deepcopy

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class ExperienceDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        return len(self.data[0])

    def __getitem__(self, ind):
        return tuple(map(lambda t: t[ind], self.data))

def create_shuffled_dataloader(data, batch_size,shuffle=True):
    ds = ExperienceDataset(data)
    return DataLoader(ds, batch_size = batch_size, shuffle = shuffle)

In [3]:
class PolicyNetwork(torch.nn.Module):
    def __init__(self,state_dims, n_actions, hidden_size=64, num_layers=1):
        super(PolicyNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = torch.nn.LSTM(state_dims, hidden_size, num_layers, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_size, 32)
        torch.nn.init.kaiming_uniform_(self.fc1.weight)
        self.fc2 = torch.nn.Linear(32, n_actions)
        torch.nn.init.kaiming_uniform_(self.fc2.weight)


    def forward(self, x, hidden_state=None):
        # Initialize the hidden state if not provided
        if hidden_state is None:
            h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
            c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
            hidden_state = (h_0, c_0)
        
        # Pass through LSTM layer
        lstm_out, hidden_state = self.lstm(x, hidden_state)
        
        # Take the output of the last LSTM cell
        lstm_out = lstm_out[:, -1, :]

        actor_out = torch.tanh(self.fc1(lstm_out))
        actor_out = torch.tanh(self.fc2(actor_out))
        policy = torch.nn.functional.softmax(actor_out, dim=-1) 

        return policy, hidden_state
    

class ValueNetwork(torch.nn.Module):
    def __init__(self,state_dims, hidden_size=64, num_layers=1):
        super(ValueNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = torch.nn.LSTM(state_dims, hidden_size, num_layers, batch_first=True)
        self.fc1 = torch.nn.Linear(hidden_size, 32)
        torch.nn.init.kaiming_uniform_(self.fc1.weight)
        self.fc2 = torch.nn.Linear(32, 1)
        torch.nn.init.kaiming_uniform_(self.fc2.weight)


    def forward(self, x, hidden_state=None):
        # Initialize the hidden state if not provided
        if hidden_state is None:
            h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
            c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
            hidden_state = (h_0, c_0)
        
        # Pass through LSTM layer
        lstm_out, hidden_state = self.lstm(x, hidden_state)
        
        # Take the output of the last LSTM cell
        lstm_out = lstm_out[:, -1, :]

        critic_out = torch.tanh(self.fc1(lstm_out))
        value = self.fc2(critic_out)

        return value, hidden_state
        
def weights_init(m):
    if isinstance(m,torch.nn.Linear):
        m.bias.data.fill_(0)
        torch.nn.init.kaiming_uniform_(m.weight)

def np_to_tensor(x):
    return torch.tensor(x).to(torch.float32)

    
policy_model = PolicyNetwork(state_dims=6, n_actions = 6)
value_model = ValueNetwork(state_dims=6)

In [4]:
print(policy_model,value_model)
# summary(policy_model, input_size=(1, 6))
# summary(value_model, input_size=(1, 6))

PolicyNetwork(
  (lstm): LSTM(6, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=6, bias=True)
) ValueNetwork(
  (lstm): LSTM(6, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
)


In [5]:
class PPO:
    def __init__(self, state_dim=6, action_dim=6, policy_lr=0.0005, value_lr=0.001, 
                 gamma=0.99, lam=0.95, beta_s=0.01,epsilon_clip=0.2, value_clip=0.4, 
                 epochs=5,batch_size=50,save_path='Models'):
        
        self.policy_net = PolicyNetwork(state_dim, action_dim).to(device)
        self.value_net = ValueNetwork(state_dim).to(device)
        # self.policy_net.apply(weights_init)
        # self.value_net.apply(weights_init)

        self.policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        self.value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=value_lr)

        self.gamma = gamma
        self.lamda = lam
        self.epsilon_clip = epsilon_clip
        self.value_clip = value_clip
        self.beta_s = beta_s
        self.epochs = epochs
        self.action_dim = action_dim
        self.batch_size = batch_size

        self.env = Env(port=5555)
        self.env.run()

        self.save_path = save_path
        self.env_name = "auto-docking-iss"

        if not os.path.exists(self.save_path): os.makedirs(self.save_path)
        self.path = str(self.env_name)+'_PPO_'
        self.model_name = os.path.join(self.save_path, self.path)

        self.trajectory = {'states':[],'actions':[], 'rewards':[], 'action_logs':[],
                           'values':[],'done':[]}
        
        self.scores, self.episodes, self.average = [],[],[]
        self.value_loss_mean, self.policy_loss_mean = [], []

    def save(self,episode,score):
        torch.save({
            'actor': self.policy_net.state_dict(),
            'critic': self.value_net.state_dict()
        }, f'./'+ self.model_name + '_' + str(episode) + '_' + str(round(score,4)) + '.pt')

    def load(self,name):
        print(f'./{name}')
        try:
            data = torch.load(f'./{name}')
            self.policy_net.load_state_dict(data['actor'])
            self.value_net.load_state_dict(data['critic'])
        except Exception as e:
            print(e)

    # Función para visualizar la evolución del entrenamiento
    def plot_model(self, score, episode,num_episodes):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-10:]) / len(self.scores[-10:]))
        if (episode % 10 == 0 and episode > 0) or episode == num_episodes:#str(episode)[-2:] == "00":# much faster than episode % 100
            plt.figure(1,figsize=(18, 9))
            plt.plot(self.episodes, self.scores, 'b')
            plt.plot(self.episodes, self.average, 'r')
            plt.ylabel('Score', fontsize=18)
            plt.xlabel('Steps', fontsize=18)
            try:
                plt.savefig(self.model_name+"scores.png")
            except OSError:
                pass

        return self.average[-1]
    
    def plot_loss(self,episode,num_episodes):        
        if (episode % 10 == 0 and episode > 0) or episode == num_episodes:#str(episode)[-2:] == "00":# much faster than episode % 100
            plt.figure(2,figsize=(18, 9))
            plt.plot(self.episodes, self.policy_loss_mean, 'b')
            plt.plot(self.episodes, self.value_loss_mean, 'r')
            plt.ylabel('Loss', fontsize=18)
            plt.xlabel('Steps', fontsize=18)
            try:
                plt.savefig(self.model_name+"loss.png")
            except OSError:
                pass
    

    #Almacenamiento de todas las variables que definen una transición para PPO
    def collect_trayectory(self,state,action,action_logs,reward,done,value):
        self.trajectory['states'].append(state)
        self.trajectory['actions'].append(action)
        self.trajectory['rewards'].append(reward)
        self.trajectory['values'].append(value)
        self.trajectory['action_logs'].append(action_logs)
        self.trajectory['done'].append(done)
    
    def clear_trayectory(self):
        for key in self.trajectory.keys():
            self.trajectory[key].clear()

    
    def generalized_advantage_estimation(self,next_state):
        advantages = np.zeros_like(self.trajectory['rewards'])
        last_gae_lambda = 0
        value,hidden_state = self.value_net(next_state.unsqueeze(0).unsqueeze(0),None)
        self.trajectory['values'].append(value.squeeze().detach())

        for t in reversed(range(len(self.trajectory['rewards']))):
            delta = self.trajectory['rewards'][t]+ self.gamma * self.trajectory['values'][t + 1].item() - self.trajectory['values'][t].item()
            advantages[t] = last_gae_lambda = delta + self.gamma * self.lamda * last_gae_lambda
        
        return np_to_tensor(advantages)


    def learn_from_episode_trajectory(self,next_state):
        self.policy_net.train()
        self.value_net.train()

        states = deepcopy(self.trajectory['states'])
        actions = deepcopy(self.trajectory['actions'])
        # rewards = deepcopy(self.trajectory['rewards'])
        values = deepcopy(self.trajectory['values'])
        action_logs = deepcopy(self.trajectory['action_logs'])
        # done = deepcopy(self.trajectory['done'])

        advantages = self.generalized_advantage_estimation(next_state).to(device)

        # prepare dataloader for policy phase training
        dl = create_shuffled_dataloader([states, actions, action_logs, advantages, values], self.batch_size,shuffle=False)

        policy_loss_array = []
        value_loss_array = []

        for i in range(self.epochs):
            policy_hidden_state = None
            value_hidden_state = None
            for states, actions, action_log, advantages, values in dl:
                #Update policy_net
                new_action_probs,policy_hidden_state = self.policy_net(states.unsqueeze(0),policy_hidden_state)
                new_action_probs = new_action_probs.squeeze()
                dist = torch.distributions.Categorical(new_action_probs)

                # print("")
                # print(actions)
                # print(new_action_probs)
                new_action_logs = dist.log_prob(actions)
                entropy = dist.entropy()

                # compute PPO-Clip
                ratios = torch.exp(new_action_logs - action_log)
                surr1 = ratios * advantages
                surr2 = ratios.clamp(1 - self.epsilon_clip, 1 + self.epsilon_clip) * advantages
                policy_loss = -torch.mean(torch.min(surr1, surr2) - self.beta_s * entropy)
                policy_loss_array.append(policy_loss.detach().numpy())

                self.policy_optimizer.zero_grad() # Clean gradients
                policy_loss.mean().backward() # Compute gradients
                self.policy_optimizer.step() #Update weights

                #Update value_net
                new_values,value_hidden_state = self.value_net(states.unsqueeze(0),value_hidden_state)
                new_values = new_values.squeeze()
                value_clipped = values + (new_values - values).clamp(-self.value_clip, self.value_clip)
                value_loss_1 = (value_clipped.flatten() - advantages) ** 2
                value_loss_2 = (values.flatten() - advantages) ** 2
                value_loss = torch.mean(torch.max(value_loss_1, value_loss_2))
                value_loss_array.append(value_loss.detach().numpy())

                self.value_optimizer.zero_grad() # Clean gradients
                value_loss.backward() # Compute gradients 
                self.value_optimizer.step() #Update weights

        self.value_loss_mean.append(sum(value_loss_array)/self.epochs)
        self.policy_loss_mean.append(sum(policy_loss_array)/self.epochs)

        print(f"\n Mean Policy loss: {self.value_loss_mean[-1]:.4f}. Mean Value loss: {self.policy_loss_mean[-1]:.4f}")
        
        self.policy_net.eval()
        self.value_net.eval()


    # Función para resetear el entorno tras acabar trayectoria
    def reset(self):
        state = self.env.reset()
        return np_to_tensor(self.process_state(state))
    
     # Función para interacción agente-entorno
    def step(self, action):
        next_state, reward, done = self.env.step(action)
        next_state = self.process_state(next_state)
        return np_to_tensor(next_state), reward, done

    def process_state(self,state):
        state_processed = np.array(state)
        state_processed[0] = state_processed[0] / 30 # Position array
        state_processed[1] = state_processed[1] / 3 # Position array
        state_processed[2] = state_processed[2] / 3 # Position array
        return state_processed
    
    def train(self,num_episodes=200,max_steps=200):
        average = 0
        self.policy_net.eval()
        self.value_net.eval()

        max_score_average = -5000

        for episode in range(1,num_episodes+1):
            self.clear_trayectory()

            state = self.reset()
            score = 0

            SAVING = ""
            policy_hidden_state = None
            value_hidden_state = None

            for step in range(1,max_steps+1):
                
                action_probs,policy_hidden_state = self.policy_net(state.unsqueeze(0).unsqueeze(0),policy_hidden_state)
                action_probs = action_probs.squeeze().detach()
                dist = torch.distributions.Categorical(action_probs)
                action = dist.sample()
                action_log = dist.log_prob(action)

                value,value_hidden_state = self.value_net(state.unsqueeze(0).unsqueeze(0),value_hidden_state)
                value = value.squeeze().detach()
                
                next_state,reward,done = self.step(action.item())    
                self.collect_trayectory(state,action,action_log,reward,done,value)

                state = next_state

                score += reward

                # print(state)
                print(f'\rSteps: {step}/{max_steps}. Action: {action}. Reward/Value: {round(reward,2)}/{round(value.item(),4)}. Done: {done}', end='', flush=True)

                if done:
                    break
                
                time.sleep(1.0)

            self.learn_from_episode_trajectory(next_state)
            self.clear_trayectory()

            average = self.plot_model(score, episode,num_episodes)
            self.plot_loss(episode,num_episodes)
            
            if average >= max_score_average and episode > 5:
                max_score_average = average
                self.save(episode,average)
                SAVING = "SAVING"
            elif episode % 10 == 0:
                self.save(episode,average)
                SAVING = "SAVING"
            else:
                SAVING = ""
            
            print(f"Episode: {episode}/{num_episodes}, score/average: {score:.4f}/{average:.4f} {SAVING}\n")


        # close environemnt when finish training
        self.save(episode,average)

    def test(self,max_steps):
        self.policy_net.eval()
        self.value_net.eval()

        self.clear_trayectory()
        state = self.reset()
        score = 0

        start = time.time()
        end = time.time()
        step = 0

        while True:
            end = time.time()
            if end - start >= 1.0:
                step += 1
                action_probabilities = self.policy_net(state)
                action = torch.argmax(action_probabilities, dim=-1).item()#np.argmax(self.policy_net(state).squeeze().detach().numpy())
                value = self.value_net(state).squeeze().detach()
                
                next_state,reward,done = self.step(action)    

                state = next_state

                score += reward

                # print(action_probabilities)
                print(f'\rSteps: {step}/{max_steps}. Action: {action}. Score/Value: {round(score,2)}/{round(value.item(),4)}. Done: {done}', end='', flush=True)
                start = time.time()

            if self.env.done or step > max_steps:
                break

        state_end = self.env.get_state()
        
        print(f'\nNext state: {state_end}')


In [6]:
agent = PPO(save_path='Models10')

(11477) wsgi starting up on http://127.0.0.1:5555


(11477) accepted ('127.0.0.1', 58882)
(11477) accepted ('127.0.0.1', 58898)
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET / HTTP/1.1" 304 243 0.006165
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET /css/normalize.css HTTP/1.1" 304 201 0.000765
(11477) accepted ('127.0.0.1', 58904)
(11477) accepted ('127.0.0.1', 58908)
(11477) accepted ('127.0.0.1', 58914)
(11477) accepted ('127.0.0.1', 58920)
(11477) accepted ('127.0.0.1', 58932)
(11477) accepted ('127.0.0.1', 58948)
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET /css/layout.css HTTP/1.1" 304 198 0.000494
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET /js/three/three.js HTTP/1.1" 304 196 0.000380
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET /js/three/WebGL.js HTTP/1.1" 304 196 0.000334
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET /js/three/GLTFLoader.js HTTP/1.1" 304 201 0.000448
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET /js/three/CSS2DRenderer.js HTTP/1.1" 304 204 0.000487
127.0.0.1 - - [10/Aug/2024 14:07:17] "GET /js/gsap3/gsap.js HTTP/1.1" 304 195 0.000435


In [7]:
agent.train(num_episodes=200,max_steps=250)

Reset
Steps: 146/250. Action: 5. Reward/Value: -201.41/-0.1787. Done: -1

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

127.0.0.1 - - [10/Aug/2024 14:10:19] "GET /socket.io/?EIO=4&transport=websocket&sid=CdGBqEBomE8L17xIAAAA HTTP/1.1" 200 0 182.493003
(11477) accepted ('127.0.0.1', 46412)
127.0.0.1 - - [10/Aug/2024 14:10:19] "GET / HTTP/1.1" 304 243 0.000824
127.0.0.1 - - [10/Aug/2024 14:10:19] "GET /css/normalize.css HTTP/1.1" 304 201 0.000705
(11477) accepted ('127.0.0.1', 46420)
(11477) accepted ('127.0.0.1', 46428)
(11477) accepted ('127.0.0.1', 46436)
(11477) accepted ('127.0.0.1', 46440)
127.0.0.1 - - [10/Aug/2024 14:10:19] "GET /css/layout.css HTTP/1.1" 304 198 0.000476
127.0.0.1 - - [10/Aug/2024 14:10:19] "GET /js/three/three.js HTTP/1.1" 304 196 0.000516
127.0.0.1 - - [10/Aug/2024 14:10:19] "GET /js/three/WebGL.js HTTP/1.1" 304 196 0.000338
127.0.0.1 - - [10/Aug/2024 14:10:19] "GET /js/three/GLTFLoader.js HTTP/1.1" 304 201 0.000403
(11477) accepted ('127.0.0.1', 46442)
(11477) accepted ('127.0.0.1', 46450)
(11477) accepted ('127.0.0.1', 46460)
127.0.0.1 - - [10/Aug/2024 14:10:19] "GET /js/three

In [None]:
agent.load("Models8/auto-docking-iss_PPO__148_-226.6446.pt")


In [None]:
agent.test(500)

In [None]:
import os
import pyautogui

# Simular pulsación de Ctrl+S
pyautogui.hotkey('ctrl', 's')

os.system('sudo shutdown')
# os.system('sudo shutdown -c')