In [2]:
# imports

#neural net imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#math imports
import numpy as np
import random
import cv2
import matplotlib.pyplot as plt
from collections import deque
import sklearn as skl


#import elpv-dataset-1.0
from elpvdataset.utils.elpv_reader import load_dataset

#quality of life imports
import keyboard
import pickle

In [3]:
# create train and test sets

# the dataset is split into 3:
# test: 25% of the dataset used for final evaluation of the mode.

# train: 75% of the dataset used for training the model.
# this is further split into 2:
# agent train: 75% of the train set used for training the agent.
# agent test: 25% of the train set used for evaluating the agent during training and generating a reward.
import sklearn.model_selection

images, probs, types = load_dataset()
dataset = np.array(list(zip(images, types)))

# create a define for style when accessing image or type
IMAGE_IDX = 0
TYPE_IDX = 1

PROBS_IDX = 0      # this line really doesn't have to exist but it can stay.

# randomly split into train and test sets.
x_train, x_test, y_train, y_test = skl.model_selection.train_test_split(dataset, probs, random_state = 42, test_size = (25/100), train_size = 75/100, shuffle = True)
print("test set size: ", y_test.shape)
# furthur split train set.
x_agent_train, x_agent_test, y_agent_train, y_agent_test = skl.model_selection.train_test_split(x_train, y_train, random_state = 42, test_size = (25/100), train_size = 75/100, shuffle = False)
print("agent train set size: ", y_agent_train.shape)
print("agent test set size: ", y_agent_test.shape)

# creating a few more defines to suit neural net requirements.

NUM_TRAIN_OUTPUTS = x_agent_train.shape[0]

test set size:  (656,)
agent train set size:  (1476,)
agent test set size:  (492,)


  del sys.path[0]


The agent is a deep Q learning agent that utilises a neural net to estimate the value of feeding the CNN an image given a certain state.

It will "play" a game of sequentially feeding the CNN images selectively with the goal of finding the optimal performance of the CNN while preventing overfitting.

The Q learning agent will also utilise and LSTM to remember its previous actions and make decisions based on that.

In [25]:
#initialize agent neural net

import torch
import torch.nn as nn
import torch.optim as optim

class Network(nn.Module):
    def __init__(self, lr):
        super(Network, self).__init__()


        self.lstm_input = 3 # 1 for image, 1 for type, 1 for reward ().
        self.lstm_output = 64
        self.inputs = self.lstm_output  
        self.hid_1 = 500
        self.hid_2 = 500
        self.n_actions = NUM_TRAIN_OUTPUTS

        #create LSTM module
        self.lstm = nn.LSTM(self.lstm_input, self.lstm_output, 1, batch_first=True)
        
        # create neural net
        self.model = nn.Sequential(
            nn.Linear(self.inputs, self.hid_1),
            nn.ReLU(),
            nn.Linear(self.hid_1, self.hid_2),
            nn.ReLU(),
            nn.Linear(self.hid_2, self.n_actions),
        )

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.SmoothL1Loss()
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

        self.h = torch.zeros(1, 1, self.lstm_output).to(self.device)
        self.c = torch.zeros(1, 1, self.lstm_output).to(self.device)

    def forward(self, x, zero=False):
        if zero:
            self.h = torch.zeros(1, x.size(0), self.lstm_output).to(self.device)
            self.c = torch.zeros(1, x.size(0), self.lstm_output).to(self.device)
        x = x.type(torch.FloatTensor).to(self.device)
        print("input: ", x.view(len(x), 1, -1))
        lstm_output, (self.h, self.c) = self.lstm(x.view(1, 1, 3), (self.h, self.c))  #define strict lstm input of [[image, type, reward]]
        output = self.model(lstm_output.view(len(x), -1))
        return output, (self.h, self.c)


In [26]:
lstm = Network(0.001)
with torch.no_grad():
    input = [[1, 2, 3]]
    inputs = torch.tensor(input)
    # expected_outputs = torch.tensor(y_agent_train)
    output, (_, _) = lstm(inputs)
    # print(output)
    print(torch.max(output, dim = 1))

input:  tensor([[[1., 2., 3.]]])
torch.return_types.max(
values=tensor([0.0874]),
indices=tensor([1424]))


In [None]:
class Agent(object):
    def __init__(self):
        """
        Porperties:
            gamma (float): Future reward discount rate.
            epsilon (float): Probability for choosing random policy.
            epsilon_decay (float): Rate at which epsilon decays toward zero.
            learning_rate (float): Learning rate for Adam optimizer.

        Returns:
            Agent
        """
        # constant parameters
        self.gamma = 0.95
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.996
        self.lr = 0.00005
        self.batch_size = 64
        self.max_mem_size = 50000
        # self.input_dims = 7 * 4

        #variable parameters
        self.epsilon = 0.01
        self.mem_cntr = 0
        self.mem_cntr_successful = 0

        # initializing memory
        self.memory = deque(maxlen=self.max_mem_size)
        self.memory_successful = deque(maxlen=1000)
        self.episodic_memory = []

        #initialize networks
        self.network = Network(self.lr)

    def getMemory(self):
        return self.memory

    def nextEpisode(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def getepsilon(self):
        return self.epsilon

    def remember(self, state, action, reward, next_state, terminal):
        if (self.mem_cntr >= self.max_mem_size - 2):
            for i in range(self.max_mem_size - 3000):
                self.memory.popleft()
            self.mem_cntr = len(self.memory) - 1

        memory = [state, action, reward, next_state, terminal]
        self.memory.append(memory)

        self.mem_cntr += 1

    def select_action(self, state):
        if np.random.rand() <= self.epsilon:
            # exploration
            determiner = np.random.randint(0, 30);
            if (determiner <= 2):
                return 1
            return 0
        else:
            # exploitation, pick epsilon greedy
                state_tensor = torch.tensor([state]).to(self.network.device, dtype=torch.int32)
                action = torch.argmax(self.network.forward(state_tensor)).item()
                
        return action
    
    def updateEpsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        
    def updateEpsilonScore(self, score):
        modifier = -0.01*score + 1.08
        episolon_new = min(self.epsilon * modifier, 0.7 )
        self.epsilon = max(self.epsilon_min, episolon_new)

    def learn(self):
        if self.mem_cntr < self.batch_size:
            return
        

        self.network.optimizer.zero_grad()
        max_mem = min(self.mem_cntr, self.max_mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        batch_index = np.arange(self.batch_size, dtype=np.int32)

        # memory = [state, action, reward, next_state, game_over, score, next_reward]



        state_batch = torch.tensor([self.memory[i][0] for i in batch]).to(self.network.device, dtype=torch.float32)
        action_batch = torch.tensor([self.memory[i][1] for i in batch])
        reward_batch = torch.tensor([self.memory[i][2] for i in batch]).to(self.network.device, dtype=torch.float32)
        new_state_batch = torch.tensor([self.memory[i][3] for i in batch]).to(self.network.device, dtype=torch.float32)
        game_over_batch = torch.tensor([self.memory[i][4] for i in batch]).to(self.network.device, dtype=torch.bool)
        next_reward_batch = torch.tensor([self.memory[i][6] for i in batch]).to(self.network.device, dtype=torch.float32)

        q_current = self.network.forward(state_batch)[batch_index, action_batch]
        # q_next = self.network.forward(new_state_batch)
        # q_next[game_over_batch] = 0.0

        # max returns value as well as index, we only require index
        # q_current = reward_batch
        # q_target = next_reward_batch

        q_current = self.network.forward(state_batch)[batch_index, action_batch]
        q_next = self.network.forward(new_state_batch)

        # ask tutor how to make this part not short sighted.
        q_target = reward_batch + self.gamma * torch.max(q_next, dim=1)[0]


        loss = self.network.loss(q_target, q_current).to(self.network.device)
        loss.backward()
        torch.nn.utils.clip_grad_value_(self.network.parameters(), 100)
        self.network.optimizer.step()

    def update_episodic_memory(self, state, action, reward, next_state, done, current_step):
        self.episodic_memory.append([state, action, reward, next_state, done, 0])