In [None]:
# !pip install import_ipynb --quiet
# !git clone https://github.com/gmshroff/aiagentarch.git
# %cd aiagentarch

In [None]:
import gym
from gym import spaces
import random
import numpy as np
import torch
import sys

In [None]:
import import_ipynb
import utils
import models

### World and Agents for Supervised Learning Tasks

In [None]:
from aiagentbase import AIAgent,Controller,Memory,Perception,Actor

In [None]:
class SLWorld():
    """World that presents a labelled dataset to an agent, one sample per step.

    The agent passed to `run` must provide `begin()`, `act(sample)`,
    `reward(tuple)` and `set_training(bool)`; an `end()` method is called
    after each training episode when the agent has one.
    """
    def __init__(self,train_ds,test_ds,n_classes):
        """Store both dataset splits and build the action/observation spaces.

        train_ds / test_ds: indexable sequences of (sample, label) pairs.
        n_classes: number of discrete actions (one per class).
        """
        self.train_ds=train_ds
        self.test_ds=test_ds
        self.action_space=spaces.Discrete(n_classes)
        # Observation width is the last axis of the first training sample.
        self.obs_dim=self.train_ds[0][0].shape[-1]
        high = np.inf*np.ones(self.obs_dim)
        low = -high
        self.observation_space=spaces.Box(high=high,low=low)
    def run(self,agent=None,n_episodes=10):
        """Train `agent` for `n_episodes` passes over the training split, then test it.

        During training the agent receives `(reward, done, {'label': label})`
        after each action, with `done` True only on the last sample of a pass.
        During testing it receives `(reward, {})`.

        Returns the fraction of test samples answered correctly.
        """
        self.test_rew=0
        self.test_rewL=[]
        # Agents that never declared a training flag start out in training mode.
        if 'training' not in agent.__dict__: agent.training=True
        # Use this world's own split, not whatever `train_ds` the caller's
        # namespace happens to hold.
        n_train=len(self.train_ds)
        for episode in range(n_episodes):
            tot_rew=0
            agent.begin()
            for count,(sample,label) in enumerate(self.train_ds,start=1):
                done=(count==n_train)
                action=agent.act(sample)
                correct=self.accuracy(action,label)
                agent.reward((correct,done,{'label':label}))
                tot_rew+=correct
            if 'end' in dir(agent): agent.end()
            print('episode: ',episode,'avg reward: ',tot_rew/n_train)
        agent.set_training(False)
        print('Training Over')
        agent.begin()
        for sample,label in self.test_ds:
            action=agent.act(sample)
            # No label is revealed to the agent at test time.
            reward=(self.accuracy(action,label),{})
            agent.reward(reward)
            self.test_rewL+=[reward]
            self.test_rew+=reward[0]
        print('Test Over; Accuracy: ',self.test_rew/len(self.test_ds))
        return self.test_rew/len(self.test_ds)
    def accuracy(self,action,label):
        """Return 1 when `action` equals the last element of `label`, else 0.

        A numpy-array action is reduced to its last element first.
        """
        if (type(action)==np.ndarray): action=action[-1]
        if action==label[-1]: return 1
        else: return 0

In [None]:
class MLPAgent(AIAgent):
    """Agent that classifies with a network and trains it online, one sample per reward.

    `tot_rew` accumulates every reward received and is never reset by this
    class; `rewL` records its value at each call to `begin()`.
    """
    def __init__(self,action_space,net):
        super().__init__()
        ##Augmenting AIAgent
        self.actor=self.Actor(parent=self,model=net)
        self.action_space=action_space
        self.tot_rew=0
        self.rewL=[]

    class Actor(Actor):
        """Actor whose action is the argmax over axis 1 of the model output."""
        def __init__(self,parent,model):
            super().__init__(parent=parent,model=model)
        def call_model(self,state):
            ##Overriding AIAgent
            lpreds=self.model(state)
            action=torch.argmax(lpreds,axis=1)
            return action
        def compute_reward(self,reward):
            """Return the scalar part (first element) of the reward tuple."""
            return reward[0]

    def set_training(self,value):
        """Switch online training on (True) or off (False)."""
        self.training=value
    def avg_rew(self):
        """Return the mean of the values recorded in `rewL`."""
        return sum(self.rewL)/len(self.rewL)
    def reward(self,rew):
        """Receive a reward tuple; while training, fit the model on the labelled sample.

        In training mode `rew` must be `(reward, done, info)` with
        `info['label']` present. The state stored at the previous time step is
        paired with that label and passed to `models.Train` for one epoch.
        """
        ##Augmenting AIAgent
        if self.training:
            prev_state=self.memory.sar_memory[self.time-1]['state']
            # The returned values were never used here, so the call is kept
            # only for its effect on the model — presumably in-place; TODO confirm in models.Train.
            models.Train(self.actor.model,[(prev_state,rew[2]['label'])],epochs=1)
        self.tot_rew+=rew[0]
        return super().reward(rew)
    def begin(self):
        """Record the running reward total, then delegate to the base class."""
        ##Augmenting AIAgent
        self.rewL+=[self.tot_rew]
        super().begin()

In [None]:
class MLPBatchAgent(MLPAgent):
    """Variant of MLPAgent that trains once per episode on the remembered samples.

    epochs: number of epochs passed to `models.Train` at the end of an episode.
    """
    def __init__(self,action_space,net,epochs=1):
        super().__init__(action_space,net)
        self.epochs=epochs
    def reward(self,rew):
        """Accumulate the reward without the per-sample training MLPAgent does."""
        self.tot_rew+=rew[0]
        # Bypass parent class' reward
        return super(MLPAgent,self).reward(rew) 
    def end(self):
        """Train the agent's model on the (state, label) pairs held in memory.

        Does nothing when the agent is not in training mode.
        """
        if not self.training: return
        #Gather data from sar memory
        print('Agent Training')
        M=self.memory.sar_memory
        P=self.memory.perceptual_memory
        # Labels come from the info dict (third element) of each stored reward.
        y=[P[t]['reward'][2]['label'] 
           for t in P if t>=0 and 'label' in P[t]['reward'][2]]
        # Keep only as many states as there are labels; assumes both memories
        # are ordered the same way — TODO confirm against aiagentbase.Memory.
        X=[M[t]['state'] for t in M if t>=0][0:len(y)]
        #Train 
        # Train this agent's own model rather than a module-level `net`, so the
        # class also works when the network is not bound to that global name.
        self.net,_,_=models.Train(self.actor.model,list(zip(X,y)),epochs=self.epochs,verbose=True)
        return

In [None]:
train_ds, test_ds, dloader = utils.euclideanDataset(n_samples=10000,n_features=20,n_classes=10,batch_size=32)

In [None]:
train_ds=[(s.unsqueeze(0),l.unsqueeze(0)) for s,l in train_ds]

In [None]:
test_ds=[(s.unsqueeze(0),l.unsqueeze(0)) for s,l in test_ds]

In [None]:
net=models.MLP(dims=[20,32,10])

In [None]:
# net,_,_=models.Train(net,train_ds,epochs=5,verbose=True)

In [None]:
slworld=SLWorld(train_ds,test_ds,n_classes=10)

In [None]:
agent=MLPAgent(slworld.action_space,net)

In [None]:
agent=MLPBatchAgent(slworld.action_space,net)

In [None]:
slworld.run(agent=agent,n_episodes=1)

In [None]:
# Perceptual memory of the agent (structure is defined in aiagentbase, not shown here).
M=agent.memory.perceptual_memory

In [None]:
# Entries whose stored reward has True as its second element — presumably the
# `done` flag SLWorld.run sends on the last training sample; TODO confirm.
[(t,M[t]) for t in M if M[t]['reward'][1]==True]

In [None]:
# State/action/reward memory of the agent.
S=agent.memory.sar_memory

In [None]:
# Look at the record stored under key 1.
S[1]

In [None]:
# Turn training off explicitly (SLWorld.run already does this after its training episodes).
agent.set_training(False)

### Supervised-Learning Environment trained using off-the-shelf RL

In [None]:
class SLEnv(gym.Env):
    """Gym environment that walks through a labelled dataset one sample per step.

    The action is the predicted class and the reward is 1 for a correct
    prediction, 0 otherwise. An episode ends whenever the sample counter is a
    multiple of `batch_size`; the counter wraps to 0 after the last sample.
    """
    def __init__(self,ds,n_classes,batch_size=1):
        self.ds=ds
        self.n=len(ds)
        self.batch_size=batch_size
        # Observation width is the last axis of the first sample.
        self.obs_dim=self.ds[0][0].shape[-1]
        self.action_space=spaces.Discrete(n_classes)
        bound = np.inf*np.ones(self.obs_dim)
        self.observation_space=spaces.Box(high=bound,low=-bound)
        # Position in the dataset plus per-epoch bookkeeping.
        self.counter=0
        self.epoch=0
        self.ep_reward=0
        self.verbose=False
    def set_verbose(self,value):
        """Enable or disable the per-epoch average-reward printout."""
        self.verbose=value
    def reset(self):
        """Return the sample at the current counter; the counter is not rewound."""
        current_sample=self.ds[self.counter]
        return current_sample[0]
    def step(self,action):
        """Score `action` against the current label and advance to the next sample.

        Returns (next_state, reward, done, {}).
        """
        label=self.ds[self.counter][1]
        reward=self.accuracy(action,label)
        self.ep_reward+=reward
        if self.counter!=self.n-1:
            self.counter+=1
        else:
            # Last sample of the dataset: wrap around and close the epoch.
            self.counter=0
            if self.verbose: print(f"epoch {self.epoch} avg_reward {self.ep_reward/self.n}")
            self.ep_reward=0
            self.epoch+=1
        done=(self.counter%self.batch_size==0)
        next_state=self.ds[self.counter][0]
        return next_state,reward,done,{}
    def accuracy(self,action,label):
        """Return 1 when `action` equals the last element of `label`, else 0."""
        if (type(action)==np.ndarray):
            action=action[-1]
        return 1 if action==label[-1] else 0

In [None]:
slenv=SLEnv(train_ds,10,batch_size=1)
slenv.set_verbose(True)

In [None]:
for ep in range(int(slenv.n/slenv.batch_size)):
    done=False
    while not done:
        state,reward,done,_=slenv.step(slenv.action_space.sample())

In [None]:
model=A2C('MlpPolicy', slenv, verbose=0, gamma=1.0)

In [None]:
model=PPO('MlpPolicy', slenv, verbose=0, gamma=1.0)

In [None]:
model=DQN('MlpPolicy', slenv, verbose=0, gamma=1.0)

In [None]:
model.learn(total_timesteps=50000)

In [None]:
tsenv=SLEnv(test_ds,10,batch_size=len(train_ds))

In [None]:
env=tsenv
# env=slenv
state=env.reset()
tot_rew=0
rewL=[]
count=0
for ep in range(1):
    done=False
    while not done:
        action,_=model.predict(state)
        # print(action,slenv.ds[slenv.counter][1])
        state,reward,done,_=env.step(action)
        # print(reward)
        tot_rew+=reward
        count+=1
        rewL+=[reward]
print(tot_rew/count)

### Training an AI Agent's Model using Generic AI Agent

In [None]:
from queue import Queue
from threading import Thread
import threading

In [None]:
from aiagentbase import RLAgent
from stable_baselines3 import PPO,DQN,A2C,SAC
from threading import Thread
import threading

In [None]:
# Generic RL agent built from the A2C class and the world's action/observation spaces.
agent=RLAgent(A2C,slworld.action_space,slworld.observation_space)

In [None]:
# Agent options; their meaning is defined in aiagentbase.RLAgent (not shown here).
agent.debug=False
agent.use_memory=True

In [None]:
# Reward bookkeeping attributes attached to the agent from the notebook.
agent.rewL=[]
agent.tot_rew=0

In [None]:
world=slworld

In [None]:
# Thread that will execute world.run(agent, 10), i.e. 10 training episodes plus the test pass.
worldthread=Thread(name='world',target=world.run,args=(agent,10))

In [None]:
# Started before the world thread; presumably launches the agent's own learner — TODO confirm in RLAgent.start.
agent.start(training_steps=2000)

In [None]:
# Start the world loop in the background; nothing in this notebook joins the thread.
worldthread.start()

In [None]:
# world.run(agent=agent,n_episodes=3)

In [None]:
# Log collected by the agent (contents defined by RLAgent).
agent.logL

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Mean of the finite differences of agent.rewL.
# NOTE(review): the world thread may still be running when this executes — confirm before reading the value.
np.gradient(agent.rewL).mean()

In [None]:
# Plot the finite differences of agent.rewL.
plt.plot(np.gradient(agent.rewL))

In [None]:
# List the names of all threads currently alive.
for thread in threading.enumerate(): 
    print(thread.name)

In [None]:
# First entries of the agent's two memories, and the first training pair for comparison.
agent.memory.perceptual_memory[0]

In [None]:
agent.memory.sar_memory[0]

In [None]:
train_ds[0]