In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import matplotlib.pyplot as plt
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch.utils.data

import matplotlib.pyplot as plt
import os

In [2]:
data_df = pd.read_pickle('datasets/all_players_rel_all_actions_group_team.pkl')

data_df.loc[:, 'next_action'] = data_df.groupby('playIndex').action.shift(-1)

ac_df = data_df.dropna(axis=0, how='any', inplace=False)

ac_df.head()

Unnamed: 0,playIndex,timeIndex,state,next_state,reward,action,next_action
0,0,48,"[11.4, 42.67, 0.2, 3.07, 102.1, 312.17, 12.42,...","[11.39, 42.66, 0.21, 2.84, 105.48, 199.45, 11....",0.01,"[0.01, 0.01]","[-0.02, 0.03]"
1,0,49,"[11.39, 42.66, 0.21, 2.84, 105.48, 199.45, 11....","[11.41, 42.63, 0.47, 2.53, 113.86, 168.36, 11....",0.04,"[-0.02, 0.03]","[-0.04, 0.07]"
2,0,50,"[11.41, 42.63, 0.47, 2.53, 113.86, 168.36, 11....","[11.45, 42.56, 0.83, 2.53, 110.69, 158.32, 11....",0.08,"[-0.04, 0.07]","[-0.08, 0.11]"
3,0,51,"[11.45, 42.56, 0.83, 2.53, 110.69, 158.32, 11....","[11.53, 42.45, 1.32, 3.03, 117.3, 148.06, 11.1...",0.14,"[-0.08, 0.11]","[-0.09, 0.13]"
4,0,52,"[11.53, 42.45, 1.32, 3.03, 117.3, 148.06, 11.1...","[11.62, 42.32, 1.7, 3.13, 114.42, 144.84, 11.5...",0.16,"[-0.09, 0.13]","[-0.16, 0.16]"


In [3]:
'''
Get max action which is used in soft AC method
'''

action_mat = np.stack(ac_df.action)
max_action = np.max(action_mat, 0)
mean_action = np.mean(action_mat, 0)
print(f"mean: {mean_action}")
print(f"max: {max_action}")

mean: [0.01647861 0.00798887]
max: [1.59 2.34]


In [4]:
'''
Function that splits the data into a training, validation, and test set
'''
def split_data(dataset, train_split, seed):
    np.random.seed(seed)
    indices = list(range(len(dataset)))
    np.random.shuffle(indices)

    train_num = int(len(dataset)*train_split)
    val_num = (len(dataset) - int(len(dataset)*train_split))//2

    train_indices = indices[0:train_num]
    val_indices = indices[train_num:train_num+val_num]
    test_indices = indices[train_num+val_num:]

    #check to make sure slices correct
    assert len(dataset) == len(train_indices) + len(val_indices) + len(test_indices)

    #dataset = help.normalize(train_indices, dataset)

    train_data = dataset.iloc[train_indices,:]
    val_data = dataset.iloc[val_indices,:]
    test_data = dataset.iloc[test_indices,:]

    return train_data, val_data, test_data

train_data, val_data, test_data = split_data(ac_df, 0.7, 2430)

print(test_data.columns)

print(f"Length of training data: {len(train_data)}")

Index(['playIndex', 'timeIndex', 'state', 'next_state', 'reward', 'action',
       'next_action'],
      dtype='object')
Length of training data: 154821


In [5]:
def test_loop(test_df, update_actor, update_critic, test_loss_fn, device):
    size = len(test_df)
    test_q_loss = 0
    test_a_loss = 0
    update_actor.eval()
    update_critic.eval()
    
    with torch.no_grad():
        for row_index in range(0,len(test_df)):
            #try:
            row = test_df.iloc[row_index, :]

            state = torch.tensor(row['state'], dtype=torch.float32).to(device=device)
            action = torch.tensor(row['action'], dtype=torch.float32).to(device=device)
            reward = torch.tensor(row['reward'], dtype=torch.float32).to(device=device)
            next_state = torch.tensor(row['next_state'], dtype=torch.float32).to(device=device)
            next_action = torch.tensor(row['next_action'], dtype=torch.float32).to(device=device)
            
            # do this because forget to replace nans in next state in dataset construction
            next_state = torch.nan_to_num(next_state, nan=0)
            #if next_state == 0:
            #    continue

            # Update Q
            eval_input = torch.cat((state, action), 0)
            eval_input = torch.unsqueeze(eval_input, 0)
            q_eval = update_critic(eval_input)

            target_input = torch.cat((next_state, next_action), 0)
            target_input = torch.unsqueeze(target_input, 0)
            q_target = reward + gamma*update_critic(target_input)

            q_loss = test_loss_fn(q_eval, q_target)
            test_q_loss += q_loss
    
            # get actor loss
            actor_next_action = update_actor(torch.unsqueeze(state,0))
            next_action = torch.unsqueeze(next_action,0)
            a_loss = test_loss_fn(actor_next_action, next_action)
            test_a_loss += a_loss
    
        test_q_loss /= size
        test_a_loss /= size
            #print(f"Avg loss: {test_loss:>8f} \n")

    return [test_q_loss, test_a_loss]

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

cuda


In [7]:
'''
Define models
'''

'''
Input: Takes in a (state,action) pair
Output: Outputs a Q-score
'''
class Critic(nn.Module):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        
        self.layer2_size = (state_size+action_size)*2
        
        #self.batch_initial = nn.BatchNorm1d(state_size+action_size)
        self.batch1 = nn.BatchNorm1d(self.layer2_size)
        self.linear1 = nn.Linear(state_size+action_size, self.layer2_size)
        self.linear2 = nn.Linear(self.layer2_size, 1)
        
        self.relu = nn.ReLU()
        
    def forward(self, inp):
        
        #inp = self.batch_initial(inp)
        
        layer1_output = self.relu(self.batch1(self.linear1(inp)))
        output = self.linear2(layer1_output)
        return output
    
'''
Actor
Input: states
Output: action 
'''
class Actor(nn.Module):
    def __init__(self, state_size, action_size, max_action):
        super(Actor, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.max_action = max_action
        
        self.layer2_size = state_size*2
        self.layer3_size = state_size
        
        self.batch_initial = nn.BatchNorm1d(state_size)
        self.linear1 = nn.Linear(state_size, self.layer2_size)
        self.batch1 = nn.BatchNorm1d(self.layer2_size)
        self.linear2 = nn.Linear(self.layer2_size, self.layer3_size)
        self.batch2 = nn.BatchNorm1d(self.layer3_size)
        self.linear3 = nn.Linear(self.layer3_size, action_size)
        
        self.relu = nn.ReLU()
        
        
    def forward(self, state):
        
        state = self.batch_initial(state)
        layer1_output = self.relu(self.batch1(self.linear1(state)))
        layer2_output = self.relu(self.batch2(self.linear2(layer1_output)))
        output = self.linear3(layer2_output)
        
        output = self.max_action*torch.tanh(output)
        
        return output

##############
### Notes ####
# How to use max action??
##############

    
'''
Define hyperparamters
'''
BATCH_SIZE = 128
iterations = 25000
state_size = len(ac_df.loc[0,'state'])
action_size = 2
gamma = 0.99
max_action = torch.tensor(max_action, dtype=torch.float32).to(device=device)

print(f"State size: {state_size}")
print(f"Action size: {action_size}")

State size: 138
Action size: 2


In [8]:
# create models
update_actor = Actor(state_size, action_size, max_action).to(device=device)
update_critic = Critic(state_size, action_size).to(device=device)

target_actor = type(update_actor)(state_size, action_size, max_action).to(device=device)
target_actor.load_state_dict(update_actor.state_dict())

target_critic = type(update_critic)(state_size, action_size).to(device=device)
target_critic.load_state_dict(update_critic.state_dict())

# define loss function
target_loss_fn = nn.MSELoss()
# define optimizers
actor_optimizer = optim.Adam(update_actor.parameters())
critic_optimizer = optim.Adam(update_critic.parameters())


In [9]:
########
### Use random weights as baseline
########
#untrained_actor = type(target_actor)(state_size, action_size).to(device=device)
#untrained_actor.load_state_dict(untrained_actor.state_dict())

In [10]:
'''
Training loop

Need to add action maximum, else exploding

'''


epochs = 2
break_var = False
training_q_loss_list = []
training_actor_loss_list = []
val_q_loss_list = []
val_actor_loss_list = []

num_batches = int(len(train_data)/BATCH_SIZE)+1

print(f"Num batches: {num_batches}")

COPY_TARGETS_INDEX = int(num_batches/5)

for k in range(epochs):
    
    update_actor.train()
    update_critic.train()
    
    for i in range(num_batches):

        # update weights of target network
        if i % COPY_TARGETS_INDEX == 0:
            target_critic.load_state_dict(update_critic.state_dict())
            target_actor.load_state_dict(update_actor.state_dict())


        #play = sample_from_plays(BATCH_SIZE, train_data, seed=k)
        # get sample from data
        start_index = i*BATCH_SIZE
        end_index = min(len(train_data), (i+1)*BATCH_SIZE)

        play = train_data.iloc[list(range(start_index, end_index)), :]

        state = torch.tensor(np.stack(play['state'].values), dtype=torch.float32).to(device=device)
        action = torch.tensor(np.stack(play['action'].values), dtype=torch.float32).to(device=device)
        reward = torch.tensor(play['reward'].values, dtype=torch.float32).to(device=device)
        next_state = torch.tensor(np.stack(play['next_state'].values), dtype=torch.float32).to(device=device)
        next_action = torch.tensor(np.stack(play['next_action'].values), dtype=torch.float32).to(device=device)

        # do this because forget to replace nans in next state in dataset construction
        next_state = torch.nan_to_num(next_state, nan=0)
        
        # Why does a state have nan values?
        state = torch.nan_to_num(state, nan=0)

        #update_actor, update_critic, target_actor, target_critic

        #print('next state', next_state)
        #state_nan = torch.isnan(next_state)
        #sum_state = sum(state_nan)
        #print('sum state', sum_state)
        #print('state nan', state_nan)
        #print('tens')
        #print(next_state[10:13, :])
        
        # begin updating critic
        target_action = target_actor(next_state)
        
        #print('target action', target_action)
        target_q_input = torch.cat((next_state, target_action), 1)
        target_q_output = target_critic(target_q_input)
        target_q_values = torch.unsqueeze(reward,1) + gamma*target_q_output
        
        #print(f"target q values: {target_q_values}")
        
        # get q values for current state
        current_q_values = update_critic(torch.cat((state, action), 1))
        #print(f"curr q values: {current_q_values}")
        # calculate loss
        critic_loss = target_loss_fn(target_q_values, current_q_values)

        #print('critic loss', critic_loss)
        
        # update critic
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()
        
        #print(f"State: {state.shape}")
        out = update_actor(state)
        #print(f"actor model: {out}")

        # take mean across batched samples to get scalar
        actor_input = torch.cat((state, update_actor(state)), 1)
        #print(f"actor input: {actor_input}")
        actor_loss = -update_critic(actor_input).mean()
        
        #print(f"actor loss: {actor_loss}")

        # update actor
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()

        training_q_loss_list.append(critic_loss)
        training_actor_loss_list.append(actor_loss)

        if (i % int(num_batches/3) == 0): #and (i != 0):
            val_q_loss, val_actor_loss = test_loop(val_data, update_actor, update_critic, F.mse_loss, device)
            val_q_loss_list.append((k,val_q_loss))
            val_actor_loss_list.append((k,val_actor_loss))
            print(f"At epoch {k}, iter {i}: train Q loss = {critic_loss}")
            print(f"At epoch {k}, iter {i}: train Actor loss = {actor_loss}")
            print(f"At epoch {k}, iter {i}: val Q loss = {val_q_loss}")
            print(f"At epoch {k}, iter {i}: val Actor loss = {val_actor_loss}")
            update_actor.train()
            update_critic.train()

        #if len(loss_list) > 1:
        #    if loss_list[-1][1] > loss_list[-2][1]:
        #        break_var = True
                #break
                    
        #if break_var:
        #    break
        

Num batches: 1210
At epoch 0, iter 0: train Q loss = 0.36700448393821716
At epoch 0, iter 0: train Actor loss = 0.14679290354251862
At epoch 0, iter 0: val Q loss = 0.3934997022151947
At epoch 0, iter 0: val Actor loss = 1.6569722890853882
At epoch 0, iter 403: train Q loss = 0.04742977023124695
At epoch 0, iter 403: train Actor loss = -0.684864342212677
At epoch 0, iter 403: val Q loss = 0.31308141350746155
At epoch 0, iter 403: val Actor loss = 3.1866886615753174
At epoch 0, iter 806: train Q loss = 0.04839809238910675
At epoch 0, iter 806: train Actor loss = -1.6457628011703491
At epoch 0, iter 806: val Q loss = 0.31144979596138
At epoch 0, iter 806: val Actor loss = 3.203526020050049
At epoch 0, iter 1209: train Q loss = 0.04650387167930603
At epoch 0, iter 1209: train Actor loss = -2.14587664604187
At epoch 0, iter 1209: val Q loss = 0.30496200919151306
At epoch 0, iter 1209: val Actor loss = 3.2190263271331787
At epoch 1, iter 0: train Q loss = 0.4030025601387024
At epoch 1, iter

In [None]:
#torch.save(update_actor.state_dict(), 'saved_models/actor_critic_actor.pt')
#torch.save(update_critic.state_dict(), 'saved_models/actor_critic_critic.pt')

In [None]:
#### compare random baseline on test data
test_q_loss, test_actor_loss = test_loop(test_data, update_actor, update_critic, F.mse_loss, device)

print(f"Trained Critic test MSE: {test_q_loss}")
print(f"Trained Actor model test MSE: {test_actor_loss}")

In [None]:

def compare_actions(df, row_index, update_actor, update_critic, max_action,verbose=False):
    
    update_actor.eval()
    update_critic.eval()

    sample_row = df.iloc[row_index,:]
    
    state = torch.tensor(sample_row['state'], dtype=torch.float32).to(device=device)
    action = torch.tensor(sample_row['action'], dtype=torch.float32).to(device=device)
    next_action = torch.tensor(sample_row['next_action'], dtype=torch.float32).to(device=device)
    
    actor_next_action = update_actor(torch.unsqueeze(state,0)).detach().cpu()

    fig, ax = plt.subplots()

    ax.scatter(next_action[0].detach().cpu(), next_action[1].detach().cpu(), c='red', s=50)
    ax.scatter(actor_next_action[0,0], actor_next_action[0,1], c='blue', s=50)
    ax.annotate('TRUE', (next_action[0].detach().cpu(), next_action[1].detach().cpu()))
    ax.annotate('PREDICTED', (actor_next_action[0,0].detach().cpu(), actor_next_action[0,1].detach().cpu()))
    plt.title("True vs Predicted Action")
    plt.xlabel("X pos")
    plt.ylabel("Y pos")
    plt.xlim(-max_action[0], max_action[0])
    plt.ylim(-max_action[1], max_action[1])

    plt.savefig("general_images/actor_critic_action_plot.png")
    plt.show()


In [None]:
print(max_action)

for k in range(3,4):
    compare_actions(test_data, k, update_actor, update_critic, max_action.detach().cpu())

In [None]:
def plot_history(history_list, metric, filename):
    
    fig, ax = plt.subplots()

    # plotting
    ax.plot(list(range(1,len(history_list)+1)), history_list)
    plt.title("Training Curve")
    plt.xlabel("Epochs")
    plt.ylabel(f"{metric}")
    plt.show()

    #file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
    fig.savefig(filename)
    
q_loss_list = [0.4091, 0.3140, 0.3079, 0.3054, 0.3026, 0.3057, 0.3069, 0.3074]
#plot_history(q_loss_list, 'MSE Loss', 'training_plots/actor_critic_q_loss.png')

plot_actor_loss_list = [2.9610, 3.1350, 3.2589,3.2115,3.1616, 3.2048, 3.2249]
plot_history(plot_actor_loss_list, 'MSE Actor Loss', 'training_plots/actor_critic_actor_loss.png')
#print(val_actor_loss_list)