In [2]:
import numpy as np
import pandas as pd
import json
import pickle

import tensorflow as tf

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# create model

'''
Actor
Input: states
Output: action 
'''
class Actor(nn.Module):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        
        self.linear1 = nn.Linear(state_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, action_size)
        
        self.relu = nn.ReLU()
        
    def forward(self, state):
        
        lay_out = self.linear1(state)
        layer1_output = self.relu(lay_out)
        layer2_output = self.relu(self.linear2(layer1_output))
        output = self.relu(self.linear3(layer2_output))
        
        return output

    
'''
Critic
Input: state, action pair
Output: (value)
'''
class Critic(nn.Module):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        
        self.linear1 = nn.Linear(state_size+action_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, 1)
        
        self.relu = nn.ReLU()
        
    def forward(self, inp):
        
        layer1_output = self.relu(self.linear1(inp))
        layer2_output = self.relu(self.linear2(layer1_output))
        output = self.relu(self.linear3(layer2_output))
        
        return output

In [5]:
df = pd.read_pickle('all_players.pkl')

df.loc[:, 'next_action'] = df.groupby('playIndex').action.shift(-1)

# drop na if using sarsa model
sarsa_df = df.dropna(axis=0, how='any', inplace=False)

sarsa_df.head()

Unnamed: 0,playIndex,timeIndex,state,next_state,reward,action,next_action
0,0,48,"[11.4, 42.67, 12.42, 42.51, 21.17, 43.14, 20.0...","[11.39, 42.66, 11.58, 42.54, 21.24, 43.33, 20....",0.01,"[0.01, 0.01]","[-0.02, 0.03]"
1,0,49,"[11.39, 42.66, 11.58, 42.54, 21.24, 43.33, 20....","[11.41, 42.63, 11.3, 42.45, 21.34, 43.51, 20.3...",0.04,"[-0.02, 0.03]","[-0.04, 0.07]"
2,0,50,"[11.41, 42.63, 11.3, 42.45, 21.34, 43.51, 20.3...","[11.45, 42.56, 11.32, 42.34, 21.48, 43.67, 20....",0.08,"[-0.04, 0.07]","[-0.08, 0.11]"
3,0,51,"[11.45, 42.56, 11.32, 42.34, 21.48, 43.67, 20....","[11.53, 42.45, 11.15, 42.28, 21.67, 43.81, 20....",0.14,"[-0.08, 0.11]","[-0.09, 0.13]"
4,0,52,"[11.53, 42.45, 11.15, 42.28, 21.67, 43.81, 20....","[11.62, 42.32, 11.51, 42.1, 21.88, 43.92, 21.2...",0.16,"[-0.09, 0.13]","[-0.16, 0.16]"


In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

cpu


  return torch._C._cuda_getDeviceCount() > 0


In [28]:
# return batch_size of plays
def sample_from_plays(batch_size, df):
    
    indices = np.random.choice(len(df), batch_size, replace=False)
    
    plays = df.iloc[indices, :]
    
    return plays

'''
Function that splits the data into a training, validation, and test set
'''
def split_data(dataset, train_split, seed):
    np.random.seed(seed)
    indices = list(range(len(dataset)))
    np.random.shuffle(indices)

    train_num = int(len(dataset)*train_split)
    val_num = (len(dataset) - int(len(dataset)*train_split))//2

    train_indices = indices[0:train_num]
    val_indices = indices[train_num:train_num+val_num]
    test_indices = indices[train_num+val_num:]

    #check to make sure slices correct
    assert len(dataset) == len(train_indices) + len(val_indices) + len(test_indices)

    #dataset = help.normalize(train_indices, dataset)

    train_data = dataset.loc[train_indices,:]
    val_data = dataset.loc[val_indices,:]
    test_data = dataset.loc[test_indices,:]

    return train_data, val_data, test_data
    
    

In [59]:
# define hyperparameters
iterations = 5
state_size = len(df.loc[0,'state'])
action_size = 2
gamma = 0.99

# create models
critic = Critic(state_size, action_size).to(device=device)
actor = Actor(state_size, action_size).to(device=device)
critic_target = critic.to(device=device)
actor_target = actor.to(device=device)

# define loss function
critic_loss_function = nn.MSELoss()
# define optimizers
actor_optimizer = optim.Adam(actor.parameters())
critic_optimizer = optim.Adam(critic.parameters())

print(f"State size: {state_size}")

State size: 46


In [71]:
train_data, val_data, test_data = split_data(df, 0.7, 2430)

print(test_data.columns)
#print(test_data.head())
#print(test_data[test_data.columns])

Index(['playIndex', 'timeIndex', 'state', 'next_state', 'reward', 'action'], dtype='object')


In [61]:
'''
for name, param in actor.named_parameters():
    if param.requires_grad:
        print(name, param.data)
'''

'\nfor name, param in actor.named_parameters():\n    if param.requires_grad:\n        print(name, param.data)\n'

In [62]:


for k in range(iterations):
    
    play = sample_from_plays(4, train_data)

    state = torch.tensor(np.stack(play['state'].values), dtype=torch.float32).to(device=device)
    next_state = torch.tensor(np.stack(play['next_state'].values), dtype=torch.float32).to(device=device)
    reward = torch.tensor(play['reward'].values).to(device=device)
    true_action = torch.tensor(np.stack(play['action'].values), dtype=torch.float32).to(device=device)

    actor_predicted_states = actor_target(state)
    critic_input_from_actor = torch.cat((next_state, actor_states), 1)

    # set y values
    y = reward + gamma*critic_target(critic_input_from_actor)
    y = y.to(torch.float32)

    #
    critic_input_true_actions = torch.cat((next_state, true_action), 1)
    true_y = critic(critic_input_true_actions)

    # update critic
    critic.zero_grad()
    critic_loss = critic_loss_function(y, true_y)
    critic_loss.backward()
    critic_optimizer.step()

    # update actor
    policy_loss = -critic(torch.cat((state, actor(state)),1))
    policy_loss = policy_loss.mean()
    policy_loss.backward()

        

In [76]:
'''
Define metrics
'''


def test_loop(test_df, actor_model, loss_fn, device):
    size = len(test_df)
    test_loss = 0

    with torch.no_grad():
        for row in test_df[test_df.columns]:
            
            state = torch.tensor(np.stack(play['state'].values), dtype=torch.float32).to(device=device)
            next_state = torch.tensor(np.stack(play['next_state'].values), dtype=torch.float32).to(device=device)
            reward = torch.tensor(play['reward'].values).to(device=device)
            true_action = torch.tensor(np.stack(play['action'].values), dtype=torch.float32).to(device=device)

            actor_states = actor_target(state)
            
            print(actor_states.shape)
            print(true_action.shape)
            
            individual_loss = loss_fn(actor_states, next_state)

            test_loss += individual_loss

        test_loss /= size
        correct /= size
        print(f"Avg loss: {test_loss:>8f} \n")
    
    return test_loss
        
avg_test_loss = test_loop(test_data, actor, F.mse_loss, device)



torch.Size([4, 2])
torch.Size([4, 2])


  individual_loss = loss_fn(actor_states, next_state)


RuntimeError: The size of tensor a (2) must match the size of tensor b (46) at non-singleton dimension 1

In [None]:
#####################
#####################
## Test code
#####################
#####################

In [None]:

for k in range(iterations):
    
    play = sample_from_plays(4, train_data)

    state = torch.tensor(np.stack(play['state'].values), dtype=torch.float32).to(device=device)
    next_state = torch.tensor(np.stack(play['next_state'].values), dtype=torch.float32).to(device=device)
    reward = torch.tensor(play['reward'].values).to(device=device)
    true_action = torch.tensor(np.stack(play['action'].values), dtype=torch.float32).to(device=device)

    actor_states = actor_target(state)
    critic_input_from_actor = torch.cat((next_state, actor_states), 1)

    # set y values
    y = reward + gamma*critic_target(critic_input_from_actor)
    y = y.to(torch.float32)

    #
    critic_input_true_actions = torch.cat((next_state, true_action), 1)
    true_y = critic(critic_input_true_actions)

    # update critic
    critic.zero_grad()
    critic_loss = critic_loss_function(y, true_y)
    critic_loss.backward()
    critic_optimizer.step()

    # update actor
    policy_loss = -critic(torch.cat((state, actor(state)),1))
    policy_loss = policy_loss.mean()
    policy_loss.backward()

        

In [3]:
# Don't need buffer because have all states
import ast

def from_np_array(array_string):
    array_string = ','.join(array_string.replace('[ ', '[').split())
    index = 0
    #array_string = ','.join(array_string.replace('\n', ' ').split())
    try:
        return np.array(ast.literal_eval(array_string))
    except:
        index += 1


df = pd.read_csv("all_players.csv", index_col=0, converters={'state':from_np_array, 'next_state':from_np_array, 'action':from_np_array})

df.head()

#from_np_array(df['state'][0])

Unnamed: 0,playIndex,timeIndex,state,next_state,reward,action
0,0,48,"[11.4, 42.67, 55.74, 36.49, 47.81, 23.35, 47.3...","[11.39, 42.66, 55.07, 36.68, 46.88, 23.75, 46....",0.01,"[0.01, 0.01]"
1,0,49,"[11.39, 42.66, 55.07, 36.68, 46.88, 23.75, 46....","[11.41, 42.63, 54.4, 36.86, 45.96, 24.14, 45.6...",0.04,"[-0.02, 0.03]"
2,0,50,"[11.41, 42.63, 54.4, 36.86, 45.96, 24.14, 45.6...","[11.45, 42.56, 53.73, 37.04, 45.04, 24.53, 44....",0.08,"[-0.04, 0.07]"
3,0,51,"[11.45, 42.56, 53.73, 37.04, 45.04, 24.53, 44....","[11.53, 42.45, 53.07, 37.21, 44.11, 24.92, 43....",0.14,"[-0.08, 0.11]"
4,0,52,"[11.53, 42.45, 53.07, 37.21, 44.11, 24.92, 43....","[11.62, 42.32, 52.41, 37.37, 43.17, 25.29, 43....",0.16,"[-0.09, 0.13]"


In [233]:
actor_states = actor_target(next_state)

print(f"Actor states: {actor_states}")

# concat state (N, X) and action (N, Y)
inp = torch.cat((next_state, actor_states), 1)

ta = critic_target(inp)

print(f"Critic targets: {ta}")

print(reward.shape)
print(ta.shape)

y = reward + gamma*ta
print(y)

print(state.shape)
print(actor(state).shape)

-critic(torch.cat((state, actor(state)),1)).mean()

Actor states: tensor([[0.0000, 7.6483],
        [0.0000, 6.5292]], grad_fn=<ReluBackward0>)
Critic targets: tensor([[3.4059],
        [3.6254]], grad_fn=<ReluBackward0>)
torch.Size([2])
torch.Size([2, 1])
tensor([[3.9719, 4.0819],
        [4.1891, 4.2991]], dtype=torch.float64, grad_fn=<AddBackward0>)
torch.Size([2, 46])
torch.Size([2, 2])


tensor(-3.4946, grad_fn=<NegBackward>)

In [225]:
play = sample_from_plays(2, df)

print(torch.tensor(np.stack(play['state'].values)))

print("################")

state = torch.tensor(np.stack(play['state'].values), dtype=torch.float32)
next_state = torch.tensor(np.stack(play['next_state'].values), dtype=torch.float32)
reward = torch.tensor(play['reward'].values)
true_action = torch.tensor(np.stack(play['action'].values), dtype=torch.float32)

# print(play['action'].values)
# print(torch.tensor(play['action']))

print(state)
print(next_state)
print(reward)
print(true_action)
print("####################")

print(state.size())
print(true_action.size())
cat =torch.cat((state,true_action), 1)
print(cat)
print(cat.size())

tensor([[ 88.2000,   4.6300,  84.4500,   6.4400,  82.7200,  14.6000,  87.4600,
           4.1400,  91.2100,  16.9700,  84.9500,  18.2500,  87.3600,  21.5200,
          89.0500,  15.4500,  64.7300,  12.2100,  74.0600,  27.6600,  88.8400,
          13.4600,  88.0400,   6.0000,  88.6000,  22.1000,  87.9500,   7.1700,
          87.3100,   5.5000,  89.7500,  13.2300,  84.8300,   7.1400,  89.0600,
          19.7400,  92.1000,  22.9200,  82.8900,   6.9900,  88.8600,   6.9700,
          88.8600,   9.5400,  88.1400,   4.6600],
        [102.2200,  13.6100,  66.9300,  19.8600,  80.6100,  19.3300,  84.1500,
          28.1300,  83.7600,   8.2300,  83.6500,  16.9700,  82.7500,  21.3900,
          84.4700,  35.4800,  78.6300,   9.7200,  82.0200,  12.1900,  82.7500,
          10.5500,  86.0800,  24.4400,  75.7200,  19.4500,  85.6500,  33.7300,
          83.1700,  21.9400,  84.8100,  17.4000,  88.8100,  10.3300,  85.1000,
          12.2300,  87.3000,  24.1100,  82.9300,  18.6300,  82.9200,  18.2300,
  