In [1]:
import numpy as np
import pandas as pd
import json
import pickle

import warnings

#import tensorflow as tf

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torch.utils.data

In [2]:
data_df = pd.read_pickle('all_players.pkl')

data_df.loc[:, 'next_action'] = data_df.groupby('playIndex').action.shift(-1)

# drop na if using sarsa model
sarsa_df = data_df.dropna(axis=0, how='any', inplace=False)

sarsa_df.head()

Unnamed: 0,playIndex,timeIndex,state,next_state,reward,action,next_action
0,0,48,"[11.4, 42.67, 12.42, 42.51, 21.17, 43.14, 20.0...","[11.39, 42.66, 11.58, 42.54, 21.24, 43.33, 20....",0.01,"[0.01, 0.01]","[-0.02, 0.03]"
1,0,49,"[11.39, 42.66, 11.58, 42.54, 21.24, 43.33, 20....","[11.41, 42.63, 11.3, 42.45, 21.34, 43.51, 20.3...",0.04,"[-0.02, 0.03]","[-0.04, 0.07]"
2,0,50,"[11.41, 42.63, 11.3, 42.45, 21.34, 43.51, 20.3...","[11.45, 42.56, 11.32, 42.34, 21.48, 43.67, 20....",0.08,"[-0.04, 0.07]","[-0.08, 0.11]"
3,0,51,"[11.45, 42.56, 11.32, 42.34, 21.48, 43.67, 20....","[11.53, 42.45, 11.15, 42.28, 21.67, 43.81, 20....",0.14,"[-0.08, 0.11]","[-0.09, 0.13]"
4,0,52,"[11.53, 42.45, 11.15, 42.28, 21.67, 43.81, 20....","[11.62, 42.32, 11.51, 42.1, 21.88, 43.92, 21.2...",0.16,"[-0.09, 0.13]","[-0.16, 0.16]"


In [3]:
len(sarsa_df)

204941

In [58]:
x = sarsa_df.loc[1:2, 'state']

stacked = np.stack(x.values)

dl = torch.utils.data.DataLoader(sarsa_df)

In [4]:
# return batch_size of plays
def sample_from_plays(batch_size, df, seed):
    
    np.random.seed(seed)
    
    indices = np.random.choice(len(df), batch_size, replace=False)
    
    plays = df.iloc[indices, :]
    
    return plays

'''
Function that splits the data into a training, validation, and test set
'''
def split_data(dataset, train_split, seed):
    np.random.seed(seed)
    indices = list(range(len(dataset)))
    np.random.shuffle(indices)

    train_num = int(len(dataset)*train_split)
    val_num = (len(dataset) - int(len(dataset)*train_split))//2

    train_indices = indices[0:train_num]
    val_indices = indices[train_num:train_num+val_num]
    test_indices = indices[train_num+val_num:]

    #check to make sure slices correct
    assert len(dataset) == len(train_indices) + len(val_indices) + len(test_indices)

    #dataset = help.normalize(train_indices, dataset)

    train_data = dataset.iloc[train_indices,:]
    val_data = dataset.iloc[val_indices,:]
    test_data = dataset.iloc[test_indices,:]

    return train_data, val_data, test_data
    

In [5]:
BATCH_SIZE = 128

train_data, val_data, test_data = split_data(sarsa_df, 0.7, 2430)

#train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE)
#val_loader = torch.utils.data.DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
#test_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)


print(test_data.columns)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(device)

print(f"Length of training data: {len(train_data)}")

Index(['playIndex', 'timeIndex', 'state', 'next_state', 'reward', 'action',
       'next_action'],
      dtype='object')
cpu
Length of training data: 143458


  return torch._C._cuda_getDeviceCount() > 0


In [9]:
# define hyperparameters
iterations = 25000
state_size = len(data_df.loc[0,'state'])
action_size = 2
gamma = 0.99
COPY_TARGETS_INDEX = 5

# create models
eval_net = Qnet(state_size, action_size).to(device=device)
target_net = type(eval_net)(state_size, action_size).to(device=device)
target_net.load_state_dict(eval_net.state_dict())

# define loss function
loss_fn = nn.MSELoss()
# define optimizers
optimizer = optim.Adam(eval_net.parameters())

print(f"State size: {state_size}")

State size: 46


In [10]:
'''
Neural Network that will represent the Q-function
Input: concatenated (state,action) pair
Output: value
'''


class Qnet(nn.Module):
    def __init__(self, state_size, action_size):
        super(Qnet, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        
        self.linear1 = nn.Linear(state_size+action_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, 128)
        self.linear4 = nn.Linear(128,1)
        
        self.relu = nn.ReLU()
        
    def forward(self, inp):
        
        layer1_output = self.relu(self.linear1(inp))
        layer2_output = self.relu(self.linear2(layer1_output))
        layer3_output = self.relu(self.linear3(layer2_output))
        output = self.linear4(layer3_output)
        
        return output


'''
class Qnet(nn.Module):
    def __init__(self, state_size, action_size):
        super(Qnet, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        
        self.linear1 = nn.Linear(state_size+action_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, 1)
        
        self.relu = nn.ReLU()
        
    def forward(self, inp):
        
        layer1_output = self.relu(self.linear1(inp))
        layer2_output = self.relu(self.linear2(layer1_output))
        output = self.linear3(layer2_output)
        
        return output
'''


'\nclass Qnet(nn.Module):\n    def __init__(self, state_size, action_size):\n        super(Qnet, self).__init__()\n        self.state_size = state_size\n        self.action_size = action_size\n        \n        self.linear1 = nn.Linear(state_size+action_size, 128)\n        self.linear2 = nn.Linear(128, 256)\n        self.linear3 = nn.Linear(256, 1)\n        \n        self.relu = nn.ReLU()\n        \n    def forward(self, inp):\n        \n        layer1_output = self.relu(self.linear1(inp))\n        layer2_output = self.relu(self.linear2(layer1_output))\n        output = self.linear3(layer2_output)\n        \n        return output\n'

In [66]:
#for batch, labels in train_loader:
#    print(batch)
#    print(labels)
    
#    break

In [11]:

def test_loop(test_df, q_model, test_loss_fn, device):
    size = len(test_df)
    test_loss = 0
    
    with torch.no_grad():
        for row_index in range(0,len(test_df)):
            #try:
            row = test_df.iloc[row_index, :]

            state = torch.tensor(row['state'], dtype=torch.float32).to(device=device)
            action = torch.tensor(row['action'], dtype=torch.float32).to(device=device)
            reward = torch.tensor(row['reward'], dtype=torch.float32).to(device=device)
            next_state = torch.tensor(row['next_state'], dtype=torch.float32).to(device=device)
            next_action = torch.tensor(row['next_action'], dtype=torch.float32).to(device=device)

            # Update Q
            eval_input = torch.cat((state, action), 0)

            q_eval = q_model(eval_input)

            target_input = torch.cat((next_state, next_action), 0)
            q_target = reward + gamma*q_model(target_input)

            loss = test_loss_fn(q_eval, q_target)
            test_loss += loss
            #except Exception as e:
                #print(f"Row index: {row_index}")
                #print(F"Exception: {e}")
                #print(row)
    
        test_loss /= size
            #print(f"Avg loss: {test_loss:>8f} \n")

    return test_loss
        
#avg_test_loss = test_loop(test_data, eval_net, F.mse_loss, device)

In [13]:
########
### Random loss is .325604
### Use random weights as baseline
########
untrained_net = type(eval_net)(state_size, action_size).to(device=device)
untrained_net.load_state_dict(eval_net.state_dict())

TypeError: super(type, obj): obj must be an instance or subtype of type

In [20]:

epochs = 10

loss_list = []

num_batches = int(len(train_data)/BATCH_SIZE)+1

print(f"Num batches: {num_batches}")

for k in range(epochs):
    
    with warnings.catch_warnings(record=True) as w:
        
        
        #warnings.simplefilter("error")
        
        for i in range(num_batches):
        
            # update weights of target network
            if i % COPY_TARGETS_INDEX == 0:
                target_net.load_state_dict(eval_net.state_dict())


            #play = sample_from_plays(BATCH_SIZE, train_data, seed=k)
            # get sample from data
            start_index = i*BATCH_SIZE
            end_index = min(len(train_data), (i+1)*BATCH_SIZE)
                            
            play = train_data.iloc[list(range(start_index, end_index)), :]

            state = torch.tensor(np.stack(play['state'].values), dtype=torch.float32).to(device=device)
            action = torch.tensor(np.stack(play['action'].values), dtype=torch.float32).to(device=device)
            reward = torch.tensor(play['reward'].values, dtype=torch.float32).to(device=device)
            next_state = torch.tensor(np.stack(play['next_state'].values), dtype=torch.float32).to(device=device)
            next_action = torch.tensor(np.stack(play['next_action'].values), dtype=torch.float32).to(device=device)

            # Update Q
            eval_input = torch.cat((state, action), 1)
            q_eval = eval_net(eval_input)

            target_input = torch.cat((next_state, next_action), 1)
            q_target = torch.unsqueeze(reward,1) + gamma*target_net(target_input)

            loss = loss_fn(q_eval, q_target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % (num_batches/3) == 0:
                avg_val_loss = test_loop(val_data, eval_net, F.mse_loss, device)
                loss_list.append((k,avg_val_loss))
                print(f"At epoch {k}: avg. val loss = {avg_val_loss}")
        
        if len(w):
            print(f"Batch index {i}")
            print(f"warning: {w}")


Num batches: 1121
At epoch 0: avg. val loss = 17.812326431274414


KeyboardInterrupt: 

In [None]:
loss_list = []

for k in range(iterations):
    
    with warnings.catch_warnings(record=True) as w:
        #warnings.simplefilter("error")
        
        # update weights of target network
        if k % COPY_TARGETS_INDEX == 0:
            target_net.load_state_dict(eval_net.state_dict())


        play = sample_from_plays(128, train_data, seed=k)

        state = torch.tensor(np.stack(play['state'].values), dtype=torch.float32).to(device=device)
        action = torch.tensor(np.stack(play['action'].values), dtype=torch.float32).to(device=device)
        reward = torch.tensor(play['reward'].values, dtype=torch.float32).to(device=device)
        next_state = torch.tensor(np.stack(play['next_state'].values), dtype=torch.float32).to(device=device)
        next_action = torch.tensor(np.stack(play['next_action'].values), dtype=torch.float32).to(device=device)

        # Update Q
        eval_input = torch.cat((state, action), 1)
        q_eval = eval_net(eval_input)

        target_input = torch.cat((next_state, next_action), 1)
        q_target = torch.unsqueeze(reward,1) + gamma*target_net(target_input)

        loss = loss_fn(q_eval, q_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        if (k % 1000 == 0) and (k != 0):
            avg_val_loss = test_loop(val_data, eval_net, F.mse_loss, device)
            loss_list.append((k,avg_val_loss))
            print(f"At iter {k}: avg. val loss = {avg_val_loss}")
        
        if len(w):
            print(f"row index {k}")
            print(f"warning: {w}")