In [None]:
import os
import sys
from pathlib import Path

# Make the repository root and its `atari` directory importable.
# The root comes from the DT_THESIS_ROOT environment variable when it is set;
# otherwise it is assumed to be the parent of the current working directory,
# which matches the relative data paths ('../atari/...') used later in this notebook.
REPO_ROOT = Path(os.environ.get("DT_THESIS_ROOT", Path.cwd().parent)).resolve()
for _import_dir in (REPO_ROOT, REPO_ROOT / "atari"):
    # Skip entries that are already present so re-running the cell does not grow sys.path.
    if str(_import_dir) not in sys.path:
        sys.path.append(str(_import_dir))

In [None]:
import csv
import logging
# make deterministic
from atari.mingpt.utils import set_seed
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from torch.utils.data import Dataset
from atari.mingpt.model_atari import GPT, GPTConfig
from atari.mingpt.trainer_atari import Trainer, TrainerConfig
from atari.mingpt.utils import sample
from collections import deque
import random
import torch
import pickle
import blosc
import argparse
from atari.create_dataset import create_dataset
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
class Config:
    """Plain settings container for a training run.

    Every constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.
    """

    def __init__(
        self,
        seed=123,
        context_length=30,
        epochs=5,
        model_type='reward_conditioned',
        num_steps=500000,
        num_buffers=50,
        env='SmartClimate',
        batch_size=128,
        log_to_wandb=False,
        trajectories_per_buffer=10,
        data_dir='../atari/data-for-dt/smart-climate-train-trajectories.pkl',
    ) -> None:
        # Run / environment identification
        self.env = env
        self.seed = seed
        self.data_dir = data_dir
        self.log_to_wandb = log_to_wandb

        # Model and optimisation settings
        self.model_type = model_type
        self.context_length = context_length
        self.epochs = epochs
        self.batch_size = batch_size

        # Data-sampling settings
        self.num_steps = num_steps
        self.num_buffers = num_buffers
        self.trajectories_per_buffer = trajectories_per_buffer

In [None]:
# Movielens run: override the dataset path, environment name and epoch count;
# every other setting keeps its Config default.
args = Config(
    env='Movielens',
    epochs=30,
    data_dir='../atari/data-for-dt/mlens-train-trajectories-v1.pkl',
)

In [None]:
# Seed the run with the project's set_seed helper before the dataset and model are built.
# NOTE(review): presumably this seeds the Python, NumPy and PyTorch RNGs — confirm in atari.mingpt.utils.
set_seed(args.seed)

In [None]:
def create_dataset(data_dir):
    """Load pickled trajectories and flatten them into per-step training arrays.

    NOTE: this notebook-local definition shadows the ``create_dataset`` that is
    imported from ``atari.create_dataset`` at the top of the notebook.

    Args:
        data_dir: Path of the pickle *file* to read. The file must hold an
            iterable of trajectories; each trajectory is indexed with the keys
            ``'observations'``, ``'actions'`` and ``'rewards'`` and each of
            those values must provide ``.tolist()``.

    Returns:
        Tuple ``(obss, actions, returns, done_idxs, rtg, timesteps)``:

        * ``obss`` -- list with one observation per step, all trajectories
          concatenated in file order.
        * ``actions`` -- array with one action per step.
        * ``returns`` -- zero-filled placeholder array of length
          ``number_of_trajectories + 1``.
        * ``done_idxs`` -- array of exclusive end indices, one per trajectory.
        * ``rtg`` -- array with the reward-to-go of every step (sum of the
          rewards from that step to the end of its trajectory).
        * ``timesteps`` -- integer array of length ``len(actions) + 1`` with
          the position of every step inside its own trajectory, starting at 0.
          The final extra slot is padding and stays 0.

    Raises:
        ValueError: If the file contains no steps at all.
    """
    # SECURITY: pickle.load can execute arbitrary code -- only open trusted files.
    with open(data_dir, 'rb') as f:
        trajectories = pickle.load(f)

    obss = []
    actions = []
    stepwise_returns = []
    done_idxs = []
    for traj in trajectories:
        obss.extend(traj['observations'].tolist())
        actions.extend(traj['actions'].tolist())
        stepwise_returns.extend(traj['rewards'].tolist())
        # Exclusive end index of this trajectory in the flattened arrays.
        done_idxs.append(len(obss))

    if not stepwise_returns:
        raise ValueError(f"no trajectory steps found in {data_dir!r}")

    actions = np.array(actions)
    returns = np.zeros(len(done_idxs) + 1, dtype=int)
    stepwise_returns = np.array(stepwise_returns)
    done_idxs = np.array(done_idxs)

    rtg = np.zeros_like(stepwise_returns)
    timesteps = np.zeros(len(actions) + 1, dtype=int)

    # Both quantities are per-trajectory, so they share one segmentation:
    # trajectory k covers the half-open range [start, end).
    start = 0
    for end in done_idxs:
        end = int(end)
        episode_rewards = stepwise_returns[start:end]
        # Reward-to-go is a suffix sum: reverse, accumulate, reverse back.
        rtg[start:end] = np.cumsum(episode_rewards[::-1], axis=0)[::-1]
        # Timesteps restart at 0 on the first step of every trajectory.
        timesteps[start:end] = np.arange(end - start)
        start = end

    print('max rtg is %d' % rtg.max())
    print(f"total done idx: {len(done_idxs)}")
    print('max timestep is %d' % timesteps.max())

    return obss, actions, returns, done_idxs, rtg, timesteps

In [None]:
class StateActionReturnDataset(Dataset):
    """Fixed-length windows of (states, actions, returns-to-go, start timestep).

    ``block_size`` counts tokens, three per step, so every sample covers
    ``block_size // 3`` consecutive steps starting at the requested index.

    NOTE: windows are *not* clipped to trajectory boundaries. ``done_idxs`` is
    stored for callers, but a sample may span the end of one trajectory and the
    start of the next.
    TODO: needs change in the prepared dataset for Mlens.
    """

    # Lower bound for the per-window standard deviation. It only takes effect
    # for (near-)constant windows, where it prevents a 0/0 = NaN result.
    _STD_EPS = 1e-8

    def __init__(self, data, block_size, actions, done_idxs, rtgs, timesteps):
        """Store the flattened per-step sequences.

        Args:
            data: Sequence with one observation per step.
            block_size: Window length in tokens (3 tokens per step).
            actions: Per-step action ids; the largest id determines ``vocab_size``.
            done_idxs: End index of every trajectory (kept for reference only).
            rtgs: Per-step returns-to-go.
            timesteps: Per-step position within the trajectory.
        """
        self.block_size = block_size
        # TODO: needs to be changed. The vocabulary is inferred from the largest
        # action id present in the data, so it depends on the sampled data.
        self.vocab_size = int(np.max(actions)) + 1
        self.data = data
        self.actions = actions
        self.done_idxs = done_idxs
        self.rtgs = rtgs
        self.timesteps = timesteps

    def __len__(self):
        """Number of valid start indices; never negative, even for very short data."""
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the window that starts at step ``idx``.

        Returns:
            Tuple ``(states, actions, rtgs, timesteps)`` where ``states`` has
            shape ``(steps, -1)`` and is standardised with the mean and standard
            deviation of this window, ``actions`` and ``rtgs`` have shape
            ``(steps, 1)``, and ``timesteps`` has shape ``(1, 1)`` holding the
            timestep of the first step in the window.
        """
        steps = self.block_size // 3
        end = idx + steps

        # Flatten every observation into one feature vector per step.
        states = torch.tensor(np.array(self.data[idx:end]), dtype=torch.float32).reshape(steps, -1)

        # Standardise the window. Clamping the std leaves ordinary windows
        # unchanged and maps constant windows to all zeros instead of NaN.
        mean = torch.mean(states)
        std = torch.clamp(torch.std(states), min=self._STD_EPS)
        states = (states - mean) / std

        actions = torch.tensor(self.actions[idx:end], dtype=torch.long).unsqueeze(1)  # (steps, 1)
        rtgs = torch.tensor(self.rtgs[idx:end], dtype=torch.float32).unsqueeze(1)  # (steps, 1)
        timesteps = torch.tensor(self.timesteps[idx:idx + 1], dtype=torch.int64).unsqueeze(1)  # (1, 1)

        return states, actions, rtgs, timesteps

In [None]:
# Load the flattened trajectories and print basic statistics about them.
obss, actions, returns, done_idxs, rtgs, timesteps = create_dataset(args.data_dir)
print(f"input_dim={len(obss[0])}")

# Sanity check: are there any NaN values in the observations?
print(f"There are nan values in the obss: {np.isnan(np.array(obss)).any()}")

# Size the banner from the exact heading text so both lines have the same width.
heading = f"{args.env} Environment"
banner = "*" * len(heading)
print(banner)
print(heading)
print(banner)
print(f"total obss: {len(obss)}\ntotal actions: {actions.shape}\ntotal returns: {returns.shape}\ntimesteps: {len(timesteps)}")


In [None]:
# Configure the root logger: INFO level, with timestamp, level and logger name on every record.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

In [None]:
# The dataset's block size is counted in tokens: three per step of context.
train_dataset = StateActionReturnDataset(
    data=obss,
    block_size=args.context_length * 3,
    actions=actions,
    done_idxs=done_idxs,
    rtgs=rtgs,
    timesteps=timesteps,
)
print(f"vocab size: {train_dataset.vocab_size}")

In [None]:

# Build the GPT model; vocabulary size and block size come from the dataset,
# the input dimension from the length of a single observation.
mconf = GPTConfig(
    train_dataset.vocab_size,
    train_dataset.block_size,
    n_layer=6,
    n_head=8,
    n_embd=128,
    model_type=args.model_type,
    max_timestep=max(timesteps),
    input_dim=len(obss[0]),
)
model = GPT(mconf)
model

In [None]:
# Trainer configuration; the Trainer itself is created and started further below.
epochs = args.epochs
tconf = TrainerConfig(
    max_epochs=epochs,
    batch_size=args.batch_size,
    learning_rate=6e-4,
    lr_decay=True,
    warmup_tokens=512*20,
    final_tokens=2*len(train_dataset)*args.context_length*3,
    num_workers=4,
    seed=args.seed,
    model_type=args.model_type,
    env=args.env,
    max_timestep=max(timesteps),
)

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
# NOTE(review): this cell runs after the model and trainer config were created;
# consider moving it to the top of the notebook.
%load_ext autoreload
%autoreload 2

In [None]:
# Settings forwarded to the trainer as `rtg` and `max_ep_len`.
rtg = 1000
max_ep_len = 1000

# NOTE(review): the third positional argument (None) is presumably the test dataset — confirm in Trainer.
trainer = Trainer(
    model,
    train_dataset,
    None,
    tconf,
    args.env,
    rtg=rtg,
    max_ep_len=max_ep_len,
    num_eval_episodes=1,
)
avg_return, predicted_actions, target_actions = trainer.train()
print(f"Average reward achieved: {avg_return:.2f} with RTG: {rtg}")

### Results investigation

In [None]:
from atari.mingpt.envs.movielens_env import CustomActionSpace

# Lookup from the action ids stored in the training data to the environment's action values.
action_map = CustomActionSpace().actions_map

# String versions of all three action sequences, used as categorical values in the plots below.
# Training actions go through the action map first; predicted and target actions are used as returned.
train_actions_str = [str(mapped) for mapped in map(action_map.__getitem__, actions)]
predicted_actions_str = list(map(str, predicted_actions))
target_actions_str = list(map(str, target_actions))

In [None]:
# Overlay the action distributions of the training data, the model's predictions
# and the evaluation targets on one set of axes.
for series, colour, series_label in (
    (train_actions_str, 'blue', 'train actions'),
    (predicted_actions_str, 'red', 'pred actions'),
    (target_actions_str, 'green', 'target actions'),
):
    sns.histplot(sorted(series), color=colour, alpha=0.5, label=series_label)

plt.legend()
plt.title("Action distribution: train vs. predicted vs. target")
plt.show()


In [None]:
# Count how often each action occurs in the training data and in the predictions.
train_actions_unique, counts = np.unique(train_actions_str, return_counts=True)
train_action_count_dict = dict(zip(train_actions_unique, counts))

predicted_actions_unique, counts = np.unique(predicted_actions_str, return_counts=True)
predicted_action_count_dict = dict(zip(predicted_actions_unique, counts))

# Make sure every action on the 0.5 ... 5.0 grid has an entry in both dicts.
for action in np.arange(0.5, 5.5, 0.5):
    predicted_action_count_dict.setdefault(str(action), 0)
    train_action_count_dict.setdefault(str(action), 0)

# Sort BOTH dicts by key. The zero-filled keys above are appended at the end,
# so without sorting the two dicts can list their actions in different orders.
train_action_count_dict = {key: train_action_count_dict[key] for key in sorted(train_action_count_dict)}
sorted_keys = sorted(predicted_action_count_dict)
predicted_action_count_dict = {key: predicted_action_count_dict[key] for key in sorted_keys}
train_action_count_dict, predicted_action_count_dict

In [None]:
# Histogram of the actions found in the training data.
ax = sns.histplot(sorted(train_actions_str))
ax.set_xlabel('Actions in the train set')
ax.set_title("Action distribution in the training set")
plt.show()

In [None]:
# Histogram of the target actions returned by the evaluation run.
ax = sns.histplot(sorted(target_actions_str))
ax.set_xlabel('Target actions in the evaluation set')
ax.set_title("Target action distribution in the evaluation set")
plt.show()

In [None]:
# Histogram of the actions the model predicted during the evaluation run.
ax = sns.histplot(sorted(predicted_actions_str))
ax.set_xlabel('Predicted actions from the evaluation set')
ax.set_title("Predicted action distribution from the evaluation set")
plt.show()

In [None]:
# Grouped bar chart: training vs. predicted action counts, one pair of bars per action.

# Bar positions on the x-axis, one per action value on the 0.5 ... 5.0 grid.
ind = np.arange(0.5, 5.5, 0.5)
# The same values as strings; these are the keys of the two count dicts.
action_keys = [str(action) for action in ind]

# Look every count up by its action key, so each bar sits at its own action's
# position regardless of the dicts' insertion order. Actions missing from a dict
# count as 0; keys outside the grid are not shown.
train_actions = [train_action_count_dict.get(key, 0) for key in action_keys]
pred_actions = [predicted_action_count_dict.get(key, 0) for key in action_keys]
print(train_actions, pred_actions)

plt.figure(figsize=(10, 5))

# Width of a single bar; the predicted bar is drawn directly to the right of the train bar.
width = 0.2
plt.bar(ind, train_actions, width, label='train action')
plt.bar(ind + width, pred_actions, width, label='pred action')

plt.xlabel('Action')
plt.ylabel('Count')
plt.title('Train vs. predicted action counts')

# Put each tick midway between the two bar centres of its pair.
plt.xticks(ind + width / 2, action_keys)

plt.legend(loc='best')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Use one explicit label list for the matrix AND the DataFrame axes.
# confusion_matrix builds its rows/columns from the labels seen in either input,
# so labelling the frame with the target labels alone fails whenever the model
# predicts an action that never occurs among the targets.
cm_labels = sorted(set(target_actions_str) | set(predicted_actions_str))
cf_matrix = confusion_matrix(target_actions_str, predicted_actions_str, labels=cm_labels)

# Express every cell as a percentage of all evaluated samples.
df_cm = pd.DataFrame(cf_matrix * 100 / np.sum(cf_matrix), index=cm_labels, columns=cm_labels)

plt.figure(figsize=(12, 8))
sn.heatmap(df_cm, annot=True, cmap='Blues', cbar=True)
# Rows of the matrix are the true (target) actions, columns the predicted ones.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig("confusion_matrix_Movielens_dataset.jpg")
plt.show()
