In [None]:
import sys
sys.path.append("/home/q621464/Desktop/Thesis/code/decision-transformer-thesis")
sys.path.append("/home/q621464/Desktop/Thesis/code/decision-transformer-thesis/smart-climate")

In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
import gym
import numpy as np
import torch
import wandb

import argparse
import pickle
import random
import sys

from decision_transformer.evaluation.evaluate_episodes import evaluate_episode, evaluate_episode_rtg
from decision_transformer.models.decision_transformer import DecisionTransformer
from decision_transformer.models.mlp_bc import MLPBCModel
from decision_transformer.training.act_trainer import ActTrainer
from decision_transformer.training.seq_trainer import SequenceTrainer
from tqdm import tqdm

In [None]:
def discount_cumsum(x, gamma):
    discount_cumsum = np.zeros_like(x)
    discount_cumsum[-1] = x[-1]
    for t in reversed(range(x.shape[0]-1)):
        discount_cumsum[t] = x[t] + gamma * discount_cumsum[t+1]
    return discount_cumsum

In [None]:
class Config():
    def __init__(self, seed=123, context_length=30, epochs=5, model_type='reward_conditioned', num_steps=500000, num_buffers=50, env='SmartClimate', batch_size=128, log_to_wandb=False, trajectories_per_buffer=10, train_data_dir='../atari/data-for-dt/smart-climate-train-trajectories-v2.pkl', val_data_dir='../atari/data-for-dt/smart-climate-val-trajectories-v2.pkl', test_data_dir='../atari/data-for-dt/smart-climate-test-trajectories-v2.pkl') -> None:
        self.seed = seed
        self.context_length = context_length
        self.epochs = epochs
        self.model_type = model_type
        self.num_steps =num_steps
        self.num_buffers = num_buffers
        self.env = env
        self.batch_size = batch_size
        self.log_to_wandb = log_to_wandb
        self.trajectories_per_buffer = trajectories_per_buffer
        self.train_data_dir = train_data_dir
        self.val_data_dir = val_data_dir
        self.test_data_dir = test_data_dir
        self.dim_reductor = None

In [None]:
def experiment(
        exp_prefix,
        variant,
):
    device = variant.get('device', 'cuda')
    log_to_wandb = variant.get('log_to_wandb', False)

    env_name, dataset = variant['env'], variant['dataset']
    model_type = variant['model_type']
    group_name = f'{exp_prefix}-{env_name}-{dataset}'
    exp_prefix = f'{group_name}-{random.randint(int(1e5), int(1e6) - 1)}'

    if env_name == 'hopper':
        env = gym.make('Hopper-v3')
        max_ep_len = 1000
        env_targets = [3600, 1800]  # evaluation conditioning targets
        scale = 1000.  # normalization for rewards/returns
    elif env_name == 'halfcheetah':
        env = gym.make('HalfCheetah-v3')
        max_ep_len = 100
        env_targets = [12000, 6000]
        scale = 1000.
    elif env_name == 'walker2d':
        env = gym.make('Walker2d-v3')
        max_ep_len = 1000
        env_targets = [5000, 2500]
        scale = 1000.
    elif env_name == 'reacher2d':
        from decision_transformer.envs.reacher_2d import Reacher2dEnv
        env = Reacher2dEnv()
        max_ep_len = 100
        env_targets = [76, 40]
        scale = 10.
    elif env_name == 'smartclimate':
        from decision_transformer.envs.smart_climate_env import SmartClimateEnv
        env = SmartClimateEnv()
        max_ep_len = 100
        env_targets = [100, 70]
        scale = 1
    else:
        raise NotImplementedError

    if model_type == 'bc':
        env_targets = env_targets[:1]  # since BC ignores target, no need for different evaluations

    # state_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.shape[0]
    


    # load dataset
    # dataset_path = f'data/{env_name}-{dataset}-v2.pkl'
    dataset_path = f"../smart-climate/data/smart-climate/smart-climate-train-trajectories-v5.pkl"
    with open(dataset_path, 'rb') as f:
        trajectories = pickle.load(f)[0:500]

    # save all path information into separate lists
    mode = variant.get('mode', 'normal')
    states, traj_lens, returns = [], [], []
    for path in trajectories:
        if mode == 'delayed':  # delayed: all rewards moved to end of trajectory
            path['rewards'][-1] = path['rewards'].sum()
            path['rewards'][:-1] = 0.
        states.append(path['observations'])
        traj_lens.append(len(path['observations']))
        returns.append(path['rewards'].sum())
    traj_lens, returns = np.array(traj_lens), np.array(returns)
    
    state_dim = states[0].shape[1]
    act_dim = 1

    # used for input normalization
    states = np.concatenate(states, axis=0)
    state_mean, state_std = np.mean(states, axis=0), np.std(states, axis=0) + 1e-6
    num_timesteps = sum(traj_lens)

    print('=' * 50)
    print(f'Starting new experiment: {env_name} {dataset}')
    print(f'{len(traj_lens)} trajectories, {num_timesteps} timesteps found')
    print(f'Average return: {np.mean(returns):.2f}, std: {np.std(returns):.2f}')
    print(f'Max return: {np.max(returns):.2f}, min: {np.min(returns):.2f}')
    print('=' * 50)

    K = variant['K']
    batch_size = variant['batch_size']
    num_eval_episodes = variant['num_eval_episodes']
    pct_traj = variant.get('pct_traj', 1.)

    # only train on top pct_traj trajectories (for %BC experiment)
    num_timesteps = max(int(pct_traj*num_timesteps), 1)
    sorted_inds = np.argsort(returns)  # lowest to highest
    num_trajectories = 1
    timesteps = traj_lens[sorted_inds[-1]]
    ind = len(trajectories) - 2
    while ind >= 0 and timesteps + traj_lens[sorted_inds[ind]] <= num_timesteps:
        timesteps += traj_lens[sorted_inds[ind]]
        num_trajectories += 1
        ind -= 1
    sorted_inds = sorted_inds[-num_trajectories:]

    # used to reweight sampling so we sample according to timesteps instead of trajectories
    p_sample = traj_lens[sorted_inds] / sum(traj_lens[sorted_inds])

    def get_batch(batch_size=256, max_len=K):
        batch_inds = np.random.choice(
            np.arange(num_trajectories),
            size=batch_size,
            replace=True,
            p=p_sample,  # reweights so we sample according to timesteps
        )

        s, a, r, d, rtg, timesteps, mask = [], [], [], [], [], [], []
        for i in range(batch_size):
            traj = trajectories[int(sorted_inds[batch_inds[i]])]
            si = random.randint(0, traj['rewards'].shape[0] - 1)
            
            # get sequences from dataset
            s.append(traj['observations'][si:si + max_len].reshape(1, -1, state_dim))
            a.append(traj['actions'][si:si + max_len].reshape(1, -1, act_dim))
            r.append(traj['rewards'][si:si + max_len].reshape(1, -1, 1))
            
            # print(f"reward shape: {r[-1].shape}")
            # print(f"size of s: {s[-1].shape}")
            if 'terminals' in traj:
                d.append(traj['terminals'][si:si + max_len].reshape(1, -1))
            else:
                d.append(traj['dones'][si:si + max_len].reshape(1, -1))
            timesteps.append(np.arange(si, si + s[-1].shape[1]).reshape(1, -1))
            timesteps[-1][timesteps[-1] >= max_ep_len] = max_ep_len-1  # padding cutoff
            rtg.append(discount_cumsum(traj['rewards'][si:], gamma=1.)[:s[-1].shape[1] + 1].reshape(1, -1, 1))
            if rtg[-1].shape[1] <= s[-1].shape[1]:
                rtg[-1] = np.concatenate([rtg[-1], np.zeros((1, 1, 1))], axis=1)

            # padding and state + reward normalization
            tlen = s[-1].shape[1]
            s[-1] = np.concatenate([np.zeros((1, max_len - tlen, state_dim)), s[-1]], axis=1)
            s[-1] = (s[-1] - state_mean) / state_std
            a[-1] = np.concatenate([np.ones((1, max_len - tlen, act_dim)) * -10., a[-1]], axis=1)
            r[-1] = np.concatenate([np.zeros((1, max_len - tlen, 1)), r[-1]], axis=1)
            d[-1] = np.concatenate([np.ones((1, max_len - tlen)) * 2, d[-1]], axis=1)
            rtg[-1] = np.concatenate([np.zeros((1, max_len - tlen, 1)), rtg[-1]], axis=1) / scale
            timesteps[-1] = np.concatenate([np.zeros((1, max_len - tlen)), timesteps[-1]], axis=1)
            mask.append(np.concatenate([np.zeros((1, max_len - tlen)), np.ones((1, tlen))], axis=1))

        s = torch.from_numpy(np.concatenate(s, axis=0)).to(dtype=torch.float32, device=device)
        a = torch.from_numpy(np.concatenate(a, axis=0)).to(dtype=torch.float32, device=device)
        r = torch.from_numpy(np.concatenate(r, axis=0)).to(dtype=torch.float32, device=device)
        d = torch.from_numpy(np.concatenate(d, axis=0)).to(dtype=torch.long, device=device)
        rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).to(dtype=torch.float32, device=device)
        timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).to(dtype=torch.long, device=device)
        mask = torch.from_numpy(np.concatenate(mask, axis=0)).to(device=device)


        return s, a, r, d, rtg, timesteps, mask
    def eval_episodes(target_rew):
        def fn(model):
            returns, lengths = [], []
            for _ in tqdm(range(num_eval_episodes), disable=False):
                with torch.no_grad():
                    if model_type == 'dt':
                        ret, length = evaluate_episode_rtg(
                            env,
                            state_dim,
                            act_dim,
                            model,
                            max_ep_len=max_ep_len,
                            scale=scale,
                            target_return=target_rew/scale,
                            mode=mode,
                            state_mean=state_mean,
                            state_std=state_std,
                            device=device,
                        )
                    else:
                        ret, length = evaluate_episode(
                            env,
                            state_dim,
                            act_dim,
                            model,
                            max_ep_len=max_ep_len,
                            target_return=target_rew/scale,
                            mode=mode,
                            state_mean=state_mean,
                            state_std=state_std,
                            device=device,
                        )
                returns.append(ret)
                lengths.append(length)
            return {
                f'target_{target_rew}_return_mean': np.mean(returns),
                f'target_{target_rew}_return_std': np.std(returns),
                f'target_{target_rew}_length_mean': np.mean(lengths),
                f'target_{target_rew}_length_std': np.std(lengths),
            }
        return fn

    if model_type == 'dt':
        model = DecisionTransformer(
            state_dim=state_dim,
            act_dim=act_dim,
            max_length=K,
            vocab_size=25,
            max_ep_len=max_ep_len,
            hidden_size=variant['embed_dim'],
            n_layer=variant['n_layer'],
            n_head=variant['n_head'],
            n_inner=4*variant['embed_dim'],
            activation_function=variant['activation_function'],
            n_positions=1024,
            resid_pdrop=variant['dropout'],
            attn_pdrop=variant['dropout'],
        )
    elif model_type == 'bc':
        model = MLPBCModel(
            state_dim=state_dim,
            act_dim=act_dim,
            max_length=K,
            hidden_size=variant['embed_dim'],
            n_layer=variant['n_layer'],
        )
    else:
        raise NotImplementedError

    model = model.to(device=device)

    warmup_steps = variant['warmup_steps']
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=variant['learning_rate'],
        weight_decay=variant['weight_decay'],
    )
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda steps: min((steps+1)/warmup_steps, 1)
    )

    if model_type == 'dt':
        trainer = SequenceTrainer(
            model=model,
            optimizer=optimizer,
            batch_size=batch_size,
            get_batch=get_batch,
            scheduler=scheduler,
            loss_fn=lambda s_hat, a_hat, r_hat, s, a, r: torch.mean((a_hat - a)**2),
            eval_fns=[eval_episodes(tar) for tar in env_targets],
        )
    elif model_type == 'bc':
        trainer = ActTrainer(
            model=model,
            optimizer=optimizer,
            batch_size=batch_size,
            get_batch=get_batch,
            scheduler=scheduler,
            loss_fn=lambda s_hat, a_hat, r_hat, s, a, r: torch.mean((a_hat - a)**2),
            eval_fns=[eval_episodes(tar) for tar in env_targets],
        )

    if log_to_wandb:
        wandb.init(
            name=exp_prefix,
            group=group_name,
            project='decision-transformer',
            config=variant
        )
        # wandb.watch(model)  # wandb has some bug

    for iter in range(variant['max_iters']):
        outputs = trainer.train_iteration(num_steps=variant['num_steps_per_iter'], iter_num=iter+1, print_logs=True)
        if log_to_wandb:
            wandb.log(outputs)


In [None]:
parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='smartclimate')
parser.add_argument('--dataset', type=str, default='medium')  # medium, medium-replay, medium-expert, expert
parser.add_argument('--mode', type=str, default='normal')  # normal for standard setting, delayed for sparse
parser.add_argument('--K', type=int, default=20)
parser.add_argument('--pct_traj', type=float, default=1.)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--model_type', type=str, default='dt')  # dt for decision transformer, bc for behavior cloning
parser.add_argument('--embed_dim', type=int, default=128)
parser.add_argument('--n_layer', type=int, default=3)
parser.add_argument('--n_head', type=int, default=1)
parser.add_argument('--activation_function', type=str, default='relu')
parser.add_argument('--dropout', type=float, default=0.1)
parser.add_argument('--learning_rate', '-lr', type=float, default=1e-4)
parser.add_argument('--weight_decay', '-wd', type=float, default=1e-4)
parser.add_argument('--warmup_steps', type=int, default=10000)
parser.add_argument('--num_eval_episodes', type=int, default=100)
parser.add_argument('--max_iters', type=int, default=10)
parser.add_argument('--num_steps_per_iter', type=int, default=100)
parser.add_argument('--device', type=str, default='cuda')
parser.add_argument('--log_to_wandb', '-w', type=bool, default=False)

In [None]:
args = parser.parse_args("")

In [None]:
experiment('gym-experiment', variant=vars(args))

In [None]:
dataset_path = f"../smart-climate/data/smart-climate/smart-climate-train-trajectories-v5.pkl"
with open(dataset_path, 'rb') as f:
    trajectories = pickle.load(f)

In [None]:
actions = [traj['actions'] for traj in trajectories]

In [None]:
logits = torch.randn(64, 20)

In [None]:
from torch.nn import functional as F
probs = F.softmax(logits, dim=0)
probs.shape

In [None]:
from transformers import DecisionTransformerModel

model_name = "edbeeching/decision-transformer-gym-hopper-expert"
model = DecisionTransformerModel.from_pretrained(model_name)


In [None]:
np.concatenate(actions).min(), np.concatenate(actions).max()

In [None]:
transformers.__version__

In [None]:
from transformers import DecisionTransformerModel

In [None]:
state_embeddings = torch.randn(64, 20, 128)
returns_embeddings = torch.randn(64, 20, 128)

In [None]:
batch_size = 64
seq_length = 20
hidden_size = 128

In [None]:
stacked_inputs = torch.stack(
    (returns_embeddings, state_embeddings), dim=1
).permute(0, 2, 1, 3).reshape(batch_size, 2*seq_length, hidden_size)
stacked_inputs.shape

In [None]:
attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long)
attention_mask.shape


In [None]:
stacked_attention_mask = torch.stack(
    (attention_mask, attention_mask), dim=1
).permute(0, 2, 1).reshape(batch_size, 2*seq_length)
stacked_attention_mask.shape

In [None]:
stacked_attention_mask.shape

In [None]:
x = torch.randn(64, 2, 20, 128)
x.shape

In [None]:
x[:,].shape

In [None]:
states = torch.randn(10, 5)
states.mean()

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Sample data
data = {'visit_id': [1, 1, 2, 2],
        'timestamp': ['2022-12-03 10:50:20', '2022-12-03 10:58:00', '2022-12-03 11:05:17', '2022-12-03 11:20:40'],
        'event': ['Event A', 'Event B', 'Event C', 'Event D']}
df = pd.DataFrame(data)
df

In [None]:
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort the dataframe by visit_id and timestamp
df = df.sort_values(['visit_id', 'timestamp'])

# Initialize an empty list to store new rows
new_rows = []

In [None]:
# Function to calculate new timestamps at 10-second intervals
def generate_new_timestamps(row, next_row):
    time_diff = (next_row['timestamp'] - row['timestamp']).total_seconds()
    num_intervals = int(time_diff / 10)
    new_timestamps = [row['timestamp'] + timedelta(seconds=60 * i) for i in range(1, num_intervals)]
    
    # Ensure new timestamps do not exceed the next event's timestamp
    new_timestamps = [ts for ts in new_timestamps if ts < next_row['timestamp']]
    return new_timestamps

In [None]:
# Iterate through groups (each group is a visit)
for _, group in df.groupby('visit_id'):
    group = group.reset_index(drop=True)
    
    # Iterate through the rows within the group to fill gaps
    for i in range(len(group) - 1):
        current_row = group.iloc[i]
        next_row = group.iloc[i + 1]
        
        new_rows.append(current_row.to_dict())
        
        # Generate new timestamps at 10-second intervals
        new_timestamps = generate_new_timestamps(current_row, next_row)
        for new_timestamp in new_timestamps:
            new_rows.append({'visit_id': current_row['visit_id'], 'timestamp': new_timestamp, 'event': current_row['event']})
    
    # Add the last row in the group
    new_rows.append(group.iloc[-1].to_dict())

# Create a new dataframe with the filled gaps
new_df = pd.DataFrame(new_rows)

# print(new_df)
new_df

In [None]:
new_df[new_df['visit_id'] == 1].head(20)

In [53]:
# Altogether
import pandas as pd
from datetime import datetime, timedelta

# Sample data
data = {
    'visit_id': [1, 1, 2, 2],
    'timestamp': ['2021-05-20 06:30:37.899', '2021-05-20 06:33:05.626', '2021-05-20 06:40:05.626', '2021-05-20 06:43:05.626'],
    'feature1': [10, 20, 15, 30],
    'feature2': [5, 8, 6, 12],
    # ... add other features ...
}
df = pd.DataFrame(data)
df['timestamp'] = pd.to_datetime(df['timestamp'])  # Convert to datetime object

# Sort the dataframe by visit_id and timestamp
df = df.sort_values(['visit_id', 'timestamp'])

# Initialize an empty list to store new rows
new_rows = []

# Function to calculate new timestamps at 10-second intervals
def generate_new_timestamps(row, next_row):
    time_diff = (next_row['timestamp'] - row['timestamp']).total_seconds()
    num_intervals = int(time_diff / 30)
    new_timestamps = [row['timestamp'] + timedelta(seconds=30 * i) for i in range(1, num_intervals)]
    
    # Ensure new timestamps do not exceed the next event's timestamp
    new_timestamps = [ts for ts in new_timestamps if ts < next_row['timestamp']]
    return new_timestamps

# Iterate through groups (each group is a visit)
for _, group in df.groupby('visit_id'):
    group = group.reset_index(drop=True)
    
    # Iterate through the rows within the group to fill gaps
    for i in range(len(group) - 1):
        current_row = group.iloc[i]
        next_row = group.iloc[i + 1]
        
        new_rows.append(current_row.to_dict())
        
        # Generate new timestamps at 10-second intervals
        new_timestamps = generate_new_timestamps(current_row, next_row)
        for new_timestamp in new_timestamps:
            new_row = current_row.copy()
            new_row['timestamp'] = new_timestamp
            new_rows.append(new_row.to_dict())
    
    # Add the last row in the group
    new_rows.append(group.iloc[-1].to_dict())

# Create a new dataframe with the filled gaps
new_df = pd.DataFrame(new_rows)

# print(new_df)
new_df


Unnamed: 0,visit_id,timestamp,feature1,feature2
0,1,2021-05-20 06:30:37.899,10,5
1,1,2021-05-20 06:31:07.899,10,5
2,1,2021-05-20 06:31:37.899,10,5
3,1,2021-05-20 06:32:07.899,10,5
4,1,2021-05-20 06:33:05.626,20,8
5,2,2021-05-20 06:40:05.626,15,6
6,2,2021-05-20 06:40:35.626,15,6
7,2,2021-05-20 06:41:05.626,15,6
8,2,2021-05-20 06:41:35.626,15,6
9,2,2021-05-20 06:42:05.626,15,6


In [54]:
new_df['timestamp'].diff().dt.total_seconds()

0         NaN
1      30.000
2      30.000
3      30.000
4      57.727
5     420.000
6      30.000
7      30.000
8      30.000
9      30.000
10     30.000
11     30.000
Name: timestamp, dtype: float64

### Results investigation

In [None]:
epoch_losses = results['epoch_losses']
val_losses = results['val_losses']
predicted_actions = results['predicted _actions']
target_actions = results['target_actions']
train_epoch_accuracies = results['train_accuracies']
val_accuracies = results['val_accuracies']
returns = results['epoch_returns']

In [None]:
plt.plot(epoch_losses, label='train_loss')
plt.plot(val_losses, label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title(f"train and val loss for {len(train_dataset)} train and {len(val_dataset)} datapoints with batch size {args.batch_size}")
# plt.show()
# plt.savefig(f'Figures/train_and_val_loss_for_{len(train_dataset)}_datapoints.jpg')

In [None]:
plt.plot(train_epoch_accuracies, label='train_acc')
plt.plot(val_accuracies, label='val_acc')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.title(f"train and val acc for {len(train_dataset)} train and {len(val_dataset)} datapoints")
# plt.show()
# plt.savefig(f'Figures/train_and_val_loss_for_{len(train_dataset)}_datapoints.jpg')

In [None]:
from atari.mingpt.envs.smart_climate_env import CustomActionSpace
action_map = CustomActionSpace().actions_map

train_actions_str = [str(action_map[action]) for action in actions]
predicted_actions_str = [str(action) for action in predicted_actions]
target_actions_str = [str(action) for action in target_actions]
val_actions_str = [str(action_map[action]) for action in actions_val]

In [None]:
train_actions_unique, counts = np.unique(train_actions_str, return_counts=True)
train_action_count_dict = dict(zip(train_actions_unique, counts))

target_actions_unique, counts = np.unique(target_actions_str, return_counts=True)
target_action_count_dict = dict(zip(target_actions_unique, counts))

predicted_actions_unique, counts = np.unique(predicted_actions_str, return_counts=True)
predicted_action_count_dict = dict(zip(predicted_actions_unique, counts))

val_actions_unique, counts = np.unique(val_actions_str, return_counts=True)
val_action_count_dict = dict(zip(val_actions_unique, counts))

for action in np.arange(16.0, 28.5, 0.5):
    if str(action) not in predicted_action_count_dict.keys():
        predicted_action_count_dict[str(action)] = 0
    if str(action) not in train_action_count_dict.keys():
        train_action_count_dict[str(action)] = 0
    if str(action) not in target_action_count_dict.keys():
        target_action_count_dict[str(action)] = 0
    if str(action) not in val_action_count_dict.keys():
        val_action_count_dict[str(action)] = 0

sorted_keys = sorted(predicted_action_count_dict)
predicted_action_count_dict = {key: predicted_action_count_dict[key] for key in sorted_keys}
train_action_count_dict = {key: train_action_count_dict[key] for key in sorted_keys}
target_action_count_dict = {key: target_action_count_dict[key] for key in sorted_keys}
val_action_count_dict = {key: val_action_count_dict[key] for key in sorted_keys}

sorted(train_action_count_dict.keys()) == sorted(predicted_action_count_dict.keys())

In [None]:
accuracy = accuracy_score(target_actions_str, predicted_actions_str)
print(f"Evaluation Accuracy: {accuracy * 100:.4f}")

In [None]:
from sklearn.metrics import f1_score
macro_f1 = f1_score(target_actions_str, predicted_actions_str, average='macro')

# Calculate the micro F1 score
micro_f1 = f1_score(target_actions_str, predicted_actions_str, average='micro')

print("Macro F1 Score:", macro_f1)
print("Micro F1 Score:", micro_f1)

In [None]:
classes = np.arange(16, 28.5, 0.5)
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

for target, prediction in zip(target_actions, predicted_actions):
    if target == prediction:
        correct_pred[target] += 1
    total_pred[target] += 1
# print accuracy for each class
for classname, correct_count in correct_pred.items():
    if total_pred[classname] > 0:
        accuracy = 100 * float(correct_count) / total_pred[classname]
    else:
        accuracy = 0
    print(f'Accuracy for class: {classname} is {accuracy:.1f} %')

In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Calculate and display the classification report for each class
class_names = sorted(np.unique(train_actions_str))
report = classification_report(target_actions_str, predicted_actions_str, target_names=class_names)
print("Classification Report:")
print(report)

In [None]:
# Sample data for three datasets
data1 = train_action_count_dict.values()
data4 = val_action_count_dict.values()
data2 = target_action_count_dict.values()
data3 = predicted_action_count_dict.values()

# Define the x-axis labels for the bars
x_labels = train_action_count_dict.keys()

# Create a figure and three subplots side by side
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(20, 5))

# Plot the bar plots on each subplot
bar_width = 1
x = np.arange(len(x_labels))

# Bar plot for Dataset 1
axes[0].bar(x, data1, bar_width, color='blue', label='train actions', edgecolor='black', alpha=0.5)
axes[0].set_xticks(x)
axes[0].set_xticklabels(x_labels, rotation=90)
axes[0].set_title('Target temperature from train dataset')

# Bar plot for Dataset 4
axes[1].bar(x, data4, bar_width, color='purple', label='predicted actions', edgecolor='black', alpha=0.5)
axes[1].set_xticks(x)
axes[1].set_xticklabels(x_labels, rotation=90)
axes[1].set_title('Target temperature from the the validation dataset')

# Bar plot for Dataset 2
axes[2].bar(x, data2, bar_width, color='green', label='target actions', edgecolor='black', alpha=0.5)
axes[2].set_xticks(x)
axes[2].set_xticklabels(x_labels, rotation=90)
axes[2].set_title('Target temperature from the test set')

# Bar plot for Dataset 3
axes[3].bar(x, data3, bar_width, color='red', label='predicted actions', edgecolor='black', alpha=0.5)
axes[3].set_xticks(x)
axes[3].set_xticklabels(x_labels, rotation=90)
axes[3].set_title('Target temperature from the predictions on the test set')



# Add labels and title to the overall figure
plt.suptitle('Bar Plots of Train, Eval and Predicted target temperatures')
plt.tight_layout()
plt.savefig('Figures/temp.jpg')
plt.show()


In [None]:
import numpy as np

def compute_probabilities(labels):
    unique_labels, counts = np.unique(labels, return_counts=True)
    total_counts = len(labels)
    probabilities = counts / total_counts
    return unique_labels, probabilities

def kl_divergence(p, q):
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))


# Compute probabilities and unique labels for each list
labels1, probabilities1 = compute_probabilities(train_actions_str)
labels2, probabilities2 = compute_probabilities(val_actions_str)

# Union of unique labels from both lists
all_labels = np.union1d(labels1, labels2)

# Fill missing labels with zero probabilities
probabilities1_all = np.zeros_like(all_labels, dtype=float)
probabilities2_all = np.zeros_like(all_labels, dtype=float)
probabilities1_all[np.searchsorted(all_labels, labels1)] = probabilities1
probabilities2_all[np.searchsorted(all_labels, labels2)] = probabilities2

# Compute KL-divergence from list1 to list2
kl_divergence_1to2 = kl_divergence(probabilities1_all, probabilities2_all)

# Compute KL-divergence from list2 to list1
kl_divergence_2to1 = kl_divergence(probabilities2_all, probabilities1_all)

print("KL-Divergence from list1 to list2:", kl_divergence_1to2)
print("KL-Divergence from list2 to list1:", kl_divergence_2to1)


In [None]:
train_actions = [float(action) for action in train_actions_str]
val_actions = [float(action) for action in val_actions_str]

print(np.mean(train_actions), np.mean(val_actions))
print(np.median(train_actions), np.median(val_actions))
print(np.std(train_actions), np.std(val_actions))

In [None]:
# Build confusion matrix
unique_labels = np.unique(np.concatenate((target_actions_str, predicted_actions_str)))

# Build confusion matrix
cf_matrix = confusion_matrix(target_actions_str, predicted_actions_str, labels=unique_labels)

# Create a DataFrame from the confusion matrix for better visualization
df_cm = pd.DataFrame(cf_matrix, index=unique_labels, columns=unique_labels)

plt.figure(figsize = (16,10))
# plt.plot(df_cm)
sn.heatmap(df_cm, annot=True, cmap='Blues', cbar=True)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f"Confusion matrix for {len(train_dataset)} train and {len(predicted_actions)} datapoints")
# plt.show()
plt.savefig(f'Figures/Confusion matrix for {len(train_dataset)} train and {len(predicted_actions)} datapoints.jpg')


In [None]:
sns.lineplot(returns)
plt.title("Return using sampling from the test env")
plt.show()
# plt.savefig(f"Figures/return_with_sampling_500_train_traj.jpg")

In [None]:
sns.histplot(sorted(train_actions_str))
plt.xlabel('Target Temperatures')
plt.title("Target Temperature distribution in the training set")
plt.xticks(rotation=90)
plt.show()

In [None]:
sns.histplot(sorted(target_actions_str))
plt.xlabel('Target temperature')
plt.title("Target temperatures distribution in the evaluation set")
plt.xticks(rotation=90)
plt.show()

In [None]:
sns.histplot(sorted(predicted_actions_str))
plt.xlabel('Target temperatures from the evaluation set')
plt.title("Predicted target temperatures distribution from the evaluation set")
plt.xticks(rotation=90)
plt.show()

In [None]:
# # Numbers of pairs of bars you want
# # N = 4

# # Data on X-axis

# # Specify the values of blue bars (height)
# # train_actions = list(train_action_count_dict.values())
# target_actions = list(target_action_count_dict.values())


# # Specify the values of orange bars (height)
# pred_actions = list(predicted_action_count_dict.values())


# print(len(target_actions), len(pred_actions))
# # Position of bars on x-axis
# ind = np.arange(16.0, 28.5, 0.5)
# # ind = np.arange(5)
# # print(ind)

# # Figure size
# plt.figure(figsize=(10,5))

# # Width of a bar 
# width = 0.2       

# # Plotting
# plt.bar(ind, target_actions , width, label='target action')
# plt.bar(ind + width, pred_actions, width, label='pred action')

# plt.xlabel('Actions')
# plt.ylabel('Count')
# plt.title('Action distribution comparison between target and predictions')

# # # xticks()
# # # First argument - A list of positions at which ticks should be placed
# # # Second argument -  A list of labels to place at the given locations
# # plt.xticks(ind+0.1, train_action_count_dict.keys())
# plt.xticks(ind+0.1, ind)

# # Finding the best position for legends and putting it
# plt.legend(loc='best')
# plt.xticks(rotation=90)
# plt.show()

### Hyperparameter Tuning

In [None]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
# from your_pytorch_model import YourModel  # Import your PyTorch model

def objective(trial):
    # Sample hyperparameters to be tuned by Optuna
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    num_hidden_units = trial.suggest_int('num_hidden_units', 32, 256)
    
    # Load the val_dataset
    val_data_dir = "../atari/data-for-dt/smart-climate-val-trajectories-v2.pkl"
    # Create the dataset first
    obss, actions, returns, done_idxs, rtgs, timesteps = create_dataset(val_data_dir, total_trajectories=8000, apply_dim_reduction=True)
    
    # Split your data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(obss, actions, test_size=0.1, random_state=123)
    
    val_dataset = StateActionReturnDataset(X_train, args.context_length*3, y_train, done_idxs, rtgs, timesteps)
    print(f"vocab size: {val_dataset.vocab_size}")
    
    # Define the model here
    mconf = GPTConfig(val_dataset.vocab_size, val_dataset.block_size, n_layer=6, n_head=8, n_embd=128, model_type=args.model_type, max_timestep=max(timesteps), input_dim=len(obss[0]))
    model = GPT(mconf)

    # Train the model for a fixed number of epochs
    epochs = 10
    tconf = TrainerConfig(max_epochs=epochs, batch_size=args.batch_size, learning_rate=6e-4, lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*args.context_length*3, num_workers=4, seed=args.seed, model_type=args.model_type, env=args.env, max_timestep=max(timesteps))
    
    rtg = 100
    max_ep_len = 100
    trainer = Trainer(model, train_dataset, None, tconf, args.env, rtg=rtg, max_ep_len=max_ep_len, num_eval_episodes=1)
    
    trainer.train()
    val_loss, val_accuracy = trainer.evaluate_model()

    # Return the validation loss as the objective value to be minimized
    return val_loss


In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

In [None]:
best_params = study.best_params
learning_rate = best_params['learning_rate']
num_hidden_units = best_params['num_hidden_units']
learning_rate, num_hidden_units
# # Train your final model using the best hyperparameters
# final_model = model(num_hidden_units)
# optimizer = optim.Adam(final_model.parameters(), lr=learning_rate)

# Train the model on the entire training data
# ...


In [None]:
# Load the val_dataset
val_data_dir = "../atari/data-for-dt/smart-climate-val-trajectories-v2.pkl"
# Create the dataset first
obss, actions, returns, done_idxs, rtgs, timesteps = create_dataset(val_data_dir, total_trajectories=8000, apply_dim_reduction=True)
# Sanity check
# Are there any nan values in the obss
print(f"There are nan values in the obss: {np.isnan(np.array(obss)).any()}")
print("*" * len(args.env + "Environment"))
print(f"{args.env} Environment")
print("*" * len(args.env + "Environment"))
print(f"total obss: {len(obss)}\ntotal actions: {actions.shape}\ntotal returns: {returns.shape}\ntimesteps: {len(timesteps)}")

In [None]:
val_dataset = StateActionReturnDataset(obss, args.context_length*3, actions, done_idxs, rtgs, timesteps)
print(f"vocab size: {train_dataset.vocab_size}")

In [None]:
# initialize a trainer instance and kick off training
epochs = 10
tconf = TrainerConfig(max_epochs=epochs, batch_size=args.batch_size, learning_rate=6e-4, lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_dataset)*args.context_length*3, num_workers=4, seed=args.seed, model_type=args.model_type, env=args.env, max_timestep=max(timesteps))

In [None]:
rtg = 100
max_ep_len = 100
trainer = Trainer(model, train_dataset, None, tconf, args.env, rtg=rtg, max_ep_len=max_ep_len, num_eval_episodes=10)
# avg_return, predicted_actions, target_actions, epoch_losses = trainer.train()
# print(f"Average reward achieved: {avg_return:.2f} with RTG: {rtg}")