In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torchsummary import summary
from torch.cuda.amp import autocast, GradScaler

import numpy as np
import gzip
import pickle
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import gc
import time
import random



import sys
sys.path.append('..')
# from slp_package.slp_functions import create_merged_game_data_df
from slp_package.input_dataset import InputDataSet
import slp_package.pytorch_functions as slp_pytorch_functions

def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using CUDA
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [2]:
source_data = ['ranked','public','mango']

general_features = {
    'stage_name': ['FOUNTAIN_OF_DREAMS','FINAL_DESTINATION','BATTLEFIELD','YOSHIS_STORY','POKEMON_STADIUM','DREAMLAND'],
    'num_players': [2],
    'conclusive': [True],
}
player_features = {
    # 'netplay_code': ['MANG#0'],
    # 'character_name': ['FALCO'],
    # 'character_name': ['FOX', 'FALCO', 'MARTH', 'CAPTAIN_FALCON', 'SHEIK'],
    'character_name': ['FOX', 'CAPTAIN_FALCON', 'SHEIK', 'FALCO', 'GAME_AND_WATCH', 'MARTH', 'LINK', 'ICE_CLIMBERS', 'SAMUS', 'GANONDORF', 'BOWSER', 'MEWTWO', 'YOSHI', 'PIKACHU', 'JIGGLYPUFF', 'NESS', 'DR_MARIO', 'MARIO', 'PEACH', 'ROY', 'LUIGI', 'YOUNG_LINK', 'DONKEY_KONG', 'PICHU', 'KIRBY'],
    # 'character_name': ['FOX', 'CAPTAIN_FALCON', 'SHEIK', 'FALCO', 'GAME_AND_WATCH', 'MARTH', 'LINK', 'ICE_CLIMBERS', 'SAMUS', 'GANONDORF', 'BOWSER', 'MEWTWO', 'YOSHI', 'PIKACHU', 'JIGGLYPUFF', 'NESS', 'DR_MARIO', 'PEACH', 'LUIGI', 'DONKEY_KONG'],
    'type_name': ['HUMAN']
    
}
opposing_player_features = {
    # 'character_name': ['MARTH'],
    # 'netplay_code': ['KOD#0', 'ZAIN#0']
    'type_name': ['HUMAN']
}
label_info = {
    'source': ['player'], # Can be 'general', 'player
    # 'feature': ['netplay_code']
    'feature': ['character_name']
}

In [3]:
dataset = InputDataSet(source_data, general_features, player_features, opposing_player_features, label_info)

print(dataset.dataset['labels'].value_counts())

  processed_df = pd.concat([player_1_df, player_2_df], ignore_index=True)


FOX               103069
FALCO              90719
MARTH              53728
CAPTAIN_FALCON     38006
SHEIK              27623
PEACH              17438
JIGGLYPUFF         16374
SAMUS               9524
ICE_CLIMBERS        6849
GANONDORF           6655
YOSHI               5725
LUIGI               5230
DR_MARIO            4202
PIKACHU             4096
LINK                2502
NESS                2306
DONKEY_KONG         2026
GAME_AND_WATCH      1967
MEWTWO              1775
MARIO               1713
YOUNG_LINK          1447
ROY                 1272
BOWSER               940
KIRBY                556
PICHU                230
Name: labels, dtype: int64


In [4]:
dataset.dataset.head()

Unnamed: 0,stage_name,num_players,conclusive,player_character_name,player_type_name,opposing_player_type_name,player_inputs_np_sub_path,length,labels
0,FINAL_DESTINATION,2,True,FALCO,HUMAN,HUMAN,mango\FALCO\727e819f-8cb3-4c3f-bf0a-ceefa9e41c...,5606,FALCO
1,FINAL_DESTINATION,2,True,FALCO,HUMAN,HUMAN,mango\FALCO\76fe3db5-60de-46bb-8f0d-80d48822a8...,5754,FALCO
2,POKEMON_STADIUM,2,True,MARTH,HUMAN,HUMAN,mango\MARTH\7e6b417f-249d-4629-b6dc-2fe1d95d8f...,6213,MARTH
3,FOUNTAIN_OF_DREAMS,2,True,FOX,HUMAN,HUMAN,mango\FOX\32305eaf-71d8-46e5-a8a1-2c7c890a9baf...,7621,FOX
4,FINAL_DESTINATION,2,True,FALCO,HUMAN,HUMAN,mango\FALCO\a5396c32-6f2c-4b88-8582-f8b875bb55...,7840,FALCO


In [5]:
labels_order =  dataset.number_of_segments_per_game(60,40000)
print(labels_order)
labels_order = labels_order['Label'].values


             Label   Count  Shift
0              FOX  103069  24187
1            FALCO   90717  21015
2            MARTH   53728  13321
3   CAPTAIN_FALCON   38006   8766
4            SHEIK   27623   7393
5            PEACH   17438   4925
6       JIGGLYPUFF   16374   4448
7            SAMUS    9524   2879
8     ICE_CLIMBERS    6849   1952
9        GANONDORF    6655   1601
10           YOSHI    5725   1528
11           LUIGI    5230   1433
12        DR_MARIO    4202   1133
13         PIKACHU    4096   1124
14            LINK    2502    700
15            NESS    2306    727
16     DONKEY_KONG    2026    542
17  GAME_AND_WATCH    1967    462
18          MEWTWO    1775    564
19           MARIO    1713    478
20      YOUNG_LINK    1447    410
21             ROY    1272    336
22          BOWSER     940    274
23           KIRBY     556    155
24           PICHU     230     61


In [6]:
# train_df, test_df  = dataset.train_test_split_dataframes(test_ratio = .20, val = False)

In [7]:
train_df, test_df = dataset.all_segments_train_test_split_dataframes(60, proportion_of_segments=1, test_ratio = .20, val = False)

In [8]:
print(train_df.shape)
print(test_df.shape)
print(test_df.shape[0] / (train_df.shape[0] + test_df.shape[0]))

(53712416, 6)
(13425723, 6)
0.1999716286446367


In [9]:
train_df.head()

Unnamed: 0,player_inputs_np_sub_path,labels,encoded_labels,segment_start_index,segment_index,segment_length
0,ranked\FALCO\4fd1d5a3-ace3-4d28-b529-3ea4a12ce...,FALCO,4,0,0,60
1,ranked\FALCO\4fd1d5a3-ace3-4d28-b529-3ea4a12ce...,FALCO,4,59,1,60
2,ranked\FALCO\4fd1d5a3-ace3-4d28-b529-3ea4a12ce...,FALCO,4,118,2,60
3,ranked\FALCO\4fd1d5a3-ace3-4d28-b529-3ea4a12ce...,FALCO,4,177,3,60
4,ranked\FALCO\4fd1d5a3-ace3-4d28-b529-3ea4a12ce...,FALCO,4,236,4,60


In [10]:
class TrainingDataset(Dataset):
    """
    Custom dataset for loading game segments from compressed numpy files.
    """
    def __init__(self, df, transform=None):
        self.file_paths = df['player_inputs_np_sub_path'].to_numpy()
        self.encoded_labels = df['encoded_labels'].to_numpy()
        self.segment_start_index = df['segment_start_index'].to_numpy()
        # self.segment_index = df['segment_index'].to_numpy()
        self.segment_length = df['segment_length'].to_numpy()
        self.transform = transform

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.file_paths)
    

    def __getitem__(self, idx):
        """Loads and returns a sample from the dataset at the specified index."""
        with gzip.open('/workspace/melee_project_data/input_np/' + self.file_paths[idx].replace('\\','/'), 'rb') as f:
            segment = np.load(f)

        if self.transform:
            segment = self.transform(segment)
        
        # Start and end of the segment
        segment_start = self.segment_start_index[idx]
        segment_end = self.segment_start_index[idx] + self.segment_length[idx]
        
        # Convert to PyTorch tensors
        segment_tensor = torch.from_numpy(segment[:,segment_start:segment_end]).float()
        # label_tensor = torch.tensor(self.encoded_labels[idx], dtype=torch.long)
        return segment_tensor#, label_tensor
    
def prepare_data_loaders(train_df, test_df, batch_size, num_workers):
    # Initialize datasets
    train_dataset = TrainingDataset(train_df)
    # val_dataset = TrainingDataset(file_paths_val, labels_val)
    test_dataset = TrainingDataset(test_df)

    # Initialize data loaders
    loaders = {
        'train': DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True, pin_memory=True,persistent_workers=True),
        'test': DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, pin_memory=True,persistent_workers=True),
        # 'val': DataLoader(val_dataset, batch_size=2**9, num_workers=num_workers, shuffle=False, pin_memory=True,persistent_workers=True)
    }
    return loaders



# ''' Get a batch of data to see the size if we want that information. ''' 
# data_loader_iterator = iter(loaders['train'])
# first_batch = next(data_loader_iterator)
# print(first_batch.shape)



In [11]:
def train_model(model, criterion, optimizer, loaders, device, num_epochs=1):
    scaler = GradScaler()  # Initialize the gradient scaler

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_loader_tqdm = tqdm(loaders['train'], desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch')
        total = 0
        
        for batch_number, target_cpu in enumerate(train_loader_tqdm):
            target_gpu = target_cpu.to(device)
            
            # Resets the optimizer
            optimizer.zero_grad()
            
            # Runs the forward pass with autocasting.
            with autocast():
                output_gpu = model(target_gpu)
                loss = criterion(output_gpu, target_gpu)
            
            # Scales loss and calls backward() to create scaled gradients
            scaler.scale(loss).backward()
            
            # Clip gradients to avoid explosion
            scaler.unscale_(optimizer)  # unscale the gradients of optimizer's assigned params
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            # Before calling step(), check for inf or NaN values in the gradients
            if any(torch.isinf(p.grad).any() or torch.isnan(p.grad).any() for p in model.parameters() if p.grad is not None):
                print("Warning: inf or NaN values in gradients!")
                
            # scaler.step() first unscales the gradients of the optimizer's assigned params.
            # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
            # otherwise, optimizer.step() is skipped.
            scaler.step(optimizer)
            
            # Updates the scale for next iteration.
            scaler.update()

            # Update progress
            train_loss += loss.item()
            total += target_gpu.size(0)
            train_loader_tqdm.set_postfix(loss=f'{train_loss / (total):.4f}')


def evaluate_model(model, criterion, loaders, loader, device):
    model.eval()
    eval_loss = 0
    total = 0
    with torch.no_grad():
        eval_loader_tqdm = tqdm(loaders[loader], unit = 'batch')
        
        for batch_number, target_cpu in enumerate(eval_loader_tqdm):
            target_gpu = target_cpu.to(device)
            output_gpu = model(target_gpu)
            
            eval_loss += criterion(output_gpu * 10, target_gpu * 10).item()
            total += target_gpu.size(0)
            eval_loader_tqdm.set_postfix(loss=f'{eval_loss / (total):.4f}') 
            
    print(f'Evaluated Loss: {eval_loss / total:.6f}')
    

In [12]:
from ResNet_Autoencoder_Model import ResNet_Autoencoder

# Build model
model = ResNet_Autoencoder().to('cuda')

# With the size of an input we can get a model summary.
summary(model, input_size=(9, 60))

# Check that the output shape and target shape match
# training_example = torch.rand(9, 2 ** 12).to('cuda')
# print('Target shape:', training_example.shape)
# model.eval()
# output = model(training_example)
# print('Output shape:', output.shape)

## Optionally compile the model
# import torch_tensorrt
# model = torch.compile(model, mode = 'default')
model = torch.compile(model,mode = 'max-autotune')
# model = torch.compile(model, backend="torch_tensorrt")
# model = torch.compile(model, backend="torch_tensorrt",mode = 'max-autotune')


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1               [-1, 64, 60]             640
       BatchNorm1d-2               [-1, 64, 60]             128
              ReLU-3               [-1, 64, 60]               0
            Conv1d-4               [-1, 64, 60]          12,352
       BatchNorm1d-5               [-1, 64, 60]             128
              ReLU-6               [-1, 64, 60]               0
            Conv1d-7              [-1, 256, 60]          16,640
       BatchNorm1d-8              [-1, 256, 60]             512
            Conv1d-9              [-1, 256, 60]           2,560
      BatchNorm1d-10              [-1, 256, 60]             512
             ReLU-11              [-1, 256, 60]               0
Encoder_Bottleneck-12              [-1, 256, 60]               0
           Conv1d-13               [-1, 64, 60]          16,448
      BatchNorm1d-14               [-1

In [13]:
# import torch._dynamo
# torch._dynamo.config.suppress_errors = True


# Pepare data loaders
batch_size =  32 * 16 * 5
num_workers = 22
loaders = prepare_data_loaders(train_df, test_df, batch_size, num_workers)

criterion = nn.MSELoss(reduction = 'sum')
optimizer = Adam(model.parameters(), lr=0.001)
num_epochs = 1

# # # This seems to sometimes help
# gc.collect()
# torch.cuda.empty_cache()
# Train the model
# start_time = time.time()
train_model(model, criterion, optimizer, loaders, 'cuda', num_epochs)
# print(f'Batch Size: {batch_size}, Training time: {time.time() - start_time:.2f}')

# Again, this sometimes seems to help
# gc.collect()
# torch.cuda.empty_cache()

# Evaluate the trained model
evaluate_model(model, criterion, loaders, 'test', 'cuda')

Epoch 1/1:   0%|          | 1/20982 [00:32<188:47:46, 32.39s/batch, loss=623.1656]



Epoch 1/1:   0%|          | 2/20982 [00:33<82:44:28, 14.20s/batch, loss=621.3223] 



Epoch 1/1:   0%|          | 3/20982 [00:34<45:37:22,  7.83s/batch, loss=621.9302]



Epoch 1/1:   0%|          | 4/20982 [00:34<28:10:44,  4.84s/batch, loss=622.5852]



Epoch 1/1:   0%|          | 5/20982 [00:34<18:32:12,  3.18s/batch, loss=623.0424]



Epoch 1/1:   0%|          | 6/20982 [00:34<12:43:21,  2.18s/batch, loss=623.1840]



Epoch 1/1:   0%|          | 7/20982 [00:35<9:02:25,  1.55s/batch, loss=623.3801] 



Epoch 1/1:   0%|          | 8/20982 [00:35<6:37:08,  1.14s/batch, loss=623.2238]



Epoch 1/1:   0%|          | 9/20982 [00:35<5:00:04,  1.16batch/s, loss=623.0957]



Epoch 1/1:   0%|          | 10/20982 [00:35<3:54:27,  1.49batch/s, loss=622.8861]



Epoch 1/1:   0%|          | 11/20982 [00:36<3:09:46,  1.84batch/s, loss=623.2883]



Epoch 1/1:   0%|          | 12/20982 [00:36<2:38:24,  2.21batch/s, loss=623.3662]



Epoch 1/1:   0%|          | 13/20982 [00:36<2:17:11,  2.55batch/s, loss=623.4457]



Epoch 1/1:   0%|          | 14/20982 [00:36<2:01:36,  2.87batch/s, loss=623.2290]



Epoch 1/1:   0%|          | 15/20982 [00:37<1:51:45,  3.13batch/s, loss=623.3580]



Epoch 1/1:   0%|          | 16/20982 [00:37<1:44:15,  3.35batch/s, loss=623.5219]



Epoch 1/1:   0%|          | 17/20982 [00:37<1:39:14,  3.52batch/s, loss=623.6534]



Epoch 1/1:   0%|          | 18/20982 [00:37<1:35:12,  3.67batch/s, loss=623.5248]



Epoch 1/1:   0%|          | 19/20982 [00:38<1:33:09,  3.75batch/s, loss=623.7845]



Epoch 1/1:   0%|          | 20/20982 [00:38<1:31:02,  3.84batch/s, loss=623.6399]



Epoch 1/1:   0%|          | 21/20982 [00:38<1:30:20,  3.87batch/s, loss=623.4208]



Epoch 1/1:   0%|          | 22/20982 [00:38<1:29:10,  3.92batch/s, loss=623.4106]



Epoch 1/1:   0%|          | 23/20982 [00:39<1:28:51,  3.93batch/s, loss=623.6198]



Epoch 1/1:   0%|          | 24/20982 [00:39<1:27:56,  3.97batch/s, loss=623.7004]



Epoch 1/1:   0%|          | 42/20982 [00:44<6:10:06,  1.06s/batch, loss=415.4047]


KeyboardInterrupt: 

In [None]:
# train_model(model, criterion, optimizer, loaders, 'cuda', 3)
# # print(f'Batch Size: {batch_size}, Training time: {time.time() - start_time:.2f}')

# # Again, this sometimes seems to help
# # gc.collect()
# # torch.cuda.empty_cache()

# # Evaluate the trained model
evaluate_model(model, criterion, loaders, 'test', 'cuda')

  0%|          | 0/5245 [00:00<?, ?batch/s]AUTOTUNE mm(2560x16384, 16384x64)
  triton_mm_145 0.2424 ms 100.0%
  triton_mm_146 0.2437 ms 99.5%
  triton_mm_149 0.2444 ms 99.2%
  triton_mm_148 0.2469 ms 98.2%
  mm 0.2509 ms 96.6%
  triton_mm_143 0.2642 ms 91.8%
  triton_mm_141 0.3215 ms 75.4%
  triton_mm_144 0.3850 ms 63.0%
  triton_mm_142 0.4004 ms 60.5%
  triton_mm_150 0.4701 ms 51.6%
SingleProcess AUTOTUNE takes 3.2618 seconds
AUTOTUNE mm(2560x64, 64x64)
  triton_mm_158 0.0054 ms 100.0%
  triton_mm_157 0.0059 ms 90.8%
  mm 0.0061 ms 87.5%
  triton_mm_153 0.0061 ms 87.5%
  triton_mm_155 0.0061 ms 87.5%
  triton_mm_160 0.0061 ms 87.5%
  triton_mm_161 0.0061 ms 87.5%
  triton_mm_152 0.0069 ms 78.1%
  triton_mm_162 0.0071 ms 75.7%
  triton_mm_154 0.0072 ms 75.0%
SingleProcess AUTOTUNE takes 3.0065 seconds
AUTOTUNE mm(2560x64, 64x16384)
  triton_mm_182 0.2099 ms 100.0%
  triton_mm_179 0.2109 ms 99.5%
  triton_mm_184 0.2112 ms 99.4%
  triton_mm_185 0.2112 ms 99.4%
  triton_mm_177 0.2120 ms 9

KeyboardInterrupt: 

In [None]:
def predict(model, criterion, loaders, loader, device):
    model.eval()
    # eval_loss = 0
    # total = 0
    predictions_gpu = []
    targets_cpu = []
    with torch.no_grad():
        eval_loader_tqdm = tqdm(loaders[loader], unit = 'batch')
        
        for batch_number, target_cpu in enumerate(eval_loader_tqdm):
            targets_cpu.append(target_cpu)
            target_gpu = target_cpu.to(device)
            # output_gpu
            predictions_gpu.append(model(target_gpu).to('cpu'))
            
            # eval_loss += criterion(output_gpu, target_gpu).item()
            # total += target_gpu.size(0)
            # eval_loader_tqdm.set_postfix(loss=f'{eval_loss / (total):.4f}') 
            
    # print(f'Evaluated Loss: {eval_loss / total:.6f}')
    
    return predictions_gpu, targets_cpu

In [None]:
pred, target = predict(model, criterion, loaders, 'test','cuda')

100%|█████████▉| 5244/5245 [14:12<00:00,  7.55batch/s]AUTOTUNE mm(1083x16384, 16384x64)
  mm 0.1147 ms 100.0%
  triton_mm_194 0.1475 ms 77.8%
  triton_mm_193 0.1636 ms 70.1%
  triton_mm_197 0.1665 ms 68.9%
  triton_mm_196 0.2097 ms 54.7%
  triton_mm_191 0.2212 ms 51.9%
  triton_mm_189 0.2335 ms 49.1%
  triton_mm_188 0.3799 ms 30.2%
  triton_mm_192 0.3840 ms 29.9%
  triton_mm_190 0.4076 ms 28.1%
SingleProcess AUTOTUNE takes 3.2280 seconds
AUTOTUNE mm(1083x64, 64x64)
  triton_mm_201 0.0051 ms 100.0%
  triton_mm_205 0.0051 ms 100.0%
  triton_mm_206 0.0051 ms 100.0%
  triton_mm_209 0.0051 ms 100.0%
  triton_mm_208 0.0054 ms 95.2%
  triton_mm_203 0.0055 ms 92.5%
  triton_mm_200 0.0060 ms 86.0%
  mm 0.0061 ms 83.3%
  triton_mm_202 0.0061 ms 83.3%
  triton_mm_204 0.0061 ms 83.3%
SingleProcess AUTOTUNE takes 3.0356 seconds
AUTOTUNE mm(1083x64, 64x16384)
  mm 0.0973 ms 100.0%
  triton_mm_225 0.1004 ms 96.9%
  triton_mm_230 0.1004 ms 96.9%
  triton_mm_224 0.1005 ms 96.8%
  triton_mm_232 0.1014 m

In [None]:
from sklearn.metrics import mean_squared_error 
m = mean_squared_error(target, pred)
print(m)

  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5245,) + inhomogeneous part.

In [None]:
print(m)

In [None]:
n = 10
pred_example = pred[n][0].numpy()
target_example = target[n][0].numpy()

In [None]:
print(pred_example.shape)
print(target_example.shape)

(9, 60)
(9, 60)


In [None]:
n = 30
print(pred_example[:,n:5+n])
print(target_example[:,n:5+n])
# for i in range(12):
#     print(i)
#     print(target_example[5:9,i*5:5+i*5])

[[nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]
 [nan nan nan nan nan]]
[[ 0.775   0.7625  0.7     0.65    0.625 ]
 [ 0.6125  0.625   0.7     0.75    0.775 ]
 [ 0.      0.      0.      0.      0.    ]
 [ 0.      0.      0.      0.     -0.9875]
 [ 1.      1.      1.      1.      1.    ]
 [ 0.      0.      0.      0.      0.    ]
 [ 0.      0.      0.      0.      0.    ]
 [ 0.      0.      0.      0.      0.    ]
 [ 0.      0.      0.      0.      0.    ]]


In [None]:
np.sum(target_example[5:9])