In [None]:
import torch
import torch.nn as nn
from base.data import get_data_loaders, get_data_loaders_wandb, get_data_loaders_loocv
from base.train import Trainer, eval_metrics
import pandas as pd
import wandb

In [None]:
class MLPHead(nn.Module):
    """Feed-forward head whose hidden widths halve at every layer.

    The network is ``n_layers`` blocks of ``Linear -> [norm] -> ReLU -> Dropout``
    followed by one final ``Linear`` that maps to ``out_dim``.

    Args:
        in_dim: Size of the input feature vector.
        out_dim: Size of the output vector.
        use_norm: ``'batch'`` inserts ``nn.BatchNorm1d`` after each hidden
            ``Linear``, ``'layer'`` inserts ``nn.LayerNorm``; any other value
            (e.g. ``None``) disables normalisation.
        dropout: Dropout probability applied after every hidden activation.
        n_layers: Number of hidden blocks (``0`` yields a single ``Linear``).
    """

    def __init__(self, in_dim, out_dim, use_norm, dropout, n_layers):
        super().__init__()
        # Hidden layer i has width in_dim / 2**(i+1). Clamp at 1 so that deep
        # heads on narrow inputs never end up with zero-width Linear layers
        # (integer division reaches 0 as soon as 2**(i+1) > in_dim).
        hidden_dims = [max(1, in_dim // 2 ** (i + 1)) for i in range(n_layers)]

        layers = []
        prev_dim = in_dim
        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))
            if use_norm == 'batch':
                layers.append(nn.BatchNorm1d(hidden_dim))
            elif use_norm == 'layer':
                layers.append(nn.LayerNorm(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = hidden_dim
        # Output projection: no normalisation, activation or dropout.
        layers.append(nn.Linear(prev_dim, out_dim))
        self.head = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the head to ``x`` and return the raw outputs."""
        return self.head(x)

In [None]:
class MLP(nn.Module):
    """MLP over projected embeddings plus an optional transformation encoding.

    ``cfg['n_embeddings']`` is a triple of binary flags
    ``[ref_embedding, transformation_embedding, transformation_encoding]``
    (EMB1 = unmodified code, EMB2 = mutated code). The batch passed to
    ``forward`` must contain only the enabled inputs, in that order: enabled
    embeddings first, then the transformation encoding if it is enabled.

    Both embeddings go through the same ``Linear -> LayerNorm`` projection;
    the transformation encoding is concatenated unprojected.

    Args:
        cfg: Model configuration. Keys read here: ``n_embeddings``,
            ``emb_dim``, ``emb_latent_dim``, ``transf_dim``, ``task``,
            ``num_classes`` (classification only), ``use_layer_norm``,
            ``dropout`` and ``n_hid_layers``.

    Raises:
        ValueError: If every flag in ``cfg['n_embeddings']`` is off.
    """

    def __init__(self, cfg):
        super().__init__()
        self.n_embeddings = cfg['n_embeddings']
        self.projection = nn.Sequential(
            nn.Linear(cfg['emb_dim'], cfg['emb_latent_dim']),
            nn.LayerNorm(cfg['emb_latent_dim'])
        )
        in_dim = self._input_dim(cfg)
        out_dim = 1 if cfg['task'] == 'regression' else cfg['num_classes']
        self.head = MLPHead(in_dim, out_dim, cfg['use_layer_norm'], cfg['dropout'], cfg['n_hid_layers'])

    @staticmethod
    def _active_inputs(n_embeddings):
        """Return ``(n_projected, use_encoding)`` for a flag triple."""
        n_projected = int(bool(n_embeddings[0])) + int(bool(n_embeddings[1]))
        use_encoding = bool(n_embeddings[2])
        if n_projected == 0 and not use_encoding:
            raise ValueError("n_embeddings must enable at least one model input")
        return n_projected, use_encoding

    @staticmethod
    def _input_dim(cfg):
        """Width of the concatenated feature vector that ``forward`` builds."""
        n_projected, use_encoding = MLP._active_inputs(cfg['n_embeddings'])
        # Count the transformation encoding only when it is actually
        # concatenated in forward(); otherwise the head would not match.
        encoding_dim = cfg['transf_dim'] if use_encoding else 0
        return cfg['emb_latent_dim'] * n_projected + encoding_dim

    def forward(self, batch):
        """Project the enabled embeddings, append the encoding, run the head.

        Args:
            batch: Indexable of tensors holding the enabled inputs in flag
                order (see the class docstring).

        Returns:
            Head output with a trailing singleton dimension squeezed away.
        """
        n_projected, use_encoding = self._active_inputs(self.n_embeddings)
        # Enabled embeddings always occupy the first slots of the batch.
        features = [self.projection(batch[i]) for i in range(n_projected)]
        if use_encoding:
            features.append(batch[n_projected])
        x = features[0] if len(features) == 1 else torch.cat(features, dim=1)
        return self.head(x).squeeze(-1)

## Main Project Pipeline

In [None]:
import random 

class ExperimentRunner:
    """Runs W&B-tracked random-search experiments on one dataset configuration.

    Construction builds the data loaders and a model with default
    hyperparameters. ``run_experiment`` then repeatedly samples new
    hyperparameters, rebuilds the model (and the loaders when the batch size
    changes), trains, evaluates and logs one W&B run per sample.

    Args:
        cfg_data: Data/task configuration dict. It is stored by reference and
            updated in place with sampled data hyperparameters.
        device: Device the model is moved to and the trainer is given.
    """

    def __init__(self, cfg_data, device):
        self.cfg_data = cfg_data
        self.cfg_model = None
        self.device = device
        self._setup_data_loaders()
        self._initialize_model()

    def _setup_data_loaders(self):
        """(Re)create the train/validation loaders from ``self.cfg_data``."""
        self.tr_loader, self.va_loader = get_data_loaders_wandb(self.cfg_data)

    def _initialize_model(self):
        """Reset ``self.cfg_model`` to the defaults and build the model."""
        self.cfg_model = {
            'n_hid_layers': 3,
            'n_embeddings': self.cfg_data['n_embeddings'],
            'emb_dim': self.tr_loader.dataset.embed_dim, # embedding dimensionality
            'emb_latent_dim': 256, # embedding projection dimensionality
            'transf_dim': self.tr_loader.dataset.tr_dim, # transformation dimensionality
            'model_type': 'mlp', # 'attention', 'mlp'
            'task': self.cfg_data['task'],
            'dropout': 0.2,
            'learning_rate': 1e-3,
            'use_layer_norm': 'batch',
            'weight_decay': 1e-3,
            'max_epochs': 100,
            'num_classes': len(self.cfg_data['classes'])
        }
        self._build_model()

    def _build_model(self):
        """Instantiate a fresh model from the current ``self.cfg_model``.

        Raises:
            ValueError: If ``cfg_model['model_type']`` is not supported.
        """
        model_type = self.cfg_model['model_type']
        # Look the class up only inside its own branch: in this notebook
        # BidirectionalCrossAttention is defined in a later cell and may not
        # exist yet when the MLP is requested.
        if model_type == 'attention':
            model_cls = BidirectionalCrossAttention
        elif model_type == 'mlp':
            model_cls = MLP
        else:
            raise ValueError(f"Unsupported model_type: {model_type!r}")
        self.model = model_cls(self.cfg_model).to(self.device)
        print("\n", self.model)

    def start_run(self, project_name, group_name, run_name, reinit=False):
        """Open a W&B run whose config is the merged data and model config."""
        # Passing the merged dict to init() already records every key, so no
        # follow-up wandb.config.update() calls are needed.
        wandb.init(
            project=project_name,
            group=group_name,
            name=run_name,
            reinit=reinit,
            config={**self.cfg_data, **self.cfg_model}
        )

    def train_model(self):
        """Train the current model and reload the checkpoint from disk.

        Returns:
            ``(tr_losses, va_losses)`` exactly as returned by ``Trainer.train``.
        """
        self.trainer = Trainer(self.device, self.model, self.cfg_model)
        tr_losses, va_losses = self.trainer.train(self.tr_loader, self.va_loader)
        # NOTE(review): 'best_model.pth' is presumably written by
        # Trainer.train -- confirm against base.train.
        self.model.load_state_dict(torch.load('best_model.pth', weights_only=True))
        # The logged values are the last entries of the loss histories, which
        # need not be the losses of the checkpoint that was just reloaded.
        wandb.log({"training_loss": tr_losses[-1], "validation_loss": va_losses[-1]})
        return tr_losses, va_losses

    def evaluate_model(self, loader, dataset_type):
        """Predict on ``loader``, compute metrics, print and log them.

        Args:
            loader: Data loader to evaluate on.
            dataset_type: Label used in the printout and as the prefix of the
                W&B key (``'<dataset_type>_metrics'``).
        """
        preds, tgts = self.trainer.predict(loader)
        mapping = loader.dataset.loop_mapping if self.cfg_data['min_transformations'] > 2 else None
        if self.cfg_data['task'] == 'regression':
            # Regression targets are not class indices, so there is nothing to
            # look up in cfg_data['classes'].
            # NOTE(review): assumes eval_metrics accepts classes=None for
            # regression -- confirm against base.train.
            active_classes = None
        else:
            # Only name the classes that actually occur in the targets.
            active_classes = [self.cfg_data['classes'][int(i)] for i in tgts.unique().tolist()]
        metrics = eval_metrics(
            preds, tgts, figures=True, mapping=mapping, type=self.cfg_data['task'],
            classes=active_classes
        )
        print(f"{dataset_type} metrics: {metrics}")
        wandb.log({f"{dataset_type}_metrics": metrics})

    def sample_data_hyperparameters(self, method='random'):
        """Sample data-side hyperparameters (only random sampling is implemented)."""
        return {
            'batch_size': random.choice([128, 256, 512, 1024, 2048, 2196, 3680, 4096]),
        }

    def sample_model_hyperparameters(self, method='random'):
        """Sample model-side hyperparameters (only random sampling is implemented)."""
        return {
            'learning_rate': random.uniform(1e-5, 1e-2),
            'dropout': random.uniform(0.1, 0.5),
            'n_hid_layers': random.randint(1, 5),
            'emb_latent_dim': random.choice([128, 256, 512]),
            'weight_decay': random.uniform(1e-5, 1e-3),
            'use_layer_norm': random.choice(['batch', 'layer', None])
        }

    def run_experiment(self, project_name, group_name, run_prefix, n_runs, n_start=1):
        """Run random-search trials numbered ``n_start`` .. ``n_runs`` inclusive.

        Each trial samples new hyperparameters, rebuilds the model from them,
        trains, evaluates on both loaders and logs everything to its own W&B
        run named ``f"{run_prefix}_{r}"``.
        """
        for r in range(n_start, n_runs+1):
            print(f"Run {r}")

            # Sample hyperparameters
            self.cfg_model.update(self.sample_model_hyperparameters())
            previous_batch_size = self.cfg_data.get('batch_size')
            self.cfg_data.update(self.sample_data_hyperparameters())

            # Apply the samples. Without this the logged config would not
            # describe the model being trained, and each trial would keep
            # training the previous trial's weights.
            if self.cfg_data.get('batch_size') != previous_batch_size:
                # NOTE(review): assumes get_data_loaders_wandb reads
                # cfg['batch_size'] -- confirm against base.data.
                self._setup_data_loaders()
            self._build_model()

            self.start_run(project_name, group_name, f"{run_prefix}_{r}", reinit=True)
            try:
                self.train_model()
                self.evaluate_model(self.tr_loader, 'train')
                self.evaluate_model(self.va_loader, 'val')
            finally:
                # Close the W&B run even if training or evaluation fails.
                wandb.finish()

In [None]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(f'Using device: {device}')

# Data and task configuration shared by the data loaders and ExperimentRunner.
cfg_data = dict(
    n_workers=32,
    batch_size=2048,
    # one of: 'random', 'binary', 'clustered', 'majority'
    stratification='majority',
    data_path='/mnt/fastdata/datasets/lore-loops/web-clang',
    # one of: 'llvm_llmcompiler', 'source_codellama', 'source_llmcompiler',
    # 'source_codet5p', 'source_coderankembed'
    embedding_model='source_coderankembed',
    # 'last' or 'all' ('all' is not working yet)
    embedding_layer='last',
    # binary flags: [ref_embedding, transformation_embedding, transformation_encoding]
    n_embeddings=[1, 0, 1],
    # keep only loop groups with at least this many transformations
    min_transformations=1,
    # keep only loop groups with at most this many transformations
    max_transformations=10000,
    # drop any loop group containing a transformation with speedup > max_speedup
    max_speedup=2.5,
    # keep only loop groups whose reference .C file is at most this many bytes
    max_source_size=100000,
    # 'classification' or 'regression'
    task='classification',
    # [(0, np.float64(0.5)), (2, np.float64(1.5)), (4, np.float64(2.5)), (6, np.float64(3.5))]
    class_splits=[0, .7, 1, 1.5, 17],
    # alternative: class_splits=[-100, .5, 0.9999, 1.0001, 1.5, 2.5, 3.5, 100000]
    classes=['HSL', 'SL', 'SP', 'HSP'],
    seed=4,
    # used by the new data loader
    csv_path='./clang_4.0.0_data_table.csv',
    benchmark='NPB',
    filters=[('benchmark', '==', 'NPB'), ('application', '==', 'SP')],
)

In [None]:
# Build the runner (data loaders + model) from the configuration cell above.
experiment_runner = ExperimentRunner(cfg_data=cfg_data, device=device)

# Runs are numbered n_start..n_runs inclusive, so this launches runs 39 and 40
# of the random search.
experiment_runner.run_experiment(
    n_start=39,
    n_runs=40,
    run_prefix="exp3_run",
    group_name='NPB Benchmark, SP App, Experiment 3 (Param Random Search)',
    project_name='MLFinal',
)

In [None]:
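##########################################
# REQUIRES A SECOND LOOK
# Experimental: the default configuration uses model_type='mlp', so this
# class is not exercised. Run this cell before constructing an
# ExperimentRunner with model_type='attention'.
##########################################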
class BidirectionalCrossAttention(nn.Module):
    """Cross-attends projected embeddings and the transformation encoding.

    The enabled embeddings are projected (shared ``Linear -> LayerNorm``) and
    concatenated; two ``nn.MultiheadAttention`` modules then attend from the
    embeddings to the transformation encoding and back. Both attention
    outputs are added residually to their queries, concatenated and passed to
    an ``MLPHead``.

    ``cfg['n_embeddings']`` may be either

    * the flag triple used elsewhere in this notebook,
      ``[ref_embedding, transformation_embedding, transformation_encoding]``
      (the encoding flag must be on), or
    * the legacy integer form: ``1`` for (EMB1, TRANSF) or ``2`` for
      (EMB1, EMB2, TRANSF).

    Args:
        cfg: Model configuration. Keys read here: ``n_embeddings``,
            ``emb_dim``, ``emb_latent_dim``, ``transf_dim``, ``task``,
            ``num_classes`` (classification only), ``use_layer_norm``,
            ``dropout`` and ``n_hid_layers``.

    Raises:
        ValueError: If ``cfg['n_embeddings']`` enables no embedding, disables
            the transformation encoding, or is an unsupported integer.
    """

    def __init__(self, cfg):
        super().__init__()
        self.n_embeddings = cfg['n_embeddings']
        n_projected = self._count_embeddings(self.n_embeddings)
        self.projection = nn.Sequential(
            nn.Linear(cfg['emb_dim'], cfg['emb_latent_dim']),
            nn.LayerNorm(cfg['emb_latent_dim'])
        )
        latent_dim = cfg['emb_latent_dim'] * n_projected
        out_dim = 1 if cfg['task'] == 'regression' else cfg['num_classes']
        self.head = MLPHead(latent_dim+cfg['transf_dim'], out_dim, cfg['use_layer_norm'], cfg['dropout'], cfg['n_hid_layers'])
        # Both embed_dims (latent_dim and cfg['transf_dim']) must be divisible
        # by num_heads=4, otherwise nn.MultiheadAttention rejects them.
        self.cross_attention1 = nn.MultiheadAttention(embed_dim=latent_dim, num_heads=4, dropout=cfg['dropout'], batch_first=True, kdim=cfg['transf_dim'], vdim=cfg['transf_dim'])
        self.cross_attention2 = nn.MultiheadAttention(embed_dim=cfg['transf_dim'], num_heads=4, dropout=cfg['dropout'], batch_first=True, kdim=latent_dim, vdim=latent_dim)

    @staticmethod
    def _count_embeddings(n_embeddings):
        """Return how many leading batch entries are embeddings to project."""
        if isinstance(n_embeddings, int):
            # Legacy integer form: the transformation encoding is implied.
            count = n_embeddings
            if count not in (1, 2):
                raise ValueError(f"n_embeddings must be 1 or 2, got {n_embeddings!r}")
            return count
        if not n_embeddings[2]:
            raise ValueError("the attention model requires the transformation encoding input")
        count = int(bool(n_embeddings[0])) + int(bool(n_embeddings[1]))
        if count == 0:
            raise ValueError("the attention model requires at least one embedding input")
        return count

    def forward(self, batch):
        """Run bidirectional cross-attention followed by the MLP head.

        Args:
            batch: Indexable of tensors: the enabled embeddings first, then
                the transformation encoding.

        Returns:
            Head output with a trailing singleton dimension squeezed away.
        """
        n_projected = self._count_embeddings(self.n_embeddings)
        projected = [self.projection(batch[i]) for i in range(n_projected)]
        embs = projected[0] if n_projected == 1 else torch.cat(projected, dim=1)
        transf = batch[n_projected]
        # NOTE(review): the dim=1 concatenation into a head of width
        # latent_dim + transf_dim implies 2-D (batch, features) inputs.
        # nn.MultiheadAttention treats 2-D tensors as one *unbatched*
        # sequence, so samples attend to each other across the mini-batch.
        # Confirm whether that is intended before relying on this model.
        features1, _ = self.cross_attention1(query=embs, key=transf, value=transf)
        features2, _ = self.cross_attention2(query=transf, key=embs, value=embs)
        return self.head(torch.cat([embs+features1, transf+features2], dim=1)).squeeze(-1)