In [1]:
import os
import json
import time

from os import path

import numpy as np
import torch

from torch import nn
from torch import sparse
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import r2_score

In [2]:
class GatedGraphsDataset(Dataset):
    def __init__(self, file, label_from=0, label_to=-1, m=None, d=None):
        self.ids = torch.load(os.path.join(file, "ids.pkl"))
        self.indexes = torch.load(os.path.join(file, "indexes.pkl"))
        self.tokens = torch.load(os.path.join(file, "tokens.pkl"))
        self.types = torch.load(os.path.join(file, "types.pkl"))
        self.labels = torch.load(os.path.join(file, "labels.pkl"))[:,label_from:label_to]
        self.size = len(self.ids)
        self.m = m if m is not None else torch.mean(self.labels, dim=0)
        self.d = d if d is not None else torch.std(self.labels, dim=0).clamp_min(1)
        self.norm_labels = (self.labels - self.m) / self.d

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        return {'indexes': self.indexes[index],
                'tokens': self.tokens[index],
                'types': self.types[index],
                'labels': self.labels[index],
                'norm_labels': self.norm_labels[index],
                'ids': self.ids[index]}

In [3]:
class GGNN(nn.Module):
    def __init__(self, n_tokens, n_types, n_edges, 
                 node_dim, token_dim, type_dim, annotation_dim, message_dim, 
                 n_steps):
        super(GGNN, self).__init__()
        self.n_steps = n_steps
        self.n_edges = n_edges
        self.node_dim = node_dim
        self.message_dim = message_dim
        self.message_generator = nn.Linear(node_dim, message_dim * n_edges)
        self.state_generator = nn.Sequential(
            nn.Linear(type_dim + token_dim, annotation_dim),
            nn.ConstantPad1d((0, node_dim - annotation_dim), 0))
        self.tokens = nn.EmbeddingBag(n_tokens, token_dim, mode='sum')
        self.types = nn.Embedding(n_types, type_dim)
        self.updater = nn.GRUCell(input_size=message_dim, hidden_size=node_dim)

    def forward(self, var_type, node_tokens, mask, adjacency_matrix):
        tokens = self.tokens(node_tokens, per_sample_weights=mask)
        types = self.types(var_type)
        state = self.state_generator(torch.cat([tokens, types], 1))
        for j in range(self.n_steps):
            messages_out = self.message_generator(state).view((-1, self.message_dim))
            messages_in = sparse.mm(adjacency_matrix, messages_out)
            state = self.updater(messages_in, state)
        return state

In [4]:
class MetricsPredictor(nn.Module):
    def __init__(self, n_tokens, n_types, n_edges, node_dim, token_dim, type_dim,
                 annotation_dim, message_dim, n_steps, n_metrics):
        super(MetricsPredictor, self).__init__()
        self.ggnn = GGNN(n_tokens, n_types, n_edges, node_dim, token_dim, type_dim,
                         annotation_dim, message_dim, n_steps)
        self.attention = nn.Sequential(nn.Dropout(p=0.2), nn.Linear(node_dim, 1))
        self.predictor = nn.Sequential(nn.Dropout(p=0.3), nn.Linear(node_dim, n_metrics))

    def forward(self, var_type, node_tokens, mask, adjacency_matrix, lens):
        states = self.ggnn(var_type, node_tokens, mask, adjacency_matrix)
        data = torch.nn.utils.rnn.pad_sequence(torch.split(states, lens.tolist()), batch_first=True)
        weight = F.softmax(self.attention(data), dim=1)
        result = torch.sum(torch.mul(data, weight), dim=1)
        return self.predictor(result)

In [5]:
def combine(indexes, tokens, var_types, n_edges):
    batch_size = len(indexes)
    shifts = torch.zeros(batch_size + 1, dtype=torch.long)
    lens = torch.zeros(batch_size, dtype=torch.long)
    for i, token in enumerate(tokens):
        shifts[i + 1] = shifts[i] + len(token)
        lens[i] = len(token)
    n_nodes = shifts[-1].item()
    shifted_indexes = []
    for i, index in enumerate(indexes):
        result = index.clone()
        result[0, :] += shifts[i]
        result[1, :] += shifts[i] * n_edges
        shifted_indexes.append(result)
    result_indexes = torch.cat(shifted_indexes, dim=1)
    result_matrix = torch.sparse.FloatTensor(result_indexes, torch.ones(len(result_indexes[0])),
                                             (n_nodes, n_nodes * n_edges))
    result_tokens = torch.cat(tokens)
    result_types = torch.cat(var_types)

    result_mask = (result_tokens != 0).float()
    result_mask /= torch.clamp_min_(torch.sum(result_mask, dim=1, keepdim=True), 1)
    return result_matrix, result_tokens, result_mask, result_types, lens

In [6]:
def collate(values):
    return {
        'indexes': [item['indexes'] for item in values],
        'tokens': [item['tokens'] for item in values],
        'types': [item['types'] for item in values],
        'labels': torch.stack([item['labels'] for item in values]),
        'norm_labels': torch.stack([item['norm_labels'] for item in values]),
        'ids': [item['ids'] for item in values]
    }

In [7]:
def train(model, optimizer, loss, train_dataset, val_dataset, out_path, 
          batch_size, n_minibatch, n_edges, n_metrics, n_epochs, device="cuda"):
    
    model.to(device)
    train_loader = DataLoader(train_dataset, batch_size, shuffle=True, collate_fn=collate)
    val_loader = DataLoader(val_dataset, batch_size, shuffle=False, collate_fn=collate)
    
    os.makedirs(out_path, exist_ok=True)

    for epoch in range(n_epochs):
        model.train()
        optimizer.zero_grad()
        total_loss = 0

        for n_batch, sample in enumerate(train_loader):
            matrix, tokens, mask, types, lens = combine(sample['indexes'], sample['tokens'], sample['types'],
                                                        n_edges)
            labels = sample['norm_labels']
            prediction = model(types.to(device), tokens.to(device), mask.to(device),
                                  matrix.to(device), lens.to(device))
            loss_value = loss(prediction, labels.to(device))
            loss_value.backward()
            if n_batch % n_minibatch == 0:
                optimizer.step()
                optimizer.zero_grad()
            total_loss += loss_value.item()
            
        model.eval()
        with torch.no_grad():
            torch.save(model.state_dict(), os.path.join(out_path, f'model_{epoch}.tmp'))
            torch.save(optimizer.state_dict(), os.path.join(out_path, f'opt_{epoch}.tmp'))
            
            total = len(val_dataset)
            predictions = np.zeros((total, n_metrics))
            targets = np.zeros((total, n_metrics))
            ptr = 0
            for sample in val_loader:
                matrix, tokens, mask, types, heads = combine(sample['indexes'], sample['tokens'],
                                                             sample['types'],
                                                             n_edges)
                labels = sample['norm_labels']
                batched_predictions = model(types.to(device), tokens.to(device), mask.to(device),
                                               matrix.to(device), heads.to(device))
                batched_predictions = batched_predictions.cpu().numpy()
                predictions[ptr:ptr + len(batched_predictions)] = batched_predictions
                targets[ptr:ptr + len(labels)] = labels
                ptr += len(batched_predictions)
        print(f'Epoch {epoch}: train_loss={total_loss / len(train_dataset)}, val_r2={r2_score(targets, predictions)}')

In [8]:
def evaluate(model, dataset, batch_size, n_edges, n_metrics, device='cuda'):
    with torch.no_grad():         
        model.eval()   
        total = len(dataset)
        predictions = np.zeros((total, n_metrics))
        targets = np.zeros((total, n_metrics))
        ptr = 0
        for sample in DataLoader(dataset, batch_size, shuffle=False, collate_fn=collate):
            matrix, tokens, mask, types, heads = combine(sample['indexes'], sample['tokens'],
                                                             sample['types'],
                                                             n_edges)
            labels = sample['norm_labels']
            batched_predictions = model(types.to(device), tokens.to(device), mask.to(device),
                                        matrix.to(device), heads.to(device))
            batched_predictions = batched_predictions.cpu().numpy()
            predictions[ptr:ptr + len(batched_predictions)] = batched_predictions
            targets[ptr:ptr + len(labels)] = labels
            ptr += len(batched_predictions)
        return r2_score(targets, predictions), r2_score(targets, predictions, multioutput='raw_values')

In [9]:
DATA_PATH = path.join('..', 'data', 'torch-graphs')
LOGS_V1_PATH = path.join('..', 'logs-v1')
LOGS_V2_PATH = path.join('..', 'logs-v2')

## Experiment №1

Predicting 20 method-level metrics from paper "The Effectiveness of Supervised Machine Learning Algorithms in Predicting Software Refactoring"

In [10]:
train_dataset = GatedGraphsDataset(path.join(DATA_PATH, 'train'), 
                                   label_from=0, label_to=20)
val_dataset = GatedGraphsDataset(path.join(DATA_PATH, 'val'), 
                                 label_from=0, label_to=20, 
                                 m=train_dataset.m, d=train_dataset.d)
test_dataset = GatedGraphsDataset(path.join(DATA_PATH, 'test'), 
                                  label_from=0, label_to=20, 
                                  m=train_dataset.m, d=train_dataset.d)

In [11]:
predictor = MetricsPredictor(3766, 713, 20, 64, 32, 16, 64, 64, 4, 20)
optimizer = torch.optim.Adam(predictor.parameters(), lr=0.001)
loss_function = torch.nn.MSELoss()

In [12]:
train(predictor, optimizer, loss_function, 
      train_dataset, val_dataset, LOGS_V1_PATH,
      128, 10, 20, 20, 50)

Epoch 0: train_loss=0.004699787918190081, val_r2=0.3501567211160092
Epoch 1: train_loss=0.003616500882581565, val_r2=0.44705622389040806
Epoch 2: train_loss=0.0032780510332922038, val_r2=0.45037572158640177
Epoch 3: train_loss=0.0031234110907415475, val_r2=0.5032561314972239
Epoch 4: train_loss=0.0029210608987801177, val_r2=0.5308832557729783
Epoch 5: train_loss=0.0027959459352800108, val_r2=0.5455376680973075
Epoch 6: train_loss=0.0027062198484498843, val_r2=0.5666657940246606
Epoch 7: train_loss=0.00260795406529099, val_r2=0.574898061710804
Epoch 8: train_loss=0.0025677506286193846, val_r2=0.5809345561629747
Epoch 9: train_loss=0.002538991559174749, val_r2=0.5968767424759064
Epoch 10: train_loss=0.002472273406612307, val_r2=0.5979866745263036
Epoch 11: train_loss=0.002474482629436559, val_r2=0.5902181574291406
Epoch 12: train_loss=0.0023927530148275793, val_r2=0.5921291435816454
Epoch 13: train_loss=0.0023364848694198293, val_r2=0.6082421526139516
Epoch 14: train_loss=0.0023414490794

In [13]:
predictor.load_state_dict(torch.load(path.join(LOGS_V1_PATH, f'model_74.tmp')))
evaluate(predictor, test_dataset, 128, 20, 20)

(0.7815155417707412,
 array([ 0.80132784,  0.85873349,  0.77229692,  0.8847502 ,  0.72237458,
         0.87076254,  0.90914218,  0.86489323,  0.84928428,  0.7588517 ,
         0.93320699,  0.61455496,  0.87934504,  0.80410059,  0.77147378,
        -0.07314711,  0.89284019,  0.78164229,  0.88228996,  0.85158721]))

## Experiment №2
Predicting 13 method-level metrics calculated by MetricsReloaded

In [15]:
predictor = MetricsPredictor(3766, 713, 20, 64, 32, 16, 64, 64, 4, 13)
optimizer = torch.optim.Adam(predictor.parameters(), lr=0.001)

In [16]:
train_dataset = GatedGraphsDataset(path.join(DATA_PATH, 'train'), 
                                   label_from=20, label_to=33)
val_dataset = GatedGraphsDataset(path.join(DATA_PATH, 'val'), 
                                 label_from=20, label_to=33, 
                                 m=train_dataset.m, d=train_dataset.d)
test_dataset = GatedGraphsDataset(path.join(DATA_PATH, 'test'), 
                                  label_from=20, label_to=33, 
                                  m=train_dataset.m, d=train_dataset.d)

In [17]:
train(predictor, optimizer, loss_function, 
      train_dataset, val_dataset, LOGS_V2_PATH,
      128, 10, 20, 13, 50)

Epoch 0: train_loss=0.0035109741755458587, val_r2=0.5432975417783402
Epoch 1: train_loss=0.0025585684377230126, val_r2=0.6341584138507037
Epoch 2: train_loss=0.0022635171184727616, val_r2=0.6614981228597719
Epoch 3: train_loss=0.0020588345689431116, val_r2=0.6833023405987686
Epoch 4: train_loss=0.0019307566014970412, val_r2=0.6815910878157574
Epoch 5: train_loss=0.0018401389506510814, val_r2=0.7047077975639016
Epoch 6: train_loss=0.0017427176320668895, val_r2=0.6873186164218281
Epoch 7: train_loss=0.0017384150328531793, val_r2=0.744637409716666
Epoch 8: train_loss=0.00166123433571817, val_r2=0.7513903456652083
Epoch 9: train_loss=0.0015800072028096584, val_r2=0.766439613213781
Epoch 10: train_loss=0.0015911463287228095, val_r2=0.7191693057313218
Epoch 11: train_loss=0.0016550420646859378, val_r2=0.7492241402844712
Epoch 12: train_loss=0.0015657284579050788, val_r2=0.7700433416816624
Epoch 13: train_loss=0.0016032194850870928, val_r2=0.7606430954769287
Epoch 14: train_loss=0.00153752740

In [18]:
predictor.load_state_dict(torch.load(path.join(LOGS_V2_PATH, f'model_46.tmp')))
evaluate(predictor, test_dataset, 128, 20, 13)

(0.8292503230455129,
 array([0.90125511, 0.7649991 , 0.84024208, 0.6153092 , 0.91130423,
        0.86794405, 0.93926821, 0.85808415, 0.8296392 , 0.44913948,
        0.91622912, 0.94355596, 0.94328431]))