In [2]:
import importlib
import random

import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import precision_score, recall_score
import os

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from MethodInfo import MethodInfo
import MutantInfo
importlib.reload(MutantInfo)
from MutantInfo import MutantInfo
import GraphModelContrastivev5
importlib.reload(GraphModelContrastivev5)
from GraphModelContrastivev5 import GraphModelContrastivev5, KFilter, KFilter_Multihead,KNFilter
import ChangeImpactDataBuilder
from ChangeImpactDataBuilder import ChangeImpactDataBuilder
from ChangeImpactMutantIndicesDataset import ChangeImpactMutantIndicesDataset
from ChangeImpactNodeIndicesDataset import ChangeImpactNodeIndicesDataset
from util import my_collate_fn
import config
importlib.reload(config)
from config import *
import copy

In [3]:
# TensorBoard writer shared by the training and evaluation cells below.
# NOTE(review): tensorboard_log_dir is not defined in this notebook; presumably
# it arrives through `from config import *` - confirm in config.py.
writer = SummaryWriter(log_dir=tensorboard_log_dir)

In [4]:
# --- Run configuration -------------------------------------------------------
# Reuse the train/test split stored on disk instead of drawing a new one.
fix_indices = True
device = "cuda"

# Checkpoint handling: -1 means "start from scratch"; any other value makes the
# model-setup cell restore a checkpoint and resume at load_model_epoch + 1.
load_model_epoch = 0
# When True the filter head is re-initialised even if the checkpoint has one.
new_filter = False

# --- Data ----------------------------------------------------------------------
dataset_split = 0.8                 # fraction of mutants used for training
k_set = {5, 10, 20, 30, 40}         # top-k cut-offs reported by evaluate_limit
debug = False                       # swap in small fixed index lists

mutant_batch_size_train = 400
node_batch_size_train = 7000
mutant_batch_size_test = 400
node_batch_size_test = 7000

# --- Optimisation --------------------------------------------------------------
infonce_temperature = 0.1

clip_grad = False
max_norm = 20                       # only used when clip_grad is True

epoches = 500
grad_step = 0

In [None]:
# Method metadata is created first because MutantInfo receives it as its first
# argument. The two directories are not defined in this notebook (presumably
# they come from `from config import *`).
method_info = MethodInfo()
mutant_info = MutantInfo(
    method_info,
    debug=False,
    mutant_info_dir=mutant_info_dir,
    mutant_execrecord_home=mutant_execrecord_home,
)
mutant_info.info()

In [None]:
# Locations of the persisted train/test split.
train_indices_file = model_state_home+f"{path_sep}train_indices.npy"
test_indices_file = model_state_home+f"{path_sep}test_indices.npy"

if fix_indices:
    # Reuse the split that an earlier run stored on disk.
    train_indices = np.load(train_indices_file).tolist()
    test_indices = np.load(test_indices_file).tolist()
    print(f"load model from epoch: {load_model_epoch}\n")
else:
    # Draw a fresh random split over all changes and persist it for later runs.
    sample_indices = list(range(mutant_info.n_change))
    random.shuffle(sample_indices)
    split_index = int(mutant_info.n_change*dataset_split)
    train_indices, test_indices = sample_indices[:split_index], sample_indices[split_index:]
    np.save(train_indices_file, np.array(train_indices))
    np.save(test_indices_file, np.array(test_indices))

if debug:
    # Small fixed index lists for quick smoke runs.
    # NOTE(review): the test range 200..399 lies inside the train range 0..799,
    # so debug metrics are measured on training data - confirm this is intended.
    train_indices = list(range(800))
    test_indices = list(range(200, 400))

In [7]:
# Builds the per-batch tensors from a (mutant batch, node batch) pair.
data_builder=ChangeImpactDataBuilder(method_info,mutant_info)

# Node-index loaders. Training shuffles the node batches, evaluation keeps the
# dataset order.
node_dataset=ChangeImpactNodeIndicesDataset(method_info)
node_loader_train=DataLoader(node_dataset,batch_size=node_batch_size_train,shuffle=True,collate_fn=my_collate_fn)

# Fix: the test loader previously wrapped `node_dataset`, leaving
# `node_dataset_test` constructed but unused.
node_dataset_test=ChangeImpactNodeIndicesDataset(method_info)
node_loader_test=DataLoader(node_dataset_test,batch_size=node_batch_size_test,collate_fn=my_collate_fn)

# Mutant-index loaders over the train/test split.
mutant_dataset_train=ChangeImpactMutantIndicesDataset(train_indices)
mutant_loader_train=DataLoader(mutant_dataset_train,batch_size=mutant_batch_size_train,shuffle=True,collate_fn=my_collate_fn)

mutant_dataset_test=ChangeImpactMutantIndicesDataset(test_indices)
mutant_loader_test=DataLoader(mutant_dataset_test,batch_size=mutant_batch_size_test,collate_fn=my_collate_fn)

In [None]:
def _decay_param_groups(module, no_decay, weight_decay=1e-4):
    """Split a module's parameters into a weight-decayed group and an exempt group.

    A parameter is exempt when any entry of ``no_decay`` is a substring of its
    name. Order inside each group follows ``module.named_parameters()``.
    """
    decayed, exempt = [], []
    for n, p in module.named_parameters():
        (exempt if any(nd in n for nd in no_decay) else decayed).append(p)
    return [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": exempt, "weight_decay": 0.0},
    ]


# Contrastive encoder. Other fusion types that were tried: "Union", "Intersection".
model = GraphModelContrastivev5(fusion_type="Attention")
# Filter head (an earlier variant used h_token=512*2).
model_filter = KFilter(h_token=768)
kn_filter:bool=False

checkpoint = None
# True only when the filter weights were actually restored from the checkpoint.
filter_restored = False

if load_model_epoch != -1:
    # Raw string: the previous non-raw f-string relied on invalid escape
    # sequences such as "\w" and "\c" being kept literally.
    # NOTE(review): machine-specific absolute path; the generic location would be
    # model_state_home + f"{path_sep}checkpoint_{load_model_epoch}.pth".
    checkpoint_path = r"D:\workspace\coding\python\defect2\contrastivemodel_serverv3\state\commons-lang\checkpoint_kfilter_pick01.pth"
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Only reuse the stored filter when it is of the same class and a fresh
    # filter was not explicitly requested.
    filter_compatible = ("filter_type" in checkpoint
                         and checkpoint['filter_type'] is type(model_filter)
                         and not new_filter)
    if filter_compatible:
        try:
            model_filter.load_state_dict(checkpoint['model_filter_state_dict'])
            filter_restored = True
        except Exception as err:
            # Best effort: fall back to a fresh filter, but say why.
            print(f"could not restore filter weights ({err!r}); re-initialising filter")
            model_filter.initialize_model()
    else:
        model_filter.initialize_model()
    grad_step=checkpoint["grad_step"]
else:
    model.initialize_model()
    model_filter.initialize_model()

model.to(device)
model_filter.to(device)

# Bias parameters are excluded from weight decay.
no_decay = ['bias']
param_groups = _decay_param_groups(model, no_decay)
param_groups_filter = _decay_param_groups(model_filter, no_decay)
optimizer = optim.Adam(param_groups, lr=0.001)
optimizer_filter = optim.Adam(param_groups_filter, lr=0.001)

if checkpoint is not None:
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # The filter optimizer state only makes sense for restored filter weights;
    # it is skipped when the filter had to be re-initialised.
    if filter_restored:
        try:
            optimizer_filter.load_state_dict(checkpoint['optimizer_filter_state_dict'])
        except Exception as err:
            print(f"could not restore filter optimizer state ({err!r}); using a fresh optimizer state")

# Resume after the loaded epoch, or start from 0 for a fresh run.
if load_model_epoch!=-1:
    start_epoch=load_model_epoch+1
else:
    start_epoch=0

# Number of (mutant batch, node batch) pairs per pass; used to average losses.
total_batch_train=len(mutant_loader_train)*len(node_loader_train)
total_batch_test=len(mutant_loader_test)*len(node_loader_test)

In [9]:
def smooth_binary_labels(y_true, smoothing=0.1):
    """Apply label smoothing to binary targets.

    Each target is scaled by ``1 - smoothing`` and shifted by ``0.5 * smoothing``,
    so 0 becomes ``smoothing / 2`` and 1 becomes ``1 - smoothing / 2``.
    """
    keep = 1 - smoothing
    shift = 0.5 * smoothing
    return y_true * keep + shift


def evaluate_limit(node_predict_indexs:torch.Tensor,
                   node_predict_labels:torch.Tensor,
                   node_predict_types:torch.Tensor,
                   similarities:torch.Tensor,
                   node_mutant_predict_indexs:list,
                   node_predict_indexs_origintorch:torch.Tensor,
                   in_callgraph:bool,
                   in_history:bool,
                   training:bool=True,
                   model_filter=None,
                   optimizer_filter:optim.Optimizer=None,
                   target_embeddings:torch.Tensor=None,
                   source_embeddings:torch.Tensor=None,
                   filter:bool=False,
                   filter_train_step:int=0,
                   train_k:int=40):
    """Score top-k predictions per group and optionally train the filter head.

    Rows are grouped by their value in ``node_predict_types``. For each group
    the rows are restricted to the candidates returned by
    ``mutant_info.get_candidate_node`` and ranked by ``similarities``; for every
    ``k`` in the global ``k_set`` the top-k rows are passed to
    ``model_filter.forward_bi`` and TP/FP/FN, precision, recall and F1 are
    recorded.

    Reads the notebook globals ``k_set``, ``kn_filter``, ``mutant_info`` and,
    when training, ``writer`` and ``smooth_binary_labels``.

    Args:
        node_predict_indexs: Not used by this function (kept for callers).
        node_predict_labels: Per-row labels; summed as positive counts.
        node_predict_types: Per-row group id.
        similarities: Per-row ranking score (higher ranks first).
        node_mutant_predict_indexs: Per-row mutant index; the first row of each
            group selects the mutant whose candidates are looked up.
        node_predict_indexs_origintorch: Per-row ids tested for membership in
            the candidate set.
        in_callgraph, in_history: Forwarded to ``get_candidate_node``.
        training: If True the filter loss is back-propagated (for ``train_k``,
            or every k when ``kn_filter`` is set) and one optimizer step is
            taken at the end; targets are label-smoothed.
        model_filter: Module providing ``forward_bi``. Required.
        optimizer_filter: Optimizer of ``model_filter``. Required when training.
        target_embeddings: Per-row embeddings passed to ``forward_bi``.
        source_embeddings: Indexed by group id and passed to ``forward_bi``.
        filter: If True, TP/FP are counted on rows the filter predicts positive
            (sigmoid > 0.5) instead of on all top-k rows.
        filter_train_step: Global step for the gradient-norm scalar.
        train_k: The single k whose loss is back-propagated when ``kn_filter``
            is False.

    Returns:
        ``(precision, recall, f1score, k_fp, k_fn, k_tp, loss_bi_logs)``: seven
        dicts keyed by k. The first three hold one value per group (recall and
        F1 only for groups with at least one positive), the next three hold
        summed counts, the last the summed filter loss.

    Raises:
        ValueError: if ``model_filter`` is None, or ``optimizer_filter`` is
            None while ``training`` is True.
    """
    # Fail fast with a clear message instead of an AttributeError on None.
    if model_filter is None:
        raise ValueError("evaluate_limit requires model_filter")
    if training and optimizer_filter is None:
        raise ValueError("evaluate_limit requires optimizer_filter when training=True")

    if training:
        model_filter.train()
    else:
        model_filter.eval()

    precision={k:[] for k in k_set}
    recall={k:[] for k in k_set}
    f1score={k:[] for k in k_set}
    k_fp={k:0 for k in k_set}
    k_fn={k:0 for k in k_set}
    k_tp={k:0 for k in k_set}
    loss_bi_logs={k:0 for k in k_set}

    unique_types = torch.unique(node_predict_types)
    node_predict_labels=node_predict_labels.to(similarities.device)
    if optimizer_filter is not None:
        optimizer_filter.zero_grad()

    for node_type in unique_types:
        indices = (node_predict_types == node_type).nonzero(as_tuple=True)[0]

        # All rows of a group are taken to share one mutant; use the first row's.
        mutant_index=node_mutant_predict_indexs[indices[0]]
        candidate_set = set(mutant_info.get_candidate_node(mutant_index,
                                                           in_callgraph=in_callgraph,
                                                           in_history=in_history))

        # One bulk conversion instead of one .item() call per element.
        origin_ids = node_predict_indexs_origintorch[indices].reshape(-1).tolist()
        candidate_mask = torch.tensor([idx in candidate_set for idx in origin_ids],
                                      device=indices.device,dtype=torch.bool)
        filtered_indices = indices[candidate_mask]

        current_similarities = similarities[filtered_indices]
        current_labels = node_predict_labels[filtered_indices]
        cur_embeddings = target_embeddings[filtered_indices]

        sorted_indices = torch.argsort(current_similarities, descending=True)
        has_candidates = len(sorted_indices)!=0
        # Positives among ALL rows of the group (not only candidates): the
        # reference count for false negatives. It does not depend on k.
        group_positives = torch.sum(node_predict_labels[indices]).item()

        for k in k_set:
            top_k_indices = sorted_indices[:k]
            top_k_labels = current_labels[top_k_indices].view(-1).float()

            # Only this k contributes gradients; other k values are scored in
            # eval mode without building a graph.
            train_this_k = training and (k == train_k or kn_filter)
            if training:
                if train_this_k:
                    model_filter.train()
                else:
                    model_filter.eval()

            if has_candidates:
                if train_this_k:
                    logits_bi = model_filter.forward_bi(cur_embeddings[top_k_indices],source_embeddings[node_type],k)
                else:
                    with torch.no_grad():
                        logits_bi = model_filter.forward_bi(cur_embeddings[top_k_indices],source_embeddings[node_type],k)
                pred_labels = (torch.sigmoid(logits_bi) > 0.5)
                pos_weight = torch.tensor([3.0], device=logits_bi.device)
                # Label smoothing is applied to training targets only.
                targets = smooth_binary_labels(top_k_labels) if training else top_k_labels
                loss_bi:torch.Tensor = F.binary_cross_entropy_with_logits(logits_bi,targets,
                                                                          pos_weight=pos_weight)
                if train_this_k:
                    # Gradients accumulate over groups; one step is taken at the end.
                    loss_bi.backward()
                loss_bi_logs[k]+=loss_bi.item()

            if filter and has_candidates:
                tp = int(torch.sum(top_k_labels[pred_labels]).item())
                fp = int((pred_labels.sum() - tp).item())
            else:
                tp = int(torch.sum(top_k_labels).item())
                fp = int(top_k_indices.shape[0] - tp)
            fn = int(group_positives - tp)

            if (tp+fp)==0:
                # Nothing predicted: perfect only if there was nothing to find.
                precision_k = 1.0 if (tp+fn)==0 else 0.0
            else:
                precision_k = tp / (tp + fp)
            precision[k].append(precision_k)

            if (tp + fn)!=0:
                recall_k = tp / (tp + fn)
                recall[k].append(recall_k)
                if (precision_k+recall_k)==0:
                    f1score_k = 0.0
                else:
                    f1score_k = 2 * (precision_k * recall_k) / (precision_k + recall_k)
                f1score[k].append(f1score_k)

            k_fp[k]+=fp
            k_fn[k]+=fn
            k_tp[k]+=tp

    if training:
        # Log the global L2 norm of the accumulated filter gradients, then step.
        total_norm = 0.0
        for param in model_filter.parameters():
            if param.grad is not None:
                total_norm += (param.grad.data.norm(2).item()) ** 2
        total_norm = total_norm ** 0.5
        writer.add_scalar("Gradients/Filter/Total_Norm", total_norm, filter_train_step)
        optimizer_filter.step()

    return precision, recall, f1score, k_fp, k_fn, k_tp, loss_bi_logs

In [10]:
# Build the impact history from the training split only; test indices are not passed in.
# NOTE(review): presumably this is what get_candidate_node(..., in_history=True)
# consults later - confirm in MutantInfo.
mutant_info.parse_mutant_impact_history(train_indices)

In [11]:
# Global TensorBoard step counters: `step` counts filter-training batches,
# `info_train_step` counts encoder (InfoNCE) optimisation steps.
step, info_train_step = 0, 0

In [None]:
print(f"start from epoch: {start_epoch}\n")
for epoch in tqdm(range(start_epoch,epoches)):
    # Training schedule. An alternating schedule (epoch%4 in [0,1] -> train the
    # encoder, otherwise train the filter) used to be computed here but was
    # immediately overwritten. The effective behaviour is kept: every epoch
    # trains only the filter head on top of a frozen encoder.
    filter=True
    training_filter=True

    iter_loss_infonce=0
    if training_filter:
        model.eval()
    else:
        model.train()
    loss_bi_logs={k:0 for k in k_set}

    # ---- train ----
    for mutant_indexs in tqdm(mutant_loader_train):
        for node_indexs in node_loader_train:
            (change_embeddings,node_embeddings,edge_indexs,node_predict_indexs,
             node_predict_labels,node_predict_types,node_change_indexs,
             node_mutant_predict_indexs,node_predict_indexs_origin,
             st_embeddings)=data_builder.build_batch_data(mutant_indexs,node_indexs)
            change_embeddings=change_embeddings.to(device)
            node_embeddings=node_embeddings.to(device)
            st_embeddings=st_embeddings.to(device)
            edge_indexs=edge_indexs.to(device)

            # Encoder gradients are only needed when the encoder itself is
            # trained; while the filter is trained the encoder runs without them.
            with torch.set_grad_enabled(not training_filter):
                loss_infonce,similarities,target_embeddings,source_embeddings=model.forward(
                    change_embeddings,node_embeddings,
                    edge_indexs,node_predict_indexs,
                    node_predict_types,
                    node_predict_labels,node_change_indexs,
                    st_embeddings,
                    infonce_temperature=infonce_temperature)

            if not training_filter:
                optimizer.zero_grad()
                loss_infonce.backward()
                # Log the global L2 norm of the encoder gradients before clipping.
                total_norm = 0.0
                for param in model.parameters():
                    if param.grad is not None:
                        total_norm += (param.grad.data.norm(2).item()) ** 2
                total_norm = total_norm ** 0.5
                writer.add_scalar("Gradients/Info/Total_Norm", total_norm, info_train_step)
                info_train_step+=1
                if clip_grad:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
                optimizer.step()

            iter_loss_infonce+=loss_infonce.item()

            if training_filter:
                # evaluate_limit performs the filter's backward pass and
                # optimizer step itself; only the loss log is needed here.
                *_,loss_bi_logs_cur=evaluate_limit(node_predict_indexs,node_predict_labels,
                                                   node_predict_types,similarities.squeeze(-1),
                                                   node_mutant_predict_indexs,
                                                   node_predict_indexs_origin,
                                                   in_callgraph=True,
                                                   in_history=True,
                                                   training=training_filter,
                                                   model_filter=model_filter,
                                                   optimizer_filter=optimizer_filter,
                                                   target_embeddings=target_embeddings,
                                                   source_embeddings=source_embeddings,
                                                   filter=filter,
                                                   filter_train_step=step)

                for k in k_set:
                    loss_bi_logs[k]+=loss_bi_logs_cur[k]

                step+=1

    iter_loss_infonce=iter_loss_infonce/total_batch_train
    writer.add_scalar("Loss/train/infonce", iter_loss_infonce, epoch)
    print(f"epoch {epoch} train loss infonce: {iter_loss_infonce}")
    if training_filter:
        for k in k_set:
            loss_bi_logs[k]=loss_bi_logs[k]/total_batch_train
            writer.add_scalar(f"Loss/train/bi-{k}", loss_bi_logs[k], epoch)
            print(f"epoch {epoch} train loss {k}-bi: {loss_bi_logs[k]}")

    # ---- test (runs every epoch) ----
    iter_loss_infonce=0
    model.eval()
    with torch.no_grad():
        all_precision={k:[] for k in k_set}
        all_recall={k:[] for k in k_set}
        k_fp={k:0 for k in k_set}
        k_fn={k:0 for k in k_set}
        k_tp={k:0 for k in k_set}
        loss_bi_logs={k:0 for k in k_set}
        for mutant_indexs in tqdm(mutant_loader_test):
            for node_indexs in node_loader_test:
                (change_embeddings,node_embeddings,edge_indexs,node_predict_indexs,
                 node_predict_labels,node_predict_types,node_change_indexs,
                 node_mutant_predict_indexs,node_predict_indexs_origin,
                 st_embeddings)=data_builder.build_batch_data(mutant_indexs,node_indexs)
                change_embeddings=change_embeddings.to(device)
                st_embeddings=st_embeddings.to(device)
                node_embeddings=node_embeddings.to(device)
                edge_indexs=edge_indexs.to(device)

                loss_infonce,similarities,target_embeddings,source_embeddings=model.forward(
                    change_embeddings,node_embeddings,
                    edge_indexs,node_predict_indexs,
                    node_predict_types,
                    node_predict_labels,node_change_indexs,
                    st_embeddings,
                    infonce_temperature=infonce_temperature)

                # Per-batch dicts get their own names so they are not confused
                # with the scalar averages computed after the loop.
                batch_precision,batch_recall,_,cur_k_fp,cur_k_fn,cur_k_tp,loss_bi_logs_cur=evaluate_limit(
                    node_predict_indexs,node_predict_labels,
                    node_predict_types,similarities.squeeze(-1),
                    node_mutant_predict_indexs,
                    node_predict_indexs_origin,
                    in_callgraph=True,
                    in_history=True,
                    training=False,
                    model_filter=model_filter,
                    optimizer_filter=optimizer_filter,
                    target_embeddings=target_embeddings,
                    source_embeddings=source_embeddings,
                    filter=filter)

                for k in k_set:
                    all_precision[k]+=batch_precision[k]
                    all_recall[k]+=batch_recall[k]
                    k_fp[k]+=cur_k_fp[k]
                    k_fn[k]+=cur_k_fn[k]
                    k_tp[k]+=cur_k_tp[k]
                    loss_bi_logs[k]+=loss_bi_logs_cur[k]

                iter_loss_infonce+=loss_infonce.item()

        iter_loss_infonce=iter_loss_infonce/total_batch_test
        print(f"epoch {epoch} test loss infonce: {iter_loss_infonce}")
        writer.add_scalar("Loss/test/infonce", iter_loss_infonce, epoch)

        for k in k_set:
            loss_bi_logs[k]=loss_bi_logs[k]/total_batch_test
            writer.add_scalar(f"Loss/test/bi-{k}", loss_bi_logs[k], epoch)
            print(f"epoch {epoch} test loss {k}-bi: {loss_bi_logs[k]}")
            # Guarded averages: the recall list is empty when no group had a
            # positive, and precision + recall can be 0; both used to raise
            # ZeroDivisionError.
            precision=sum(all_precision[k])/len(all_precision[k]) if all_precision[k] else 0.0
            recall=sum(all_recall[k])/len(all_recall[k]) if all_recall[k] else 0.0
            if (precision+recall)==0:
                fscore=0.0
            else:
                fscore=2*precision*recall/(precision+recall)
            print(f"k [{k}] FP: {k_fp[k]}")
            print(f"k [{k}] FN: {k_fn[k]}")
            print(f"k [{k}] TP: {k_tp[k]}")
            print(f"k [{k}] precision: {precision}")
            print(f"k [{k}] recall: {recall}")
            print(f"k [{k}] fscore: {fscore}\n")
            writer.add_scalar(f"Metric/test/{k}/Precision", precision, epoch)
            writer.add_scalar(f"Metric/test/{k}/Recall", recall, epoch)
            writer.add_scalar(f"Metric/test/{k}/FScore", fscore, epoch)

    # ---- checkpoint (every epoch) ----
    checkpoint_file=model_state_home + f"{path_sep}checkpoint_{epoch}.pth"
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'grad_step': grad_step,
        'filter_type': type(model_filter),
        'model_filter_state_dict': model_filter.state_dict(),
        'optimizer_filter_state_dict': optimizer_filter.state_dict(),
    }, checkpoint_file)
    print(f"Model state save to: {checkpoint_file}")

In [None]:
# Alias for the filter to evaluate; at this point it is the same object as model_filter.
# NOTE(review): the evaluation cell that follows passes model_filter, not
# pick_filter, to evaluate_limit - confirm which one is intended.
pick_filter=model_filter

# Make sure both modules are on the configured device.
model.to(device)
pick_filter.to(device)

In [None]:
# Stand-alone evaluation pass over the test split, with the filter's
# predictions ignored for the metrics (filter=False).
iter_loss_infonce=0
model.eval()
all_precision={k:[] for k in k_set}
all_recall={k:[] for k in k_set}
k_fp={k:0 for k in k_set}
k_fn={k:0 for k in k_set}
k_tp={k:0 for k in k_set}
loss_bi_logs={k:0 for k in k_set}
# No gradients are needed for evaluation (same as the in-loop test pass).
with torch.no_grad():
    for mutant_indexs in tqdm(mutant_loader_test):
        for node_indexs in node_loader_test:
            (change_embeddings,node_embeddings,edge_indexs,node_predict_indexs,
             node_predict_labels,node_predict_types,node_change_indexs,
             node_mutant_predict_indexs,node_predict_indexs_origin,
             st_embeddings)=data_builder.build_batch_data(mutant_indexs,node_indexs)
            change_embeddings=change_embeddings.to(device)
            st_embeddings=st_embeddings.to(device)
            node_embeddings=node_embeddings.to(device)
            edge_indexs=edge_indexs.to(device)
            loss_infonce,similarities,target_embeddings,source_embeddings=model.forward(
                change_embeddings,node_embeddings,
                edge_indexs,node_predict_indexs,
                node_predict_types,
                node_predict_labels,node_change_indexs,
                st_embeddings,
                infonce_temperature=infonce_temperature)
            batch_precision,batch_recall,_,cur_k_fp,cur_k_fn,cur_k_tp,loss_bi_logs_cur=evaluate_limit(
                node_predict_indexs,node_predict_labels,
                node_predict_types,similarities.squeeze(-1),
                node_mutant_predict_indexs,
                node_predict_indexs_origin,
                in_callgraph=True,
                in_history=True,
                training=False,
                model_filter=model_filter,
                optimizer_filter=optimizer_filter,
                target_embeddings=target_embeddings,
                source_embeddings=source_embeddings,
                filter=False)

            # Fix: accumulate once per (mutant batch, node batch) pair. This
            # used to sit outside the inner loop, so only the last node batch
            # of each mutant batch was counted while the loss was still divided
            # by total_batch_test.
            for k in k_set:
                all_precision[k]+=batch_precision[k]
                all_recall[k]+=batch_recall[k]
                k_fp[k]+=cur_k_fp[k]
                k_fn[k]+=cur_k_fn[k]
                k_tp[k]+=cur_k_tp[k]
                loss_bi_logs[k]+=loss_bi_logs_cur[k]
            iter_loss_infonce+=loss_infonce.item()
iter_loss_infonce=iter_loss_infonce/total_batch_test


In [None]:
# Report the aggregated test metrics for every top-k cut-off.
for k in k_set:
    # Guarded averages: a metric list is empty when no group contributed a
    # value, and precision + recall can be 0; both used to raise
    # ZeroDivisionError.
    mean_precision=sum(all_precision[k])/len(all_precision[k]) if all_precision[k] else 0.0
    mean_recall=sum(all_recall[k])/len(all_recall[k]) if all_recall[k] else 0.0
    if (mean_precision+mean_recall)==0:
        fscore=0.0
    else:
        fscore=2*mean_precision*mean_recall/(mean_precision+mean_recall)
    print(f"k [{k}] FP: {k_fp[k]}")
    print(f"k [{k}] FN: {k_fn[k]}")
    print(f"k [{k}] TP: {k_tp[k]}")
    print(f"k [{k}] precision: {mean_precision}")
    print(f"k [{k}] recall: {mean_recall}")
    print(f"k [{k}] fscore: {fscore}\n")