In [14]:
import os
import sys
sys.path.append("../..")
os.environ["CUDA_VISIBLE_DEVICES"]="0"
from openTSNE import TSNE
import torch

from train_supervised import RouterDataset, RouterModule
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, DebertaV2Model

dataset_paths = ["../../datasets/split2_model7/mmlu_test.json", "../../datasets/split2_model7/gsm8k-test.json", "../../datasets/split2_model7/cmmlu_test.json","../../datasets/split2_model7/arc_challenge_test.json", "../../datasets/split2_model7/humaneval_test.json", "../../datasets/split2_model7/MATH_prealgebra.json", "../../datasets/split2_model7/mbpp.json", "../../datasets/split2_model7/ceval.json"]
data_types = ["probability", "multi_attempt",  "probability",  "probability" ,"multi_attempt", "multi_attempt", "multi_attempt", "probability"]
device="cuda"

trained_router_path = "/data/home/chensh/projects/LLM_router/logs/paper_result/supervised/dot_lr_5e-5_step_1000_t_1_seed_5/best_training_model.pth"

tokenizer = AutoTokenizer.from_pretrained("/data/home/chensh/data/huggingface_model/microsoft/mdeberta-v3-base", truncation_side='left', padding=True)
encoder_model = DebertaV2Model.from_pretrained("/data/home/chensh/data/huggingface_model/microsoft/mdeberta-v3-base").to("cuda")

router_model = RouterModule(encoder_model, hidden_state_dim=768, node_size=7, similarity_function="dot").to(device)

state_dict = torch.load(trained_router_path)
router_model.load_state_dict(state_dict)




<All keys matched successfully>

In [15]:
import torch.nn as nn
from collections import Counter

def count_tensor_elements(tensor):
    # 将张量展平为一维数组
    flattened_tensor = tensor.view(-1).tolist()

    # 使用Counter统计元素出现次数
    element_counts = Counter(flattened_tensor)

    return element_counts

def evaluation(router_model, dataset_paths, dataset_types, tokenizer, batch_size, device): 
    all_counts = {key:0 for key in range(router_model.node_size)}   
    result = {}
    with torch.no_grad():
        assert len(dataset_paths) == len(dataset_types)
        for index, data_path in enumerate(dataset_paths):
            print(data_path)
            test_dataset = RouterDataset(data_path=data_path)
            test_dataset.register_tokenizer(tokenizer)
            data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
            correct_predict = 0
            correct = 0
            for batch in data_loader:
                inputs, scores, _, _ = batch
                inputs = inputs.to(device)
                scores = scores.to(device)
                x, _ = router_model.forward(**inputs)
                softmax_x = nn.Softmax(dim=1)(x)
                _, max_index = torch.max(softmax_x, dim=1)
                _, target_max_index = torch.max(scores, dim=1)

                counts = count_tensor_elements(max_index)
                for element, count in counts.items():
                    all_counts[element] += count

                equals = max_index.eq(target_max_index)
                correct += equals.sum().item()

                if dataset_types[index] == "probability":
                    mask = torch.zeros_like(scores)
                    mask = mask.scatter_(1, max_index.unsqueeze(1), 1)
                    scores[scores > 0] = 1
                    correct_predict += (scores * mask).sum().item()
                elif dataset_types[index] == "multi_attempt":
                    mask = torch.zeros_like(scores)
                    mask = mask.scatter_(1, max_index.unsqueeze(1), 1)
                    correct_predict += (scores * mask).sum().item()

            acc_predict = correct_predict/len(test_dataset)
            acc = correct/len(test_dataset)
            print(f"acc_{data_path}:", acc_predict)
            print("acc", acc)
            result[data_path] = [acc, acc_predict]
    return result, all_counts

In [16]:
all_acount_list = []
for i in range(len(dataset_paths)):
   result, acount = evaluation(router_model, [dataset_paths[i]], [data_types[i]], tokenizer, 32, device)
   all_acount_list.append(acount)

../../datasets/split2_model7/mmlu_test.json
acc_../../datasets/split2_model7/mmlu_test.json: 0.6047946831236648
acc 0.22193211488250653
../../datasets/split2_model7/gsm8k-test.json
acc_../../datasets/split2_model7/gsm8k-test.json: 0.6668688403195373
acc 0.27824109173616374
../../datasets/split2_model7/cmmlu_test.json
acc_../../datasets/split2_model7/cmmlu_test.json: 0.45266187050359713
acc 0.2489208633093525
../../datasets/split2_model7/arc_challenge_test.json
acc_../../datasets/split2_model7/arc_challenge_test.json: 0.53125
acc 0.20738636363636365
../../datasets/split2_model7/humaneval_test.json
acc_../../datasets/split2_model7/humaneval_test.json: 0.44285711950185347
acc 0.2857142857142857
../../datasets/split2_model7/MATH_prealgebra.json
acc_../../datasets/split2_model7/MATH_prealgebra.json: 0.34443168771526983
acc 0.09873708381171067
../../datasets/split2_model7/mbpp.json
acc_../../datasets/split2_model7/mbpp.json: 0.4110000057220459
acc 0.254
../../datasets/split2_model7/ceval.jso

In [18]:
import numpy as np

distribution_matrix = np.array( [list(dict_item.values()) for dict_item in all_acount_list])
distribution_matrix = distribution_matrix / np.sum(distribution_matrix, axis=1, keepdims=True)
distribution_matrix = np.around(distribution_matrix, 2) 
distribution_matrix = distribution_matrix.T
distribution_matrix[[2,3]] = distribution_matrix[[3,2]]
print(distribution_matrix)

[[0.16 0.02 0.13 0.25 0.2  0.06 0.44 0.06]
 [0.02 0.39 0.   0.02 0.   0.18 0.01 0.  ]
 [0.32 0.01 0.45 0.38 0.06 0.09 0.06 0.54]
 [0.03 0.02 0.08 0.04 0.02 0.02 0.   0.07]
 [0.12 0.04 0.05 0.14 0.37 0.16 0.   0.14]
 [0.14 0.04 0.08 0.09 0.02 0.09 0.03 0.07]
 [0.21 0.49 0.2  0.09 0.33 0.39 0.46 0.11]]


array([[0.16, 0.02, 0.13, 0.25, 0.2 , 0.06, 0.44, 0.06],
       [0.02, 0.39, 0.  , 0.02, 0.  , 0.18, 0.01, 0.  ],
       [0.32, 0.01, 0.45, 0.38, 0.06, 0.09, 0.06, 0.54],
       [0.03, 0.02, 0.08, 0.04, 0.02, 0.02, 0.  , 0.07],
       [0.12, 0.04, 0.05, 0.14, 0.37, 0.16, 0.  , 0.14],
       [0.14, 0.04, 0.08, 0.09, 0.02, 0.09, 0.03, 0.07],
       [0.21, 0.49, 0.2 , 0.09, 0.33, 0.39, 0.46, 0.11]])

In [32]:

import json

data_len_list = []
for file_path in dataset_paths:
    with open(file_path, 'r') as f:
        data_list = json.load(f)
        data_len_list.append(len(data_list))

all_sample_distribution = [0, 0, 0, 0, 0, 0, 0]
for j, per_model_distribution in enumerate(distribution_matrix):
    for i in range(5):
        all_sample_distribution[j] += per_model_distribution[i] * data_len_list[i]

print(all_sample_distribution)
all_sample_distribution = np.array(all_sample_distribution)
all_sample_distribution = all_sample_distribution / np.sum(all_sample_distribution) 
print(all_sample_distribution)

np.sum( all_sample_distribution * np.array([416.4469283, 433.7455524, 404.0106289, 426.7654369, 414.8200222, 379.9586915, 319.6867065]))


# np.sum( all_sample_distribution * np.array([258.8383127, 247.6017654, 257.8926255, 263.7625816, 192.1132418, 236.866828, 188.8286372]))


[1250.01, 605.7099999999999, 3061.8000000000006, 445.83, 799.48, 953.24, 2273.89]
[0.13312197 0.06450613 0.32607168 0.04747944 0.085142   0.10151694
 0.24216184]


386.7232619909813

In [24]:
import os
import sys
sys.path.append("../..")
os.environ["CUDA_VISIBLE_DEVICES"]="0"
from openTSNE import TSNE
import torch

from train_lora_retriever import RouterDataset, RouterModule
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, DebertaV2Model

dataset_paths = ["../../datasets/split2_model7/mmlu_test.json", "../../datasets/split2_model7/gsm8k-test.json", "../../datasets/split2_model7/cmmlu_test.json","../../datasets/split2_model7/arc_challenge_test.json", "../../datasets/split2_model7/humaneval_test.json", "../../datasets/split2_model7/MATH_prealgebra.json", "../../datasets/split2_model7/mbpp.json", "../../datasets/split2_model7/ceval.json"]
data_types = ["probability", "multi_attempt",  "probability",  "probability" ,"multi_attempt", "multi_attempt", "multi_attempt", "probability"]
device="cuda"

trained_router_path = "/data/home/chensh/projects/LLM_router/logs/lora_retriever/lr_5e-5_step_1000_t_1_seed_0/best_training_model.pth"

tokenizer = AutoTokenizer.from_pretrained("/data/home/chensh/data/huggingface_model/microsoft/mdeberta-v3-base", truncation_side='left', padding=True)
encoder_model = DebertaV2Model.from_pretrained("/data/home/chensh/data/huggingface_model/microsoft/mdeberta-v3-base").to("cuda")

router_model = RouterModule(encoder_model, hidden_state_dim=768, node_size=7, similarity_function="cos").to(device)

state_dict = torch.load(trained_router_path)
router_model.load_state_dict(state_dict)




<All keys matched successfully>

In [25]:
import torch.nn as nn
from collections import Counter

def count_tensor_elements(tensor):
    # 将张量展平为一维数组
    flattened_tensor = tensor.view(-1).tolist()

    # 使用Counter统计元素出现次数
    element_counts = Counter(flattened_tensor)

    return element_counts

def evaluation(router_model, dataset_paths, dataset_types, tokenizer, batch_size, device, ref_data_path, cluster_model_map):   
    # get the embedding of each cluster
    all_counts = {key:0 for key in range(router_model.node_size)}   
    result = {}
    with torch.no_grad():
        # get the embeddings for each cluster
        cluster_embeddings = []
        for index, data_path in enumerate(ref_data_path):
            test_dataset = RouterDataset(data_path=data_path)
            test_dataset.register_tokenizer(tokenizer)
            data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
            temp_embeddings = []
            for i, batch in enumerate(data_loader):
                if i > 5:
                    break
                inputs, scores, _, _ = batch
                inputs = inputs.to(device)
                scores = scores.to(device)
                _, hidden_state = router_model.forward(**inputs)
                temp_embeddings.append(hidden_state)
            temp_embeddings = torch.concat(temp_embeddings, dim=0)
            cluster_embedding = torch.mean(temp_embeddings, dim=0)
            cluster_embeddings.append(cluster_embedding)
        cluster_embeddings = torch.stack(cluster_embeddings)

        assert len(dataset_paths) == len(dataset_types)
        for index, data_path in enumerate(dataset_paths):
            test_dataset = RouterDataset(data_path=data_path)
            test_dataset.register_tokenizer(tokenizer)
            data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
            correct_predict = 0
            correct = 0
            for batch in data_loader:
                inputs, scores, _, _ = batch
                inputs = inputs.to(device)
                scores = scores.to(device)
                _, hidden_state = router_model.forward(**inputs)
                x = router_model.compute_similarity(hidden_state, cluster_embeddings)
                softmax_x = nn.Softmax(dim=1)(x)
                _, max_index = torch.max(softmax_x, dim=1)
                
                cluster_model_map = torch.tensor(cluster_model_map).type_as(max_index)
                maped_max_index = torch.gather(cluster_model_map, dim=0, index=max_index)
                
                counts = count_tensor_elements(maped_max_index)
                for element, count in counts.items():
                    all_counts[element] += count

                _, target_max_index = torch.max(scores, dim=1)
                equals = maped_max_index.eq(target_max_index)
                correct += equals.sum().item()

                if dataset_types[index] == "probability":
                    mask = torch.zeros_like(scores)
                    mask = mask.scatter_(1, maped_max_index.unsqueeze(1), 1)
                    scores[scores > 0] = 1
                    correct_predict += (scores * mask).sum().item()
                elif dataset_types[index] == "multi_attempt":
                    mask = torch.zeros_like(scores)
                    mask = mask.scatter_(1, maped_max_index.unsqueeze(1), 1)
                    correct_predict += (scores * mask).sum().item()

            acc_predict = correct_predict/len(test_dataset)
            acc = correct/len(test_dataset)
            print(f"acc_{data_path}:", acc_predict)
            print("acc", acc)
            result[data_path] = [acc, acc_predict]
    return result, all_counts


In [26]:
all_acount_list = []
for i in range(len(dataset_paths)):
   result, acount = evaluation(router_model, [dataset_paths[i]], [data_types[i]], tokenizer, 32, device, cluster_model_map = [3, 6, 5, 5, 4], ref_data_path=["../../datasets/lora_retriever/cluster_0.json","../../datasets/lora_retriever/cluster_1.json", "../../datasets/lora_retriever/cluster_2.json", "../../datasets/lora_retriever/cluster_3.json","../../datasets/lora_retriever/cluster_4.json",])
   all_acount_list.append(acount)

  cluster_model_map = torch.tensor(cluster_model_map).type_as(max_index)


acc_../../datasets/split2_model7/mmlu_test.json: 0.6347021125089011
acc 0.04984571564206029
acc_../../datasets/split2_model7/gsm8k-test.json: 0.6862016656888625
acc 0.1425322213798332
acc_../../datasets/split2_model7/cmmlu_test.json: 0.5176978417266187
acc 0.07942446043165467
acc_../../datasets/split2_model7/arc_challenge_test.json: 0.5767045454545454
acc 0.2840909090909091
acc_../../datasets/split2_model7/humaneval_test.json: 0.4489795918367347
acc 0.22448979591836735
acc_../../datasets/split2_model7/MATH_prealgebra.json: 0.35017221584385766
acc 0.04477611940298507
acc_../../datasets/split2_model7/mbpp.json: 0.4300000019073486
acc 0.088
acc_../../datasets/split2_model7/ceval.json: 0.5200594353640416
acc 0.07800891530460624


In [27]:
import numpy as np

distribution_matrix2 = np.array( [list(dict_item.values()) for dict_item in all_acount_list])
distribution_matrix2 = distribution_matrix2 / np.sum(distribution_matrix2, axis=1, keepdims=True)
distribution_matrix2 = np.around(distribution_matrix2, 2) 
distribution_matrix2 = distribution_matrix2.T
distribution_matrix2[[2,3]] = distribution_matrix2[[3,2]]
print(distribution_matrix2)



[[0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.98 0.   0.01 0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.28 0.05 0.   0.02 0.88 0.13 0.03 0.  ]
 [0.72 0.   1.   0.   0.12 0.29 0.97 1.  ]
 [0.   0.95 0.   0.   0.   0.58 0.   0.  ]]


In [30]:

import json

data_len_list = []
for file_path in dataset_paths:
    with open(file_path, 'r') as f:
        data_list = json.load(f)
        data_len_list.append(len(data_list))

all_sample_distribution = [0, 0, 0, 0, 0, 0, 0]
for j, per_model_distribution in enumerate(distribution_matrix2):
    for i in range(5, 8):
        all_sample_distribution[j] += per_model_distribution[i] * data_len_list[i]

print(all_sample_distribution)
all_sample_distribution = np.array(all_sample_distribution)
all_sample_distribution = all_sample_distribution / np.sum(all_sample_distribution) 
print(all_sample_distribution)

# np.sum( all_sample_distribution * np.array([416.4469283, 433.7455524, 404.0106289, 426.7654369, 414.8200222, 379.9586915, 319.6867065]))

np.sum( all_sample_distribution * np.array([258.8383127, 247.6017654, 257.8926255, 263.7625816, 192.1132418, 236.866828, 188.8286372]))



[0.0, 0.0, 8.71, 0.0, 128.23000000000002, 2083.59, 505.17999999999995]
[0.         0.         0.0031955  0.         0.04704462 0.76442101
 0.18533887]


225.9252564863228