In [1]:
import os
import json
# Hugging Face model ids in "organization/model" form.
# Order matters: the loop in this cell keeps only the first `llm_number`
# entries of this list when it filters the per-model scores.
model_list = [
    "/".join(org_and_name)
    for org_and_name in (
        ('mistralai', 'Mistral-7B-v0.1'),
        ('meta-math', 'MetaMath-Mistral-7B'),
        ("HuggingFaceH4", "zephyr-7b-beta"),
        ('itpossible', 'Chinese-Mistral-7B-v0.1'),
        ("cognitivecomputations", "dolphin-2.6-mistral-7b"),
        ("meta-llama", "Meta-Llama-3-8B"),
        ("cognitivecomputations", "dolphin-2.9-llama3-8b"),
    )
]


# Directory holding the JSON datasets that carry scores for all 7 models.
source_output_path = "/data/home/chensh/projects/LLM_router/datasets/split2_model7"

# The directory listing does not depend on `llm_number`, so read it once.
source_dataset_list = os.listdir(source_output_path)

# For each requested model count, write a copy of every dataset in which each
# sample's 'scores' dict is restricted to the first `llm_number` models of
# `model_list`. range(3,4) currently yields only llm_number == 3.
for llm_number in range(3,4):
    target_output_path = f"/data/home/chensh/projects/LLM_router/datasets/llm_numbers/model_{llm_number}_2"
    # exist_ok=True keeps the cell re-runnable (Restart & Run All) instead of
    # raising FileExistsError on the second execution.
    os.makedirs(target_output_path, exist_ok=True)

    # Build the allowed-model set once per model count instead of re-slicing
    # the list for every score key.
    kept_models = set(model_list[:llm_number])

    for source_dataset in source_dataset_list:
        source_file = os.path.join(source_output_path, source_dataset)
        # os.listdir also returns sub-directories (e.g. notebook checkpoint
        # folders); those cannot be opened as files, so skip them.
        if not os.path.isfile(source_file):
            continue

        with open(source_file, 'r') as f:
            source_data = json.load(f)

        target_data = []
        for item in source_data:
            # Shallow-copy so the loaded record is not mutated through an
            # alias; only the 'scores' entry is replaced in the copy.
            target_item = dict(item)
            target_item['scores'] = {
                model: score
                for model, score in item['scores'].items()
                if model in kept_models
            }
            target_data.append(target_item)

        with open(os.path.join(target_output_path, source_dataset), 'w') as f:
            json.dump(target_data, f)



In [2]:
import os
import sys
sys.path.append("..")
os.environ["CUDA_VISIBLE_DEVICES"]="2"
from openTSNE import TSNE
import torch

from train_router_mdeberta_v2 import RouterDataset, RouterModule
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer, DebertaV2Model

# (file name, data type) for every training set; keeping each pair on one line
# makes it harder for the two parallel lists below to drift out of sync.
_dataset_specs = [
    ("gsm8k-train.json", "multi_attempt"),
    ("mmlu_train.json", "multi_attempt"),
    ("humaneval_train.json", "probability"),
    ("arc_challenge_train.json", "probability"),
    ("cmmlu_train.json", "probability"),
]
# For a quick single-dataset debug run, keep only the gsm8k entry above
# (i.e. dataset_paths = ["../datasets/split2_model7/gsm8k-train.json"],
#  data_types = ["multi_attempt"]).

# Parallel lists: data_types[i] describes dataset_paths[i].
dataset_paths = ["../datasets/split2_model7/" + file_name for file_name, _kind in _dataset_specs]
data_types = [kind for _file_name, kind in _dataset_specs]

# Local checkpoint directory shared by the tokenizer and the encoder.
mdeberta_path = "/data/home/chensh/data/huggingface_model/microsoft/mdeberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(mdeberta_path, truncation_side='left', padding=True)
encoder_model = DebertaV2Model.from_pretrained(mdeberta_path)
encoder_model = encoder_model.to("cuda")

# One RouterDataset per training file; dataset_id is the file's position in
# `dataset_paths`, and data_types[dataset_id] is its matching data type.
router_datasets = []
for dataset_index, dataset_path in enumerate(dataset_paths):
    router_datasets.append(
        RouterDataset(dataset_path, data_type=data_types[dataset_index], dataset_id=dataset_index, size=2000)
    )

# Every dataset is built first, then each one is handed the shared tokenizer.
for dataset in router_datasets:
    dataset.register_tokenizer(tokenizer)

# Concatenate in `dataset_paths` order and iterate without shuffling, so the
# batches follow dataset order.
router_dataset = ConcatDataset(router_datasets)
router_dataloader = DataLoader(router_dataset, batch_size=64)

router_model = RouterModule(
    encoder_model,
    hidden_state_dim=768,
    node_size=len(router_datasets[0].router_node),
    similarity_function="cos",
)
router_model = router_model.to("cpu")

# Optional: restore trained router weights before moving the model to the GPU.
# state_dict = torch.load("/data/home/chensh/projects/LLM_router/logs/router_debug/split2_model5/old/sample_loss_weight_1_cos_top_k_1_last_k_3_learning_rate_0.00005_step_500_t_1/model.pth")
# router_model.load_state_dict(state_dict)
router_model.to('cuda')

# Inference only: make sure dropout and similar layers are disabled.
# torch.no_grad() below turns off gradient tracking but not training-mode layers.
router_model.eval()

# Collect one pooled embedding per sample, in dataloader order.
all_hidden_states = []
ids = []
with torch.no_grad():
    # Names avoid shadowing the builtins `input` and `id`, which would
    # otherwise stay overwritten for the rest of the kernel session.
    for batch_inputs, _, batch_ids in router_dataloader:
        # Keep the return value of .to(): tensors (and other non-in-place
        # containers) return a moved copy rather than moving themselves.
        batch_inputs = batch_inputs.to("cuda")
        token_states = router_model.backbone(**batch_inputs)['last_hidden_state']
        # Max-pool over dim=1; the alternative that was tried is taking
        # position 0 only, i.e. token_states[:, 0, :].
        # NOTE(review): the max runs over every position on dim 1, which
        # presumably includes padding positions when a batch is padded —
        # confirm whether masking with the attention mask is intended.
        pooled_states = torch.max(token_states, dim=1).values
        ids.append(batch_ids)
        all_hidden_states.append(pooled_states)

# Stack the per-batch results into a single tensor along dim 0.
all_hidden_states = torch.cat(all_hidden_states)

from MulticoreTSNE import MulticoreTSNE as M_TSNE
from openTSNE import TSNE
from sklearn.cluster import KMeans, DBSCAN
import json
import os
import numpy as np
import random as random

# Seed before any stochastic step. Previously the seed was set only after
# t-SNE had already run, so the embedding (and therefore every cluster id
# derived from it) changed from run to run.
seed = 41
random.seed(seed)
np.random.seed(seed)

np_hidden_states = all_hidden_states.cpu().numpy()

# Reduce the pooled embeddings to 5 dimensions before clustering.
# random_state is passed explicitly so the result does not depend on global
# RNG state left behind by earlier cells.
tsne_result2 = M_TSNE(n_components=5, n_jobs=12, random_state=seed).fit_transform(np_hidden_states)

n_clusters_list = [5]

# Number of samples each dataset contributed to the concatenated dataloader.
# Slicing the labels by these sizes (instead of a hard-coded 2000) keeps the
# labels aligned even if a dataset holds fewer samples than requested.
dataset_sizes = [len(dataset) for dataset in router_datasets]

for n_clusters in n_clusters_list:
    x = tsne_result2
    kmeans = KMeans(n_clusters=n_clusters, max_iter=1000, random_state=seed)

    # Cluster the sample data.
    kmeans.fit(x)

    # Get the clustering results (one label per sample, in dataloader order).
    labels = kmeans.labels_.tolist()

    # Guards against stale kernel state, e.g. embeddings left over from a run
    # with a different dataset list.
    assert len(labels) == sum(dataset_sizes), (
        f"got {len(labels)} labels for {sum(dataset_sizes)} samples"
    )

    # labels_split[i] holds the labels of dataset_paths[i]. When
    # n_clusters_list has several entries, the last one wins.
    labels_split = []
    offset = 0
    for dataset_size in dataset_sizes:
        labels_split.append(labels[offset: offset + dataset_size])
        offset += dataset_size



  return self.fget.__get__(instance, owner)()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
# Upper bound on samples written per dataset. Must match the per-dataset size
# used when the cluster labels in `labels_split` were computed (2000).
max_samples = 2000

# Same order as the dataset list used for clustering: labels_split[i] is read
# for datasets[i].
datasets = ["gsm8k-train.json","mmlu_train.json","humaneval_train.json","arc_challenge_train.json","cmmlu_train.json"]

# For each model count, copy the score-filtered datasets and attach a
# 'cluster_id' to every sample. range(3,4) currently yields only llm_number == 3.
for llm_number in range(3,4):
    target_output_path = f"/data/home/chensh/projects/LLM_router/datasets/llm_numbers/model_{llm_number}_2_cluster"
    os.makedirs(target_output_path, exist_ok=True)
    base_path = f"/data/home/chensh/projects/LLM_router/datasets/llm_numbers/model_{llm_number}_2"

    for i, data_path in enumerate(datasets):
        # Fail loudly on an unsupported file type. Previously a non-.json
        # entry skipped the load and silently reused the previous dataset's
        # samples.
        if not data_path.endswith('.json'):
            raise ValueError(f"unsupported dataset file (expected .json): {data_path}")

        base_data_path = os.path.join(base_path, data_path)
        cluster_ids = labels_split[i]

        with open(base_data_path, 'r') as f:
            sample_list = json.load(f)

        # Only the first `max_samples` samples are labelled and written.
        # NOTE(review): this assumes the j-th label corresponds to the j-th
        # sample in the file — confirm the dataset class preserves file order.
        samples = sample_list[:max_samples]
        if len(cluster_ids) < len(samples):
            raise ValueError(
                f"{data_path}: {len(samples)} samples but only {len(cluster_ids)} cluster ids"
            )

        new_sample_list = []
        for sample, cluster_id in zip(samples, cluster_ids):
            # Shallow-copy so the loaded record is not mutated through an alias.
            new_sample = dict(sample)
            new_sample['cluster_id'] = cluster_id
            new_sample_list.append(new_sample)

        with open(os.path.join(target_output_path, data_path), "w") as f:
            json.dump(new_sample_list, f)
