In [1]:
# Fetch the MeZO repository; later cells change into MeZO/medium_models and run its scripts.
# NOTE(review): git refuses to clone into an existing non-empty MeZO/ directory, so this
# line presumably errors on a re-run in the same working directory — confirm that is acceptable.
!git clone https://github.com/princeton-nlp/MeZO.git

Cloning into 'MeZO'...
remote: Enumerating objects: 173, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 173 (delta 45), reused 30 (delta 30), pack-reused 98 (from 1)[K
Receiving objects: 100% (173/173), 432.68 KiB | 8.16 MiB/s, done.
Resolving deltas: 100% (88/88), done.


In [None]:
# Install the latest compatible PyTorch version with CUDA 11.8
!pip install torch --index-url https://download.pytorch.org/whl/cu118
!pip install transformers==4.28.1
!sudo apt-get install jq
!pip install loralib
!pip install nvidia-ml-py3
!pip install datasets

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib.backends.backend_pdf import PdfPages
import time
import csv
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetPowerUsage, nvmlShutdown
import os
import subprocess
import torch
from pynvml import *
from threading import Thread
import re
import json
import numpy as np
from pathlib import Path
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, AutoModel
from datasets import load_dataset
import torch.nn.utils.prune as prune
from pathlib import Path
import matplotlib.pyplot as plt
import random
import matplotlib.colors as mcolors

In [None]:
%cd MeZO/medium_models/data
!bash download_dataset.sh

In [21]:
# Generate the SST-2 k-shot data with k=512 and seed=1 in "k-shot-1k-test" mode.
# The script path is relative, so the kernel's working directory presumably has to be
# MeZO/medium_models; the commented-out %cd below is for when the kernel sits in MeZO/.
# %cd medium_models/

!python tools/generate_k_shot_data.py  --mode k-shot-1k-test --k 512 --task SST-2 --seed 1

K = 512
Seed = 1
| Task = SST-2


In [18]:
import os
import random

def split_train_tsv(file_path, num_parts=4, seed=42):
    """Split a two-label TSV file into ``num_parts`` disjoint, class-balanced shards.

    The first line of ``file_path`` is treated as a header and copied to every
    shard. Each remaining non-blank row is grouped by its last tab-separated
    field (the label). Every shard receives ``min(rows per label) // num_parts``
    rows from each label; rows beyond that are left out of all shards.

    Shards are written next to the input as ``<stem>_part<i><ext>`` (for example
    ``train.tsv`` -> ``train_part1.tsv``) and each written path is printed. The
    input file itself is never modified.

    Args:
        file_path: Path of the TSV file to split.
        num_parts: Number of shards to write.
        seed: Seed of the private random generator used for all shuffling.

    Raises:
        IndexError: If the file is empty (no header line).
        AssertionError: If the data rows do not contain exactly two labels.
    """
    # A private generator produces the same shuffles random.seed(seed) would, but
    # does not reset the global RNG state that other notebook cells may depend on.
    rng = random.Random(seed)

    with open(file_path, "r") as f:
        lines = f.readlines()

    if not lines:
        raise IndexError(f"{file_path} is empty: expected a header line followed by data rows")

    header = lines[0]
    if not header.endswith("\n"):
        header += "\n"

    # Group rows by label (last tab-separated field).
    label_dict = {}
    for line in lines[1:]:
        if not line.strip():
            # A blank line would otherwise be registered as a bogus "" label.
            continue
        if not line.endswith("\n"):
            # An unterminated final row would fuse with its neighbour once shuffled.
            line += "\n"
        label = line.strip().split('\t')[-1]
        label_dict.setdefault(label, []).append(line)

    # Ensure we have only two labels
    assert len(label_dict) == 2, f"Expected binary classification, found labels: {list(label_dict.keys())}"

    # Shuffle each label group (in first-seen label order, as before).
    for label in label_dict:
        rng.shuffle(label_dict[label])

    # The smaller class bounds how many rows per label each shard can receive.
    min_len = min(len(rows) for rows in label_dict.values())
    part_len = min_len // num_parts

    # Derive shard names from the file's own stem and extension. A substring replace
    # on the whole path returned the input path unchanged for any file not named
    # "train.tsv", which made every shard overwrite the source file.
    root, ext = os.path.splitext(file_path)

    for i in range(num_parts):
        part_lines = []
        for label in sorted(label_dict):
            part_lines.extend(label_dict[label][i * part_len:(i + 1) * part_len])

        # Mix the two labels before saving.
        rng.shuffle(part_lines)

        part_path = f"{root}_part{i + 1}{ext}"
        with open(part_path, "w") as out:
            out.write(header)
            out.writelines(part_lines)

        print(f"Saved: {part_path}")

# Split the generated 512-shot / seed-1 SST-2 training file with the default settings
# (num_parts=4, seed=42); the shards are written beside it as train_part1.tsv ... train_part4.tsv.
# Replace this path with your actual generated file
split_train_tsv("data/k-shot-1k-test/SST-2/512-1/train.tsv")


Saved: data/k-shot-1k-test/SST-2/512-1/train_part1.tsv
Saved: data/k-shot-1k-test/SST-2/512-1/train_part2.tsv
Saved: data/k-shot-1k-test/SST-2/512-1/train_part3.tsv
Saved: data/k-shot-1k-test/SST-2/512-1/train_part4.tsv


In [20]:
# Manually rotate the active training file of the 128-1 split: park the current train.tsv
# as train_part1.tsv, then promote train_part2.tsv to train.tsv.
# NOTE(review): the first mv replaces any existing train_part1.tsv, and the cell is not
# re-runnable (train_part2.tsv no longer exists after the first run) — confirm intent.
# !rm data/k-shot-1k-test/SST-2/128-42/train.tsv
!mv data/k-shot-1k-test/SST-2/128-1/train.tsv data/k-shot-1k-test/SST-2/128-1/train_part1.tsv
!mv data/k-shot-1k-test/SST-2/128-1/train_part2.tsv data/k-shot-1k-test/SST-2/128-1/train.tsv

In [None]:
# Clear previously cached files from the data directory, then launch one MeZO run.
import os
import glob

# Specify the directory where the cached files are located
# NOTE(review): caches are cleared under 512-1, but the run below is started with
# K=128 SEED=1; if mezo.sh reads from a 128-1 directory its cached files are left
# untouched — confirm which directory is intended.
directory = "data/k-shot-1k-test/SST-2/512-1/"

# Use glob to find all files starting with "cached"
cached_files = glob.glob(os.path.join(directory, "cached*"))

# Delete them
# Best effort: a file that cannot be removed is reported and the loop continues.
for file_path in cached_files:
    try:
        os.remove(file_path)
        print(f"Deleted: {file_path}")
    except Exception as e:
        print(f"Error deleting {file_path}: {e}")
        
# Run mezo.sh; its settings are passed as environment variables on the command line.
!WANDB_MODE=offline TASK=SST-2 K=128 SEED=1 BS=64 LR=1e-6 STEP=10 EVAL_STEP=10 MODEL=roberta-large bash mezo.sh

In [None]:
def main(num_rounds=6, clients=1):
    """Run federated averaging of MeZO fine-tuning over the pre-split SST-2 shards.

    In every round each client, in turn:

    1. removes ``cached*`` files from the data directory,
    2. moves its shard ``train_part<client_id>.tsv`` to ``train.tsv`` (replacing any
       ``train.tsv`` already there),
    3. runs ``mezo.sh`` through the IPython shell, starting from ``roberta-large`` in
       round 1 and from the previous round's averaged model (``saved_model``) afterwards,
    4. moves ``train.tsv`` back to the shard name,
    5. loads the checkpoint found in ``output_model`` (presumably written there by
       ``mezo.sh`` — verify), adds its floating-point tensors to a running sum, and
       deletes ``output_model``.

    After the last client the floating-point sums are divided by ``clients``, loaded
    into a fresh ``roberta-large`` classification model and saved, together with the
    last client's tokenizer, to ``saved_model``. Non-floating-point entries are not
    averaged; the last client's value is kept.

    Must be called from an IPython session (uses ``get_ipython()``) whose working
    directory contains ``mezo.sh`` and ``data/``.

    Args:
        num_rounds: Number of federated rounds to run.
        clients: Number of clients per round; client ``i`` trains on ``train_part<i>.tsv``.

    Raises:
        ValueError: If ``clients`` is smaller than 1.
        FileNotFoundError: If a client's shard file does not exist.
    """
    import glob
    import shutil

    if clients < 1:
        raise ValueError(f"clients must be >= 1, got {clients}")

    # Settings of the k-shot split and of every local MeZO run. They are interpolated
    # into both the data path and the mezo.sh command so the two cannot drift apart
    # (previously unused locals said K=16 / SEED=42 while the command used 512 / 1).
    TASK = "SST-2"
    K = 512
    SEED = 1
    STEP = 10
    EVAL_STEP = 10

    parent_folder = "output_model"
    global_model_output_dir = "saved_model"
    data_dir = f"data/k-shot-1k-test/{TASK}/{K}-{SEED}"
    train_path = os.path.join(data_dir, "train.tsv")

    # NOTE: two optional steps used to be sketched here as disabled code: seeding
    # saved_model with a fresh roberta-large before round 1, and pruning the global
    # model at the start of each round (prune_model / count_parameters, which are not
    # defined in this section). Neither is performed.

    for round_num in range(1, num_rounds + 1):
        print(f"Starting Round {round_num}")
        averaged_state_dict = {}

        # Round 1 starts from the public checkpoint, later rounds from the last average.
        start_model = "roberta-large" if round_num == 1 else global_model_output_dir

        for client_id in range(1, clients + 1):
            print(f"------------------------------------Executing command for Client {client_id} in Round {round_num}:")

            # Drop cached* files before switching shards (presumably so the run does
            # not reuse data cached for another client's shard — verify).
            for file_path in glob.glob(os.path.join(data_dir, "cached*")):
                try:
                    os.remove(file_path)
                    print(f"Deleted: {file_path}")
                except OSError as e:
                    print(f"Error deleting {file_path}: {e}")

            if round_num == 1:
                print("we are here")
            command = (
                f"WANDB_MODE=offline TASK={TASK} K={K} SEED={SEED} BS=64 LR=1e-6 "
                f"STEP={STEP} EVAL_STEP={EVAL_STEP} MODEL={start_model} bash mezo.sh"
            )

            # Expose this client's shard as train.tsv for the duration of the run and
            # always move it back, even if the run is interrupted.
            shard_path = os.path.join(data_dir, f"train_part{client_id}.tsv")
            os.replace(shard_path, train_path)
            try:
                get_ipython().system(command)
            finally:
                os.replace(train_path, shard_path)

            model = RobertaForSequenceClassification.from_pretrained(f"{parent_folder}")
            tokenizer = RobertaTokenizer.from_pretrained(f"{parent_folder}")
            state_dict = model.state_dict()

            print("create average dic")
            if not averaged_state_dict:
                # First client of the round: start float sums at zero; integer
                # buffers cannot be averaged and are copied instead.
                for key, tensor in state_dict.items():
                    if tensor.is_floating_point():
                        averaged_state_dict[key] = torch.zeros_like(tensor)
                    else:
                        averaged_state_dict[key] = tensor.clone()

            print("summation")
            for key, tensor in state_dict.items():
                try:
                    if tensor.is_floating_point():
                        averaged_state_dict[key] += tensor
                    else:
                        averaged_state_dict[key] = tensor.clone()
                except Exception as e:
                    # Name the offending key before propagating the error.
                    print(f"Error processing key {key}: {str(e)}")
                    raise

            # The client's checkpoint has been folded into the sum; free the directory
            # for the next client (no error if it is already gone).
            shutil.rmtree(parent_folder, ignore_errors=True)

        print("go for averaging")
        # Average the parameters: divide EVERY floating-point sum by the client count.
        # The previous code divided only the single key left over from the loop above.
        for tensor in averaged_state_dict.values():
            if tensor.is_floating_point():
                tensor.div_(clients)

        global_model = RobertaForSequenceClassification.from_pretrained('roberta-large')
        global_model.load_state_dict(averaged_state_dict)
        print("update")
        global_model.save_pretrained(global_model_output_dir)
        tokenizer.save_pretrained(global_model_output_dir)
        print("done saving in this round")

    print("All rounds completed.")
    print("Generate Plots")

# Start the run when this cell is executed (in a notebook or script __name__ is
# "__main__"); importing the code as a module does not trigger it.
if __name__ == "__main__":
    main()

In [4]:
# Change the kernel's working directory to MeZO/medium_models; the data-preparation and
# training cells use paths relative to it (tools/, data/, mezo.sh).
# NOTE(review): this cell is last in the notebook but carries execution count 4, so it
# was presumably run before those cells — consider moving it above them so a fresh
# "Restart & Run All" executes in the right directory.
%cd MeZO/medium_models/

/home/javad/MeZO/medium_models
