In [None]:
!pip install unsloth
!pip install datasets
from datasets import load_dataset, Dataset
import pandas as pd
import re
from unsloth import FastLanguageModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Dict, Optional
import wandb
from dataclasses import dataclass
import random
import time
import pickle
import requests
## RL specific
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)


# Mount Google Drive at /content/drive (Colab only). Later cells read the LoRA
# adapter checkpoint from, and write the judge CSVs to, paths under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# 4-bit NF4 quantization (double-quantized, fp16 compute) used for inference
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Base checkpoint: tokenizer first, then the quantized causal-LM weights
base_model_name = "unsloth/mistral-7b-v0.3-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Attach the fine-tuned LoRA adapter stored on Drive on top of the base model
adapter_path = "/content/drive/MyDrive/models/ai_safety/finetuned_model_0131/mistral7b_finetuned/checkpoints/checkpoint-505"
model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    device_map="auto",
    torch_dtype=torch.float16,
)

# To fold the adapter into the base weights for faster inference, uncomment:
# model = model.merge_and_unload()

# Switch to inference mode; as the cell's last expression this also displays
# the model repr.
model.eval()



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096, padding_idx=770)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
   

In [None]:
# Importing red_teaming_pipeline
!git clone https://github.com/sofasogood/red_teaming_pipeline.git
%cd red_teaming_pipeline
!pip install -e .
!pip install -r requirements.txt

In [8]:
from dataset_creation.data_processing_functions.data_creation import extract_question
from dataset_creation.data_processing_functions.data_creation_rl import build_instruct_prompt, generate_response,AIJudge, JudgementChoice, JudgementError




In [5]:
import os

OPENROUTER_API_URL = "https://openrouter.ai/api/v1/completions"
# Prefer the OPENROUTER_API_KEY environment variable so a real key never has to
# be pasted into (and saved with) the notebook. The placeholder remains the
# fallback for anyone who still edits the key in by hand; an unset or empty
# variable falls through to it.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") or "insert your key here"

In [6]:
# Load the red-team transcripts from Anthropic/hh-rlhf and derive a "question"
# column from each transcript with extract_question.
dataset_red_team = load_dataset("Anthropic/hh-rlhf", data_dir="red-team-attempts")
dataset_red_team_train = dataset_red_team["train"].to_pandas()
dataset_red_team_train["question"] = dataset_red_team_train["transcript"].apply(extract_question)

# Rows scoring below 0.8 on task-description harmlessness, lowest score first.
# The column spelling ("descripton") is kept exactly as the code uses it —
# presumably it matches the dataset schema.
dataset_red_team_train_subset = (
    dataset_red_team_train
    .loc[lambda frame: frame["task_descripton_harmlessness_score"] < 0.8]
    .sort_values(by="task_descripton_harmlessness_score")
)

In [13]:
# In-memory accumulator of AI-judge results: one dict per judged prompt with
# the keys "idx", "prompt", "responseA", "responseB" and "winner". The
# generation loop appends to it and save_to_csv() writes it out.
# NOTE: re-running this cell resets the list and discards any unsaved results.
ai_feedback_data = []  # store pairs + preference


In [11]:
import pandas as pd
def save_to_csv(data=None, out_dir="/content/drive/MyDrive/AI_SAFETY/"):
  """Write judged response pairs to a CSV checkpoint.

  The file is named ``df_llm_judge_{start}_{end}.csv``, where ``start`` and
  ``end`` are the ``"idx"`` values of the first and last records.

  Args:
    data: List of record dicts, each containing an ``"idx"`` key. Defaults to
      the notebook-level ``ai_feedback_data`` list, looked up at call time.
    out_dir: Existing directory that receives the CSV. It is deliberately not
      created here, so an unmounted Drive fails loudly instead of silently
      writing to local disk.

  Returns:
    The path of the file written, or ``None`` when there was nothing to save.
  """
  import os  # local import keeps this cell runnable on its own

  if data is None:
    data = ai_feedback_data
  if not data:
    # Nothing to checkpoint. Writing here would create an empty
    # df_llm_judge_0_0.csv and could clobber a real checkpoint of that name.
    return None

  start_idx = data[0]["idx"]
  end_idx = data[-1]["idx"]
  out_path = os.path.join(out_dir, f"df_llm_judge_{start_idx}_{end_idx}.csv")
  pd.DataFrame(data).to_csv(out_path)
  return out_path

# Write whatever ai_feedback_data currently holds to the Drive output folder.
save_to_csv()

In [None]:
# NOTE(review): prompts come from the full training frame, not from
# dataset_red_team_train_subset built earlier — confirm that is intended.
harmful_prompts = dataset_red_team_train["question"].tolist()

SAVE_EVERY = 50  # checkpoint to CSV whenever idx is a multiple of this


def _timed_response(label, prompt):
    """Generate one response from the local model, printing it and its latency."""
    started = time.time()
    response = generate_response(prompt=prompt, model=model, tokenizer=tokenizer, use_local_model=True)
    elapsed = time.time() - started
    print(f"  Response {label} >> {response}", flush=True)
    print(f"    Time taken: {elapsed} seconds", flush=True)
    print("---", flush=True)
    return response


# Indices that already have a record, so re-running this cell resumes rather
# than redoing work. Built once as a set instead of a fresh list per prompt.
already_done = {entry["idx"] for entry in ai_feedback_data}

# One judge for the whole run.
# NOTE(review): assumes AIJudge keeps no per-call state — confirm.
judge = AIJudge(api_key=OPENROUTER_API_KEY)

unsaved = 0  # records appended since the last checkpoint
try:
    for idx, prompt in enumerate(harmful_prompts):
        if idx in already_done:
            continue

        start_time_all = time.time()
        print(f"Prompt {idx}: {prompt}", flush=True)
        print("---", flush=True)

        # The same prompt is generated twice; the two outputs form the pair
        # handed to the judge.
        responseA = _timed_response("A", prompt)
        responseB = _timed_response("B", prompt)

        start_time_winner = time.time()
        try:
            winner = judge.judge(prompt, responseA, responseB)
        except Exception as e:
            # Best effort: a failed judgement is recorded as INVALID instead of
            # aborting the whole run.
            print(f"Error in ai_judge: {str(e)}")
            winner = JudgementChoice.INVALID
        end_time_winner = time.time()
        print(f"Winner: {winner}", flush=True)
        print(f"    Time taken: {end_time_winner - start_time_winner} seconds", flush=True)
        end_time = time.time()
        print(f"Total time taken: {end_time - start_time_all} seconds", flush=True)
        print("========", flush=True)

        ai_feedback_data.append({
            "idx": idx,
            "prompt": prompt,
            "responseA": responseA,
            "responseB": responseB,
            "winner": winner
        })
        already_done.add(idx)
        unsaved += 1

        if idx % SAVE_EVERY == 0:
            save_to_csv()
            unsaved = 0
finally:
    # Flush anything gathered since the last checkpoint — also when the loop is
    # interrupted or an error escapes — so the tail of the run is not lost.
    if unsaved:
        save_to_csv()
