In [1]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import re

# Notebook configuration: model identifiers and the input dataset path.
# Replace these with your actual model names or paths.
base_model_name = 'meta-llama/Meta-Llama-3-8B'      # Base causal LM (Hub id or local path)
lora_model_name = 'fine-tuned-llama-lora'          # Path to your LoRA model
jsonl_file_path = 'qa.train.synthetic.jsonl'       # Path to your input JSONL file

# Configure 8-bit loading of the base model weights via bitsandbytes.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,    # Adjust as needed
    llm_int8_has_fp16_weight=False,
)

# Load the tokenizer that belongs to the base model; it is used as a global
# by refine_output() in a later cell.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the base model in 8-bit with device map for offloading
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map='auto',  # Automatically offloads layers to CPU/GPU
)

# Load the LoRA model on top of the base model. `model` is rebound to the
# PEFT-wrapped model, which is what refine_output() generates with.
model = PeftModel.from_pretrained(model, lora_model_name)
# Switch to evaluation mode. As the last expression of the cell this also
# displays the module tree (which shows Dropout inside the LoRA layers).
model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj)

In [2]:
def refine_output(context, question, initial_output, max_prompt_tokens=1024):
    """Ask the fine-tuned model to refine an initial answer to a question.

    The prompt is "Context / Question / Initial Answer / instruction" and ends
    with the cue ``Refined answer:``. If the prompt would exceed
    ``max_prompt_tokens``, only the *context* is shortened, so the question,
    the initial answer and the trailing cue are kept. Plain right-side
    truncation of the whole prompt would cut exactly those parts off.

    Relies on the module-level ``tokenizer`` and ``model`` created in the
    setup cell.

    Args:
        context: Passage the answer should be grounded in.
        question: Question being answered.
        initial_output: Answer to be refined.
        max_prompt_tokens: Token budget for the prompt (default 1024, the
            value that was previously hard-coded).

    Returns:
        The decoded text of ``outputs[0]`` with special tokens removed. The
        post-processing cell looks for the ``Refined answer:`` marker in this
        text, so the decoded sequence is expected to contain the prompt
        followed by the model's continuation.
    """
    def build_prompt(ctx):
        # Combine context, question, and initial output for the prompt
        return (
            f"Context: {ctx}\n"
            f"Question: {question}\n"
            f"Initial Answer: {initial_output}\n\n"
            "Please refine the initial answer to better answer the question given the context. If the answer is incorrect then correct it. Only output the refined answer. \nRefined answer:"
        )

    refinement_prompt = build_prompt(context)

    # If the prompt is over budget, trim the context (and only the context)
    # by the number of overflowing tokens plus a little slack, because
    # re-tokenizing the rebuilt prompt can differ by a few tokens.
    retokenize_slack = 8
    prompt_len = len(tokenizer(refinement_prompt)['input_ids'])
    if prompt_len > max_prompt_tokens:
        context_ids = tokenizer(context, add_special_tokens=False)['input_ids']
        overflow = prompt_len - max_prompt_tokens
        keep = max(0, len(context_ids) - overflow - retokenize_slack)
        shortened_context = tokenizer.decode(context_ids[:keep], skip_special_tokens=True)
        refinement_prompt = build_prompt(shortened_context)

    # Prepare the input for the model. truncation stays on as a last-resort
    # safety net (e.g. when question + initial answer alone exceed the budget).
    inputs = tokenizer(
        refinement_prompt,
        return_tensors='pt',
        truncation=True,
        max_length=max_prompt_tokens,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate the refined output (beam search, no gradient tracking)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode the output
    refined_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return refined_text


In [3]:
import json
import re
from tqdm import tqdm

def process_jsonl_file(jsonl_file_path, output_jsonl_path):
    """Refine the 'output' of every record in a JSONL file with the model.

    Each input line is a JSON object with an ``input`` string containing one
    or more ``Context: ...\\nQuestion: ...`` pairs and an ``output`` string
    holding the initial answer. The *last* context/question pair is taken,
    passed to ``refine_output`` together with the initial answer, and a record
    ``{"context", "question", "answer"}`` is written to the output file.

    Records are written (and flushed) as soon as they are produced, so an
    interruption of this long-running loop keeps the results computed so far.
    The input is read completely before the output file is opened.

    Note: a later cell defines another function with this same name; this
    definition is the refinement pass.

    Args:
        jsonl_file_path: Path of the input JSONL file.
        output_jsonl_path: Path of the JSONL file to (over)write.
    """
    # Read everything up front and ignore blank lines, which json.loads
    # would otherwise reject.
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        lines = [line for line in f if line.strip()]

    # A question may be terminated by a newline or by the end of the input
    # text; accepting both keeps the final pair from being skipped when the
    # input ends right after the question.
    pair_pattern = re.compile(r"Context:(.*?)\nQuestion:(.*?)(?:\n|\Z)", re.DOTALL)

    with open(output_jsonl_path, 'w', encoding='utf-8') as out_f:
        for line in tqdm(lines, desc="Processing JSONL lines"):
            data = json.loads(line)

            # Initial answer and the prompt text it was produced from
            output_field = data.get('output', '')
            input_field = data.get('input', '')

            # Use the last context/question pair; fall back to empty strings
            # when the input has no recognizable pair.
            context, question = "", ""
            matches = pair_pattern.findall(input_field)
            if matches:
                context, question = (part.strip() for part in matches[-1])

            # Refine the output
            refined_output = refine_output(context, question, output_field.strip())

            # Persist immediately so partial progress survives a crash
            record = {
                "context": context,
                "question": question,
                "answer": refined_output
            }
            out_f.write(json.dumps(record) + '\n')
            out_f.flush()
# Run the refinement pass over the input file configured in `jsonl_file_path`
# and write the results to "refinedssr.jsonl". Both arguments are required.
# Example usage: process_jsonl_file('qa.train.synthetic.jsonl', 'refinedssr.jsonl')
# The recorded run below took about 4h21m for 2000 lines (~7.8 s per line).
process_jsonl_file(jsonl_file_path, "refinedssr.jsonl")


Processing JSONL lines: 100%|██████████| 2000/2000 [4:21:16<00:00,  7.84s/it]  


In [1]:
import json
import re
from tqdm import tqdm

def process_jsonl_file(jsonl_file_path, output_jsonl_path):
    """Extract the refined answer from each record of a JSONL file.

    Each input line is a JSON object with ``context``, ``question`` and
    ``answer`` strings. Everything after the first ``Refined answer:`` marker
    in ``answer`` is taken as the refined answer; records without the marker
    get an empty string. The output file holds one
    ``{"context", "question", "refined_answer"}`` object per line.

    Note: this reuses the name of the refinement function defined in an
    earlier cell and replaces it in the kernel namespace.

    Args:
        jsonl_file_path: Path of the input JSONL file.
        output_jsonl_path: Path of the JSONL file to (over)write.
    """
    # The refinement prompt ends with "Refined answer:" and no trailing space,
    # so the text that follows may start with a space, a newline or nothing.
    # Allow any amount of whitespace after the marker instead of exactly one
    # space, which silently produced empty answers.
    marker = re.compile(r"Refined answer:\s*(.*)", re.DOTALL)

    # Read input JSONL file, skipping blank lines that json.loads would reject
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        lines = [line for line in f if line.strip()]

    refined_data_format = []
    for line in tqdm(lines, desc="Processing JSONL lines"):
        data = json.loads(line)

        answer = data.get("answer", "").strip()
        match = marker.search(answer)

        # Save the refined data
        refined_data_format.append({
            "context": data.get("context", "").strip(),
            "question": data.get("question", "").strip(),
            "refined_answer": match.group(1).strip() if match else ""
        })

    # Write the refined data to a new JSONL file
    with open(output_jsonl_path, 'w', encoding='utf-8') as out_f:
        out_f.writelines(json.dumps(item) + '\n' for item in refined_data_format)

# Replace these paths with the actual file paths.
# The input is the file written by the refinement pass; the output is the
# file later read by the clustering/sampling cell.
input_jsonl_path = "refinedssr.jsonl"  # Input JSONL path
output_jsonl_path = "refined_answers.jsonl"    # Output JSONL path

# Process the file: keep context/question and extract the refined answer
process_jsonl_file(input_jsonl_path, output_jsonl_path)


Processing JSONL lines: 100%|██████████| 2000/2000 [00:00<00:00, 160926.35it/s]


In [1]:
import jsonlines
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import os
from datasets import utils
from datasets import load_dataset


def generate_embeddings(data, model_name='all-MiniLM-L6-v2', batch_size=64):
    """Embed every record as the sentence "<context> <question>".

    Args:
        data: Sequence of dicts providing 'context' and 'question' entries.
        model_name: Name passed to ``SentenceTransformer``.
        batch_size: Number of texts encoded per ``encode`` call.

    Returns:
        The per-batch encoder outputs stacked row-wise with ``np.vstack``,
        in the same order as ``data``.
    """
    encoder = SentenceTransformer(model_name)

    # One text per record: context and question separated by a single space
    sentences = []
    for record in data:
        sentences.append(f"{record['context']} {record['question']}")

    # Encode slice by slice so the progress bar advances once per batch
    batch_starts = range(0, len(sentences), batch_size)
    encoded_batches = [
        encoder.encode(sentences[start:start + batch_size], convert_to_numpy=True)
        for start in tqdm(batch_starts, desc="Generating embeddings")
    ]

    return np.vstack(encoded_batches)


def cluster_and_sample_data(data, embeddings, sample_memory=200, n_clusters=20):
    """
    Clusters data using K-means and samples from clusters proportionally.

    Embeddings are L2-normalized, clustered with K-means, and from each
    cluster the points closest to the cluster center are kept. A cluster of
    ``count`` points is allotted ``round(sample_memory * count / len(data))``
    samples, but at least 1 and at most ``count``. Because of the rounding and
    the minimum of one per cluster, the total number of returned items can
    differ from ``sample_memory``.

    Args:
        data: Sequence of records, aligned index-by-index with ``embeddings``.
        embeddings: 2-D array with one embedding row per record.
        sample_memory: Target number of samples used for the proportional
            allotment.
        n_clusters: Requested number of K-means clusters; reduced to
            ``len(data)`` when there are fewer records than clusters.

    Returns:
        List of the selected records, grouped by cluster index and, within a
        cluster, ordered by increasing distance to the cluster center.
        An empty list when ``data`` is empty.
    """
    total_instances = len(data)
    if total_instances == 0:
        # Nothing to cluster; K-means would raise on an empty matrix.
        return []

    # K-means cannot form more clusters than there are samples.
    n_clusters = min(n_clusters, total_instances)

    # Normalize embeddings; leave all-zero rows untouched instead of turning
    # them into NaN through a division by zero.
    norms = np.linalg.norm(embeddings, axis=-1, keepdims=True)
    norms = np.where(norms == 0, 1.0, norms)
    embeddings_norm = embeddings / norms

    # Perform k-means clustering
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=0)
    labels = kmeans.fit_predict(embeddings_norm)

    # Compute distances to cluster centers
    centric_distances = np.linalg.norm(embeddings_norm - kmeans.cluster_centers_[labels], axis=1)

    # Count instances in each cluster
    n_cluster_instances = np.bincount(labels, minlength=n_clusters)

    # Determine number of samples per cluster proportionally
    clu_sample_num = [max(1, round(sample_memory * count / total_instances)) for count in n_cluster_instances]

    # Sample data points closest to cluster centers
    sampled_indices = []
    for clu_idx in range(n_clusters):
        cluster_indices = np.where(labels == clu_idx)[0]
        cluster_distances = centric_distances[cluster_indices]
        # Never ask for more points than the cluster holds (0 for an empty cluster)
        num_samples = min(clu_sample_num[clu_idx], len(cluster_indices))
        if num_samples > 0:
            # Get indices of closest points
            closest_indices = cluster_indices[np.argsort(cluster_distances)[:num_samples]]
            sampled_indices.extend(closest_indices)

    # Get sampled data
    sampled_data = [data[i] for i in sampled_indices]

    return sampled_data


def save_sampled_data(sampled_data, output_path):
    """
    Write the sampled records to ``output_path`` as JSONL and print a
    confirmation message naming that path.
    """
    jsonl_out = jsonlines.open(output_path, mode="w")
    with jsonl_out as sink:
        sink.write_all(sampled_data)

    confirmation = f"Sampled data saved to {output_path}"
    print(confirmation)


# Paths
refined_data_path = "refined_answers.jsonl"  # Replace with your refined synthetic JSONL path
output_sampled_path = "final_sampled.jsonl"    # Path to save the sampled output

# Parameters
sample_memory = 200  # Adjust based on your requirement
n_clusters = 20      # Number of clusters for K-means
embedding_model_name = 'all-MiniLM-L6-v2'  # Model for embeddings

# Step 1: Load refined data (one dict per JSONL line)
with jsonlines.open(refined_data_path, mode="r") as reader:
    refined_data = list(reader)

# Step 2: Generate embeddings with the model configured above, so that
# changing `embedding_model_name` actually takes effect
embeddings = generate_embeddings(refined_data, model_name=embedding_model_name)

# Step 3: Cluster and sample data
sampled_data = cluster_and_sample_data(refined_data, embeddings, sample_memory=sample_memory, n_clusters=n_clusters)

# Step 4: Save the sampled data
save_sampled_data(sampled_data, output_sampled_path)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 32/32 [00:01<00:00, 17.58it/s]


Sampled data saved to final_sampled.jsonl


In [5]:
# Upgrade huggingface-hub, transformers, sentence-transformers and accelerate
# to their latest releases (no versions are pinned, so results depend on the
# install date).
# NOTE(review): this cell ran as In [5], after cells that already import these
# packages — presumably the kernel must be restarted for the upgrade to apply.
!pip install --upgrade huggingface-hub transformers sentence-transformers accelerate

Collecting huggingface-hub
  Downloading huggingface_hub-0.26.3-py3-none-any.whl.metadata (13 kB)
Collecting transformers
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Using cached sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading huggingface_hub-0.26.3-py3-none-any.whl (447 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.6/447.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading transformers-4.47.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m77.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hUsing cached sentenc