In [None]:
!pip install -q transformers datasets torch

In [None]:
import os
import json
import random
import torch
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset

# Configuration
MODEL_DIR = "results/LAURA_1"  # Update with your specific directory
# Prefer the second GPU when the host has one. On a single-GPU host 'cuda:1'
# is not a valid device ordinal, so fall back to 'cuda:0'; use CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    DEVICE = torch.device('cpu')
MAX_NEW_TOKENS = 200  # default cap on generated tokens per prompt
FOLDER_PATH = '/workspace/slice-monorepo/cl_cr3/aligneddata'  # root of the JSON transcripts
TARGET_NAME = "LAURA"  # speaker whose turns serve as expected responses
NUM_EXAMPLES = 5  # number of examples requested from each dataset

# Restore the tokenizer and the trained model from MODEL_DIR, place the model
# on DEVICE, and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = GPTNeoXForCausalLM.from_pretrained(MODEL_DIR).to(DEVICE)
model.eval()

# Function to load real examples from the aligneddata dataset
def load_real_examples_from_aligneddata(folder_path, target_name, num_examples):
    """Collect (previous turn -> target turn) pairs from JSON transcripts.

    Walks ``folder_path`` recursively, reads every ``*.json`` file, and for each
    pair of consecutive turns whose second turn lists ``target_name`` in its
    ``NAMES`` records one example. Each file is read as a list of documents,
    each with a ``TURNS`` list whose items carry ``NAMES`` and ``UTTERANCES``.

    Args:
        folder_path: Root directory searched recursively for ``.json`` files.
        target_name: Name that must appear in the responding turn's ``NAMES``.
        num_examples: Maximum number of examples to return. Values <= 0
            return an empty list.

    Returns:
        A list of at most ``num_examples`` dicts with the keys
        ``"instruction"`` and ``"expected_response"``. Each value has the form
        ``"<first name of the turn>: <utterances joined by spaces>"``.
    """
    utterances = []
    # The limit is only checked after an append below, so reject non-positive
    # limits up front instead of returning one example for num_examples <= 0.
    if num_examples <= 0:
        return utterances
    for root, dirs, files in os.walk(folder_path):
        # Sort in place so the traversal (and therefore the examples picked)
        # does not depend on filesystem listing order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)
            # Read as UTF-8 rather than relying on the platform default encoding.
            with open(file_path, encoding='utf-8') as json_file:
                data = json.load(json_file)
            for document in data:
                turns = document['TURNS']
                # Pair every turn with the one that directly precedes it.
                for prev_turn, curr_turn in zip(turns, turns[1:]):
                    if target_name not in curr_turn['NAMES']:
                        continue
                    instruction = f"{prev_turn['NAMES'][0]}: " + " ".join(prev_turn['UTTERANCES'])
                    response = f"{curr_turn['NAMES'][0]}: " + " ".join(curr_turn['UTTERANCES'])
                    utterances.append({"instruction": instruction, "expected_response": response})
                    if len(utterances) >= num_examples:
                        return utterances
    return utterances

# Function to preprocess and load real examples from the dolly15k dataset
def load_real_examples_from_dolly15k(num_examples, seed=None):
    """Sample instruction/response examples from a held-out slice of dolly-15k.

    Loads ``databricks/databricks-dolly-15k``, takes 10% of its ``train`` split
    as a test slice, and draws a random sample of rows from that slice.

    Args:
        num_examples: Number of examples to draw. If it exceeds the size of the
            test slice, every available example is returned (in random order)
            instead of raising ``ValueError``; values <= 0 return ``[]``.
        seed: Optional integer. When given, it is used for both the split and
            the sampling so that repeated calls return the same examples. The
            default ``None`` keeps the original unseeded behavior.

    Returns:
        A list of dicts with the keys ``"instruction"`` (the instruction and
        context joined by a space, then stripped) and ``"expected_response"``.
    """
    dataset = load_dataset("databricks/databricks-dolly-15k")
    test_dataset = dataset['train'].train_test_split(test_size=0.1, seed=seed)['test']

    def preprocess_function(examples):
        # Keep only the three text fields; entries that are themselves lists
        # are joined into a single space-separated string.
        instruction = examples['instruction']
        context = examples.get('context', "")
        response = examples['response']

        if isinstance(instruction, list):
            instruction = [" ".join(ins) if isinstance(ins, list) else ins for ins in instruction]
        if isinstance(context, list):
            context = [" ".join(con) if isinstance(con, list) else con for con in context]

        return {"instruction": instruction, "context": context, "response": response}

    test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

    examples = [
        {"instruction": f"{row['instruction']} {row['context']}".strip(), "expected_response": row['response']}
        for row in test_dataset
    ]

    # random.sample raises ValueError for a negative size or one larger than
    # the population; clamp so the function returns what is available, like
    # the aligneddata loader does.
    sample_size = max(0, min(num_examples, len(examples)))
    # With no seed, keep using the module-level generator as before.
    rng = random if seed is None else random.Random(seed)
    return rng.sample(examples, sample_size)

# Request NUM_EXAMPLES examples from each source
real_examples_aligneddata = load_real_examples_from_aligneddata(
    folder_path=FOLDER_PATH,
    target_name=TARGET_NAME,
    num_examples=NUM_EXAMPLES,
)
real_examples_dolly15k = load_real_examples_from_dolly15k(num_examples=NUM_EXAMPLES)

# One evaluation list: aligneddata examples first, then dolly15k examples
real_examples = [*real_examples_aligneddata, *real_examples_dolly15k]

# Function to generate a response from the model
def generate_response(prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Run the module-level model on ``prompt`` and return the decoded text.

    Args:
        prompt: Text that is encoded with the module-level tokenizer.
        max_new_tokens: Cap on the number of tokens generated for this prompt.

    Returns:
        The first generated sequence, decoded with special tokens skipped and
        with surrounding whitespace stripped.
        NOTE(review): callers remove the prompt from this string, so it
        presumably still contains the prompt text - confirm.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors='pt').to(DEVICE)
    # A single unpadded sequence: mark every position as attended.
    full_mask = torch.ones(prompt_ids.shape, device=DEVICE)
    generation_kwargs = dict(
        input_ids=prompt_ids,
        attention_mask=full_mask,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        generated = model.generate(**generation_kwargs)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()

# Perform inference and print the results
print("\nInference on real examples from both datasets:")
results_path = os.path.join(MODEL_DIR, "test_results.txt")
for example in real_examples:
    response = generate_response(example["instruction"])
    # Remove the prompt only where it leads the decoded text. str.replace would
    # also delete any later repetition of the prompt inside the continuation
    # and misreport what the model actually produced. The response has been
    # whitespace-stripped, so compare against the stripped prompt.
    prompt_text = example["instruction"].strip()
    if response.startswith(prompt_text):
        model_response = response[len(prompt_text):].strip()
    else:
        model_response = response.strip()
    print(f"Instruction: {example['instruction']}")
    print(f"Expected Response: {example['expected_response']}")
    print(f"Model Response: {model_response}\n")

    # Save test results to file. Appending once per example keeps partial
    # results if the run is interrupted; UTF-8 avoids encode errors on
    # non-ASCII model output under a non-UTF-8 platform default.
    with open(results_path, "a", encoding="utf-8") as file:
        file.write(f"Instruction: {example['instruction']}\n")
        file.write(f"Expected Response: {example['expected_response']}\n")
        file.write(f"Model Response: {model_response}\n\n")


In [None]:
import torch
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
import os

# Directory where the model and tokenizer are saved
directory_name = "results/EleutherAI-pythia-70m_4"  # Update with your specific directory

# Tokenizer and model are both restored from that same directory
tokenizer = AutoTokenizer.from_pretrained(directory_name)
model = GPTNeoXForCausalLM.from_pretrained(directory_name)

# Hold out 10% of the dolly-15k 'train' split as the test set.
# NOTE(review): no seed is passed to the split, so the held-out rows presumably
# differ from run to run (and from any split used at training time) - confirm.
dataset = load_dataset("databricks/databricks-dolly-15k")
dolly_split = dataset['train'].train_test_split(test_size=0.1)
test_dataset = dolly_split['test']

def preprocess_function(examples):
    """Tokenize a batch of rows into fixed-length prompt ids and label ids.

    The prompt for each row is ``"<instruction> <context>"`` with surrounding
    whitespace stripped. Prompts and responses are each truncated or padded to
    1000 tokens by the module-level tokenizer.

    Args:
        examples: Mapping with ``'instruction'`` and ``'response'`` entries and
            an optional ``'context'`` entry (batched, i.e. lists of values).

    Returns:
        The tokenizer output for the prompts, with the response token ids
        stored under ``'labels'``.
    """
    def _join_nested(values):
        # Entries that are themselves lists become one space-joined string;
        # anything that is not a list is passed through untouched.
        if not isinstance(values, list):
            return values
        return [" ".join(item) if isinstance(item, list) else item for item in values]

    instruction = _join_nested(examples['instruction'])
    context = _join_nested(examples.get('context', ""))
    response = examples['response']

    prompts = []
    for ins, con in zip(instruction, context):
        prompts.append(f"{ins} {con}".strip())

    # Same truncation/padding settings for the labels and the prompts.
    tokenizer_options = dict(truncation=True, padding='max_length', max_length=1000)
    label_ids = tokenizer(response, **tokenizer_options).input_ids

    tokenized = tokenizer(prompts, **tokenizer_options)
    tokenized['labels'] = label_ids
    return tokenized

# Replace the raw columns with the tokenized fields from preprocess_function
raw_column_names = test_dataset.column_names
test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_column_names,
)

def collate_fn(batch):
    """Stack the per-example lists of a batch into tensors.

    Args:
        batch: Sequence of dicts, each holding equal-length lists under
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        A dict with those same three keys, each mapped to a tensor built from
        the corresponding lists of every item in ``batch``.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.tensor([item[field] for item in batch])
        for field in field_names
    }

# batch_size=1 is relied on below when padding is stripped from each prompt.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

# Perform inference on 5 random samples from the test set
# Fall back to CPU when no GPU is present instead of failing on 'cuda:0'.
eval_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(eval_device)
model.eval()

results_path = os.path.join(directory_name, "test_results.txt")

print("\nFinal Test on 5 random samples from the test set:")
for i, batch in enumerate(test_dataloader):
    if i >= 5:
        break
    input_ids = batch['input_ids'].to(eval_device)
    attention_mask = batch['attention_mask'].to(eval_device)
    labels = batch['labels'].to(eval_device)

    # Prompts were padded to a fixed length during preprocessing. Generating
    # from a padded prompt makes the model continue from a padding position,
    # so keep only the positions the attention mask marks as real tokens.
    real_positions = attention_mask[0].bool()
    input_ids = input_ids[:, real_positions]
    attention_mask = attention_mask[:, real_positions]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=200, eos_token_id=tokenizer.eos_token_id)

    decoded_input = tokenizer.decode(input_ids[0], skip_special_tokens=True)

    # The returned sequence starts with the prompt tokens; drop them by token
    # count rather than by the length of the decoded prompt string, which is
    # only correct when decoding reproduces the prompt character for character.
    new_tokens = output[0][input_ids.shape[1]:]
    model_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    expected_output = tokenizer.decode(labels[0], skip_special_tokens=True)

    print(f"Input: {decoded_input}")
    print(f"Expected Output: {expected_output}")
    print(f"Model Output: {model_response}\n")

    # Save test results to file (appended per sample so partial runs are kept)
    with open(results_path, "a", encoding="utf-8") as file:
        file.write(f"Input: {decoded_input}\n")
        file.write(f"Expected Output: {expected_output}\n")
        file.write(f"Model Output: {model_response}\n\n")


In [None]:
import os
import json
import random
import torch
from transformers import GPTNeoXForCausalLM, AutoTokenizer

# Configuration
MODEL_DIR = "/workspace/slice-monorepo/cl_cr3/llms/gpt/results/LAURA_1"
# Prefer the second GPU when the host has one. On a single-GPU host 'cuda:1'
# is not a valid device ordinal, so fall back to 'cuda:0'; use CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    DEVICE = torch.device('cpu')
MAX_NEW_TOKENS = 50  # default cap on generated tokens per prompt
FOLDER_PATH = '/workspace/slice-monorepo/cl_cr3/aligneddata'  # root of the JSON transcripts
TARGET_NAME = "LAURA"  # speaker whose turns serve as expected responses
NUM_EXAMPLES = 5  # number of examples requested from the dataset

# Function to load real examples from the dataset
def load_real_examples(folder_path, target_name, num_examples):
    """Collect (previous turn -> target turn) pairs from JSON transcripts.

    Walks ``folder_path`` recursively, reads every ``*.json`` file, and for each
    pair of consecutive turns whose second turn lists ``target_name`` in its
    ``NAMES`` records one example. Each file is read as a list of documents,
    each with a ``TURNS`` list whose items carry ``NAMES`` and ``UTTERANCES``.

    Args:
        folder_path: Root directory searched recursively for ``.json`` files.
        target_name: Name that must appear in the responding turn's ``NAMES``.
        num_examples: Maximum number of examples to return. Values <= 0
            return an empty list.

    Returns:
        A list of at most ``num_examples`` dicts with the keys
        ``"instruction"`` and ``"expected_response"``. Each value has the form
        ``"<first name of the turn>: <utterances joined by spaces>"``.
    """
    utterances = []
    # The limit is only checked after an append below, so reject non-positive
    # limits up front instead of returning one example for num_examples <= 0.
    if num_examples <= 0:
        return utterances
    for root, dirs, files in os.walk(folder_path):
        # Sort in place so the traversal (and therefore the examples picked)
        # does not depend on filesystem listing order.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)
            # Read as UTF-8 rather than relying on the platform default encoding.
            with open(file_path, encoding='utf-8') as json_file:
                data = json.load(json_file)
            for document in data:
                turns = document['TURNS']
                # Pair every turn with the one that directly precedes it.
                for prev_turn, curr_turn in zip(turns, turns[1:]):
                    if target_name not in curr_turn['NAMES']:
                        continue
                    instruction = f"{prev_turn['NAMES'][0]}: " + " ".join(prev_turn['UTTERANCES'])
                    response = f"{curr_turn['NAMES'][0]}: " + " ".join(curr_turn['UTTERANCES'])
                    utterances.append({"instruction": instruction, "expected_response": response})
                    if len(utterances) >= num_examples:
                        return utterances
    return utterances

# Restore the tokenizer and the trained model from MODEL_DIR, place the model
# on DEVICE, and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = GPTNeoXForCausalLM.from_pretrained(MODEL_DIR).to(DEVICE)
model.eval()

# Function to generate a response from the model
def generate_response(prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Run the module-level model on ``prompt`` and return the decoded text.

    Args:
        prompt: Text that is encoded with the module-level tokenizer.
        max_new_tokens: Cap on the number of tokens generated for this prompt.

    Returns:
        The first generated sequence, decoded with special tokens skipped and
        with surrounding whitespace stripped.
        NOTE(review): callers remove the prompt from this string, so it
        presumably still contains the prompt text - confirm.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors='pt').to(DEVICE)
    # A single unpadded sequence: mark every position as attended.
    full_mask = torch.ones(prompt_ids.shape, device=DEVICE)
    generation_kwargs = dict(
        input_ids=prompt_ids,
        attention_mask=full_mask,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        generated = model.generate(**generation_kwargs)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()

# Load real examples from the dataset
real_examples = load_real_examples(FOLDER_PATH, TARGET_NAME, NUM_EXAMPLES)

# Perform inference and print the results
for example in real_examples:
    response = generate_response(example["instruction"])
    # Remove the prompt only where it leads the decoded text. str.replace would
    # also delete any later repetition of the prompt inside the continuation
    # and misreport what the model actually produced. The response has been
    # whitespace-stripped, so compare against the stripped prompt.
    prompt_text = example["instruction"].strip()
    if response.startswith(prompt_text):
        model_response = response[len(prompt_text):].strip()
    else:
        model_response = response.strip()
    print(f"Instruction: {example['instruction']}")
    print(f"Expected Response: {example['expected_response']}")
    print(f"Model Response: {model_response}\n")
