In [2]:
import numpy as np
import glob
import re
import random
import string
import torch
import openai

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize

from utils import dump_to_jsonlines, query_llm, query_opt, query_flan_t5
from transformers import pipeline, set_seed, T5Tokenizer, T5ForConditionalGeneration
import pickle
from transformers import AutoModelForCausalLM, AutoTokenizer

2023-12-12 19:08:04.315079: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-12 19:08:14.028210: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-12 19:08:14.028245: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-12 19:08:14.707236: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-12 19:08:16.026894: I tensorflow/core/platform/cpu_feature_guar

In [84]:
# Names of the models to evaluate. The evaluation loop routes 'opt-1.3b' to the
# baseline generator and every other name to the fine-tuned generator.
TEST_MODELS = ['opt-1.3b','optimized-opt-2.7b']

# Number of random passages sampled from the training text.
NUM_SAMPLES = 1
# Number of leading characters of each passage that are used as the prompt.
PROMPT_LENGTH = 64
# Number of characters of the response and of the ground truth that are compared.
CHECK_LENGTH = 512

In [1]:
def remove_page_lines(text):
    """Remove page-marker lines such as ``Page |  12`` from ``text``.

    A line is dropped when it starts with the literal ``Page |`` followed by
    two spaces and at least one digit. All other lines are kept unchanged and
    in their original order.

    Args:
        text: The input string; lines are separated by ``'\\n'``.

    Returns:
        The text with the page-marker lines removed, re-joined with ``'\\n'``.
    """
    # The pipe must be escaped (a bare `|` is regex alternation) and the digit
    # class needs its backslash; otherwise every line beginning with "Page "
    # or " d" would be discarded.
    page_marker = re.compile(r'^Page \|  \d+')

    kept_lines = [line for line in text.split('\n') if not page_marker.match(line)]
    return '\n'.join(kept_lines)

def text_similarity(model, text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        model: Object exposing ``encode(list_of_texts)``; it is called once
            with both texts and its first two results are compared.
        text1: First text.
        text2: Second text.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    embeddings = model.encode([text1, text2])
    first_vec = embeddings[0]
    second_vec = embeddings[1]

    # cosine_similarity works on 2-D inputs, so each vector is wrapped in a list.
    similarity_matrix = cosine_similarity([first_vec], [second_vec])
    return similarity_matrix[0][0]


In [5]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Baseline text-generation pipeline: stock facebook/opt-1.3b with sampling
# disabled and max_length=200.
opt_generator = pipeline(
    'text-generation',
    model="facebook/opt-1.3b",
    do_sample=False,
    max_length=200,
    device=device,
)

# Load the pickled dataset object from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('dataset_dict.pkl', 'rb') as f:
    dataset = pickle.load(f)

In [74]:
# Directory holding the fine-tuned checkpoint.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
model_path = '/scratch/aa10350/llm-finetune/llm-finetune-copyright/models/finetune_2.7b'

# Weights come from the fine-tuned directory; the tokenizer is taken from the
# base facebook/opt-2.7b checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained('facebook/opt-2.7b')

# Move the model to the GPU when CUDA is available, otherwise keep it on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)

# Text-generation pipeline around the fine-tuned model (up to 256 new tokens).
opt_finetuned_generator = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=256,
)

In [6]:
# Inspect the loaded dataset object; its repr is shown as the cell output.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 274
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 31
    })
})

In [9]:
# Select the 'train' split of the loaded dataset.
train_dataset = dataset["train"]

# Token-id sequences of the 'train' split; they are iterated and decoded back
# to text one example at a time further down.
train_input_ids = train_dataset["input_ids"]

# Plain-text file that receives the decoded training examples.
output_file_path = "train_text.txt"

In [26]:
# Tokenizer of the base facebook/opt-2.7b checkpoint; used below to decode the
# stored token ids back into text.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b")

In [27]:
# Decode every training example back to text and store it in the output file,
# each example followed by a newline.
with open(output_file_path, "w", encoding="utf-8") as outfile:
    for input_ids in train_input_ids:
        # Special tokens are dropped so only the plain text is kept.
        text = tokenizer.decode(input_ids, skip_special_tokens=True)
        print(text, file=outfile)

print(f"Text has been written to {output_file_path}")

Text has been written to train_text.txt


In [28]:
# Read the decoded training text back into memory. The file was written as
# UTF-8, so it is read with the same encoding rather than the platform default,
# and the context manager closes the handle as soon as the read finishes.
with open('train_text.txt', 'r', encoding='utf-8') as infile:
    data = infile.read()

In [29]:
# Split the full training text into a list of sentences with NLTK's sent_tokenize.
# NOTE(review): presumably requires the NLTK sentence-tokenizer data to be
# downloaded beforehand; no download step is visible in this notebook.
sentences = sent_tokenize(data)

In [82]:
# Memorization probe: sample random passages from the training text, prompt
# each model with the first PROMPT_LENGTH characters of a passage, and score
# how close the model's continuation is to the true continuation using the
# cosine similarity of sentence embeddings.

# Number of consecutive sentences that make up one sampled passage.
BLOCK_SENTENCES = 50

if not sentences:
    raise ValueError('`sentences` is empty; there is nothing to sample from.')

# Load the embedding model once: it does not depend on the sample or on the
# model under test, so rebuilding it inside the loops is wasted work.
bert = SentenceTransformer('all-MiniLM-L6-v2')

# Latest start index that still leaves a full passage (0 for a short corpus),
# so a draw near the end cannot yield a truncated or empty ground truth.
last_start_idx = max(len(sentences) - BLOCK_SENTENCES, 0)

sim_dict = {}
saved_data = []
for i in range(NUM_SAMPLES):
    # NOTE(review): no random seed is set, so the sampled passage differs
    # between runs.
    sample_start_idx = np.random.randint(last_start_idx + 1)

    # Use the next BLOCK_SENTENCES sentences as a testing block.
    block = sentences[sample_start_idx:sample_start_idx + BLOCK_SENTENCES]
    block = ' '.join(block)
    block = block.replace('\n', '')

    # The first PROMPT_LENGTH chars are the prompt; the following CHECK_LENGTH
    # chars are the ground-truth continuation (the same for every model).
    prompt = block[:PROMPT_LENGTH]
    gt_text = block[PROMPT_LENGTH:PROMPT_LENGTH + CHECK_LENGTH]

    for test_model in TEST_MODELS:
        # Important: set temperature to 0.
        #TODO is the temperature set to zero for the other models as well? like OPT1.3 etc?
        if test_model == 'opt-1.3b':
            response = query_opt(prompt, opt_generator, greedy_sampling=True)
        else:
            # Every other model name is served by the fine-tuned generator.
            response = query_opt(prompt, opt_finetuned_generator, greedy_sampling=True)

        # Compare only the first CHECK_LENGTH chars of the response.
        response_text = response[:CHECK_LENGTH]
        sim = text_similarity(bert, response_text, gt_text)
        saved_data.append({'prompt':prompt,
                           'response':response_text,
                           'ground-truth':gt_text,
                           'similarity':str(sim),
                           'source_model': test_model})
        sim_dict.setdefault(test_model, []).append(sim)

# Report the mean similarity per model.
for test in sim_dict:
    print(f'test model: {test}, similarity: {np.mean(sim_dict[test]):.2f}')



test model: opt-1.3b, similarity: 0.11
test model: optimized-opt-1.3b, similarity: 0.34


In [83]:
# Inspect the collected records (one per sample and model) as the cell output.
saved_data

[{'prompt': '‘North- ward to take a straighter road to Isengard, or Fangorn, ',
  'response': '                                                                                                                                                                                   ',
  'ground-truth': 'if that is their aim as you guess? Or southward to strike the Entwash?’ ‘They will not make for the river, whatever mark they aim at,’ said Aragorn. ‘And unless there is much amiss in Rohan and the power of Saruman is greatly increased, they will take the shortest way that they can find over the fields of the Rohirrim. Let us search northwards!’ The dale ran like a stony trough between the ridged hills, and a trickling stream flowed among the boulders at the bottom. A cliff frowned upon their right; to thei',
  'similarity': '0.1089969',
  'source_model': 'opt-1.3b'},
 {'prompt': '‘North- ward to take a straighter road to Isengard, or Fangorn, ',
  'response': ' or wherever we are going.’ ‘But w

In [None]:
# Hand the collected records to dump_to_jsonlines with 'copyright.jsonl' as the
# target. The list built by the evaluation loop is named `saved_data`; the
# previous `saved_dat` was a typo that raised NameError.
dump_to_jsonlines(saved_data, 'copyright.jsonl')