In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json
from datasets import Dataset, DatasetDict
from tqdm.notebook import tqdm
import boto3
from botocore.exceptions import ClientError
import torch

In [None]:
# Run configuration shared by every cell below.
# `language` and `dataset_label` are spliced into model, data and output paths.
language = "waima"
dataset_label = "test"

# Token passed to `from_pretrained`.
# NOTE(review): "xxx" is a placeholder - avoid committing a real token here.
access_token = "xxx"

In [None]:
# Release cached, unused CUDA memory held by PyTorch before the model is loaded in the next cell.
torch.cuda.empty_cache()

In [None]:
# Load the fine-tuned tokenizer and model used for inference.

# 4-bit NF4 quantisation settings. They are only defined here: the
# `quantization_config=bnb_config` argument is not passed to the model load below.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Alternative: load the model with no finetuning
#tokenizer = AutoTokenizer.from_pretrained('/root/merged_adapters', token=access_token)
#model = AutoModelForCausalLM.from_pretrained('/root/merged_adapters', token=access_token, quantization_config=bnb_config, device_map={"":0})

# Directory holding the language-specific trained model and its tokenizer files.
trained_model_dir = '/root/trained_model_'+language

tokenizer = AutoTokenizer.from_pretrained(trained_model_dir, token=access_token)
# Optional extra special tokens (disabled):
#special_tokens = {'additional_special_tokens': ['khm_Khmr', 'cja_Othr']}
#tokenizer.add_special_tokens(special_tokens)

# device_map={"":0} places the whole model on device 0.
# To quantise, also pass quantization_config=bnb_config.
model = AutoModelForCausalLM.from_pretrained(
    trained_model_dir,
    token=access_token,
    device_map={"":0},
)

In [None]:
# Read the evaluation split (one JSON object per line) into a `DatasetDict`.

eval_jsonl_path = '/root/all_llm_data/'+language+'_' + dataset_label + '_data.jsonl'

# Parse every line up front; each line is a complete JSON object.
with open(eval_jsonl_path, 'r', encoding='utf-8') as jsonl_file:
    eval_records = [json.loads(raw_line) for raw_line in jsonl_file]

# Column-oriented view of the two fields we need; missing fields become ''.
read_data = {
    'model_inputs': [record.get('model_inputs', '') for record in eval_records],
    'completion': [record.get('completion', '') for record in eval_records],
}

# Wrap the columns in a `Dataset`, exposed under the single key 'data'.
dataset = Dataset.from_dict(read_data)
dataset_dict = DatasetDict({'data': dataset})

# Show the first entry to check the structure.
print(dataset_dict['data'][0])

In [None]:
# Restrain output to only a certain character set (in this case Latin script)
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, LogitsProcessorList, LogitsProcessor

class LatinScriptLogitsProcessor(LogitsProcessor):
    """Logits processor that restricts generation to a fixed allow-list of token ids.

    The allow-list is the set of ids the tokenizer produces for a string made of the
    Latin letters, digits, space and basic punctuation, plus the tokenizer's EOS id
    (when it has one) so that generation is still able to terminate.

    NOTE(review): the allow-list comes from tokenizing one long string, so it holds
    the pieces the tokenizer splits that string into, which is not necessarily every
    Latin-script token in the vocabulary - confirm this is the intended constraint.
    """

    def __init__(self, tokenizer):
        """Build the allow-list from `tokenizer` (needs `tokenize` and `convert_tokens_to_ids`)."""
        allowed = set(tokenizer.convert_tokens_to_ids(tokenizer.tokenize('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "-:;().,?!')))
        # Without EOS in the allow-list its score is forced to -inf and every
        # sequence would run until the max_new_tokens limit.
        eos_token_id = getattr(tokenizer, "eos_token_id", None)
        if eos_token_id is not None:
            allowed.add(eos_token_id)
        self.allowed_tokens = allowed

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return a copy of `scores` with every non-allowed token set to -inf.

        `scores` is indexed as (..., vocab_size); `input_ids` is unused.
        """
        vocab_size = scores.size(-1)

        # Ignore ids that fall outside the logits' vocabulary (they could never be
        # produced anyway) instead of failing with an IndexError.
        allowed_ids = [tok for tok in self.allowed_tokens if tok is not None and 0 <= tok < vocab_size]

        # Build the mask on the same device as `scores` so no cross-device indexing is needed.
        allowed_mask = torch.zeros(vocab_size, dtype=torch.bool, device=scores.device)
        allowed_mask[torch.tensor(allowed_ids, dtype=torch.long, device=scores.device)] = True

        # Out-of-place fill: the caller's tensor is left untouched.
        return scores.masked_fill(~allowed_mask, -float("inf"))

# Create the logits processor
# It is built from the tokenizer loaded earlier and wrapped in a LogitsProcessorList.
logits_processor = LatinScriptLogitsProcessor(tokenizer)
logits_processor_list = LogitsProcessorList([logits_processor])

In [None]:
# Read the training split (one JSON object per line) into parallel lists.
train_jsonl_path = '/root/all_llm_data/'+language+'_train_data.jsonl'

with open(train_jsonl_path, 'r', encoding='utf-8') as jsonl_file:
    # Each line is a complete JSON object.
    train_records = [json.loads(raw_line) for raw_line in jsonl_file]

# Missing fields default to the empty string.
train = {
    'model_inputs': [record.get('model_inputs', '') for record in train_records],
    'completion': [record.get('completion', '') for record in train_records],
}

In [None]:
#Constrain generation to only use tokens seen in the training set (completions only)

# Step 1: Extract valid tokens from the training data
# Every completion is tokenised; an earlier version stopped after the first
# example (stray `break`), which left the set far too small.
valid_tokens = set()
for example in train['completion']:
    tokens = tokenizer.tokenize(example)
    valid_tokens.update(tokenizer.convert_tokens_to_ids(tokens))

# Keep EOS allowed so that constrained generation can still terminate.
if tokenizer.eos_token_id is not None:
    valid_tokens.add(tokenizer.eos_token_id)

In [None]:
# Step 2: Define a custom logits processor
class ValidTokensLogitsProcessor(LogitsProcessor):
    """Logits processor that only lets through token ids from a caller-supplied collection."""

    def __init__(self, valid_tokens):
        """Store `valid_tokens`, an iterable of integer token ids that may be generated."""
        self.valid_tokens = valid_tokens

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return a copy of `scores` with every token outside `valid_tokens` set to -inf.

        `scores` is indexed as (..., vocab_size); `input_ids` is unused.
        """
        vocab_size = scores.size(-1)

        # Drop ids outside the logits' vocabulary rather than raising an IndexError;
        # such ids could never be produced by the model anyway.
        valid_ids = [tok for tok in self.valid_tokens if tok is not None and 0 <= tok < vocab_size]

        # Build the mask on the same device as `scores` so no cross-device indexing is needed.
        valid_mask = torch.zeros(vocab_size, dtype=torch.bool, device=scores.device)
        valid_mask[torch.tensor(valid_ids, dtype=torch.long, device=scores.device)] = True

        # Out-of-place fill: the caller's tensor is left untouched.
        return scores.masked_fill(~valid_mask, -float("inf"))

# Create the logits processor
# It uses the `valid_tokens` set collected in Step 1 and is wrapped in a LogitsProcessorList.
valid_tokens_logits_processor = ValidTokensLogitsProcessor(valid_tokens)
valid_tokens_logits_processor_list = LogitsProcessorList([valid_tokens_logits_processor])

In [None]:
# diversity_penalty needs to be a float (not int) - e.g., 1.0 rather than 1
def batch_generate(max_new_tokens=200,temperature=0.5,top_p=0.9,top_k=50,repetition_penalty=1,renormalize_logits=False,logits_processor=None,
    num_beams=1,do_sample=True,penalty_alpha=0,no_repeat_ngram_size=0,diversity_penalty=0.0,num_beam_groups=1):
    """Generate text for every `model_inputs` entry of `dataset_dict['data']`, in small batches.

    Reads the notebook-level `dataset_dict`, `tokenizer` and `model`. All keyword
    arguments are forwarded unchanged to `model.generate`.

    Side effects: sets `tokenizer.pad_token` to the EOS token and switches the
    tokenizer to left padding, and prints the start index of each batch.

    Returns:
        list[str]: one decoded string per input, in dataset order. Each string is
        the decoded output of `generate` with special tokens removed.
    """
    # Inputs per forward pass (kept small to avoid running out of memory).
    NUM_INPUTS = 10

    # Materialise the prompts once instead of walking the whole dataset for every batch.
    prompts = [row['model_inputs'] for row in dataset_dict['data']]

    # Padding setup is loop-invariant. Decoder-only models continue from the last
    # position, so batches must be padded on the left for generation.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    generated = []
    for starting_num in range(0, len(prompts), NUM_INPUTS):
        torch.cuda.empty_cache()
        print(starting_num)

        # tokenize the current batch of inputs
        batch_prompts = prompts[starting_num:starting_num + NUM_INPUTS]
        inputs = tokenizer(batch_prompts, padding = True, return_tensors="pt").to("cuda")

        assert len(inputs['input_ids']) <= NUM_INPUTS

        # send the tokenized batch through the model
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            do_sample=do_sample,
            repetition_penalty=repetition_penalty,
            renormalize_logits=renormalize_logits,
            logits_processor=logits_processor,
            num_beams=num_beams,
            penalty_alpha=penalty_alpha,
            no_repeat_ngram_size=no_repeat_ngram_size,
            diversity_penalty=diversity_penalty,
            num_beam_groups=num_beam_groups
        )

        generated += tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return generated

In [None]:
# Install pinned versions of the evaluation dependencies into the kernel's environment.
%pip install sentencepiece==0.1.97 # same as silnlp
%pip install nltk==3.7 # same as silnlp
%pip install sacrebleu==2.3.1 # same as silnlp

In [None]:
import sacrebleu

In [None]:
# Run generation over the evaluation set without sampling.
# NOTE(review): penalty_alpha together with top_k presumably selects contrastive
# search in transformers - confirm against the installed version.
generated = batch_generate(
    max_new_tokens=150,
    repetition_penalty=1.2,
    renormalize_logits=True,
    do_sample=False,
    top_k=4,
    penalty_alpha=0.6,
)
# Alternative call that constrains output to tokens seen in the training completions:
#generated = batch_generate(max_new_tokens=150,temperature=0.5,repetition_penalty=1.2,renormalize_logits=False,logits_processor=valid_tokens_logits_processor_list,\
#    num_beams=1,top_k=4,penalty_alpha=0.6,do_sample=False,no_repeat_ngram_size=4,diversity_penalty=0,num_beam_groups=1)

In [None]:
# Load the detokenised target (reference) and source sentences, one per line.
# Lines keep their trailing newline characters.
detok_path_prefix = "/root/all_llm_data/" + language + "_" + dataset_label

with open(detok_path_prefix + ".trg.detok.txt", "r", encoding="utf-8") as trg_file:
    trg = list(trg_file)

with open(detok_path_prefix + ".src.detok.txt", "r", encoding="utf-8") as src_file:
    src = list(src_file)

In [None]:
# Peek at the first ten decoded generations, before the prompt-stripping cell below runs.
generated[:10]

In [None]:
# automatically remove prompt from results
# NOTE: this cell rewrites `generated`; re-running it without regenerating
# would strip the predictions a second time.
cleaned_predictions = []
for i, prediction in enumerate(generated):
  # Drop everything up to the first ":" plus the character after it.
  # If there is no ":" the text is left alone (find() == -1 used to chop the first character).
  colon_index = prediction.find(":")
  if colon_index != -1:
    prediction = prediction[colon_index+2:]

  # Remove only the trailing newline; a final line without one keeps its last character.
  source_sentence = src[i].rstrip("\n")

  # If the prediction repeats the source sentence, remove it and the separator after it.
  # An empty source line is skipped: it would match every prediction.
  if source_sentence and prediction.startswith(source_sentence):
    prediction = prediction[len(source_sentence)+1:]

  cleaned_predictions.append(prediction)

generated = cleaned_predictions

In [None]:
# Peek at the first ten predictions again, now that the prompt-stripping cell above has run.
generated[:10]

In [None]:
# Number of predictions; presumably this should equal the number of reference lines in `trg` - TODO confirm.
len(generated)

In [None]:
# Corpus-level BLEU of the predictions against a single reference set.
pair_refs = [trg]      # one reference stream, wrapped in a list
pair_sys = generated   # system outputs

bleu_score = sacrebleu.corpus_bleu(
    pair_sys,
    pair_refs,
    tokenize="13a",
    lowercase=True,
)
print(bleu_score.score)

In [None]:
# Spot-check a single reference line (index 7).
trg[7]

In [None]:
# save predictions to a file
generated = [i.replace("\n"," ") for i in generated] #for each prompt, make everything on one line

# Write as UTF-8 explicitly, matching how the input files are read; the platform
# default encoding may not be able to represent the predicted text.
with open('/root/model_' + language + '_' + dataset_label + '.trg-predictions.detok.txt', 'w', encoding='utf-8') as file:
    file.write('\n'.join(generated))

In [None]:
# set up s3 bucket

# fill in access key id and secret access key
# NOTE(review): the "xxx" values are placeholders - avoid committing real credentials in this cell.
s3 = boto3.client('s3',
    aws_access_key_id="xxx",
    aws_secret_access_key="xxx",
)

def upload_file(file_name, bucket, object_name=None):
    """Upload a local file to S3 using the notebook-level `s3` client.

    Args:
        file_name: path of the local file to upload.
        bucket: name of the destination bucket.
        object_name: destination key; defaults to `file_name` when None.

    Returns:
        True if the upload succeeded, False if S3 reported an error (the error is printed).
    """
    if object_name is None:
        object_name = file_name
    try:
        s3.upload_file(file_name, bucket, Key=object_name)
    # The managed transfer behind `upload_file` reports failures as
    # S3UploadFailedError, which is not a ClientError subclass, so catch both.
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        print(e)
        return False
    print("Success!")
    return True

In [None]:
# upload predictions to s3 bucket
# Sends the local predictions file to the "aqua-ml-data" bucket under a per-language key.
upload_file('/root/model_' + language + '_' + dataset_label + '.trg-predictions.detok.txt', "aqua-ml-data", \
      "MT/experiments/Demo_Laura/preprocessed/" + language + '/model_' + dataset_label + '.trg-predictions.detok.txt')

In [None]:
# Tokenizer and adapter files to copy from the local trained-model directory to S3.
files = [
    'tokenizer_config.json',
    'special_tokens_map.json',
    'tokenizer.model',
    'tokenizer.json',
    'README.md',
    'adapter_model.safetensors',
    'adapter_config.json',
]

for file in files:
    print(file)
    # Local path and destination key share the same file name.
    local_path = "/root/trained_model_" + language + "/" + file
    s3_key = "MT/experiments/Demo_Laura/trained_models/trained_model_" + language + "/" + file
    upload_file(local_path, "aqua-ml-data", s3_key)