In [None]:
!pip install transformers
!pip install sentencepiece
!pip install -q datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 13.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 56.4 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 39.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K 

In [None]:
import os
import time
import math
import random
import datetime
from pathlib import Path

# Set before importing TensorFlow so the setting is in place when TF starts logging.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # reduce the amount of console output from TF
import tensorflow as tf

from transformers import *
from transformers import logging as hf_logging
from datasets import load_dataset

# The wildcard import above also binds the name `logging` to transformers'
# logging helpers. Import the standard-library module afterwards so that
# `logging` means the stdlib module in the rest of the notebook, and use the
# explicit `hf_logging` alias for the transformers one.
import logging

# Only report errors from transformers.
hf_logging.set_verbosity_error()

print('TF version',tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))) # check GPU available

TF version 2.9.2
Num GPUs Available:  1


In [None]:
def setup_strategy(xla, fp16, no_cuda):
    """Configure XLA / mixed precision and choose a tf.distribute strategy.

    Args:
        xla: when truthy, enable XLA JIT via ``tf.config.optimizer.set_jit``.
        fp16: when truthy, set the global Keras dtype policy to
            ``"mixed_float16"``.
        no_cuda: when truthy, always use a single-device CPU strategy, even if
            GPUs are visible.

    Returns:
        ``OneDeviceStrategy("/cpu:0")`` when ``no_cuda`` is set or no GPU is
        visible, ``OneDeviceStrategy("/gpu:0")`` for exactly one GPU, and
        ``MirroredStrategy()`` for several GPUs.
    """
    print(" Tensorflow: setting up strategy")

    # setup xla
    if xla:
        print(" XLA Enabled")
        tf.config.optimizer.set_jit(True)

    # setup mixed precision training
    if fp16:
        print(" Mixed Precision Training Enabled")
        # The `mixed_precision.experimental` namespace is deprecated (and absent
        # from newer TF/Keras releases); this is the stable equivalent.
        tf.keras.mixed_precision.set_global_policy("mixed_float16")

    # setup distribution strategy
    gpus = tf.config.list_physical_devices("GPU")
    if no_cuda:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    elif len(gpus) == 0:
        print(" One Device Strategy [CPU] Enabled")
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    elif len(gpus) == 1:
        print("1 One Device Strategy [GPU] Enabled")
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    else:
        print(" Mirrored Strategy Enabled")
        # To use only a subset of GPUs, restrict them with CUDA_VISIBLE_DEVICES.
        strategy = tf.distribute.MirroredStrategy()

    return strategy

def n_replicas(strategy):
    """Return the `num_replicas_in_sync` value of the given strategy."""
    replica_count = strategy.num_replicas_in_sync
    return replica_count

# note: 
# huggingface TF-T5 implementation has issues when mixed precision is enabled
# we will disable FP16 for this but can be used for training any other model
# `strategy` is a module-level global: the tf.data builders, Trainer and run()
# in this notebook read it directly instead of taking it as a parameter.
strategy = setup_strategy(xla=True, fp16=False, no_cuda=False)

 Tensorflow: setting up strategy
 XLA Enabled
1 One Device Strategy [GPU] Enabled


In [None]:
def convert_examples_to_features(examples, tokenizer, args):
    """Tokenize one batch of intent/snippet examples into inputs and labels.

    Args:
        examples: batch mapping with list-valued ``'rewritten_intent'`` and
            ``'snippet'`` columns; ``'intent'`` is read only for rows whose
            rewritten intent is ``None``. The batch is not modified.
        tokenizer: callable used as ``tokenizer(list_of_str, max_length=...,
            padding="max_length", truncation=True)``. Its result must expose
            ``.input_ids`` and support item assignment.
        args: object providing ``prefix``, ``max_input_length`` and
            ``max_target_length``.

    Returns:
        The tokenizer output for the prefixed input texts, with an added
        ``"labels"`` entry: the tokenized snippets where every padding token
        id is replaced by -100 so the loss ignores those positions.
    """
    # Fall back to the raw intent when no rewritten intent exists. A new list is
    # built so the caller's batch column is left untouched.
    texts = [
        examples['intent'][i] if rewritten is None else rewritten
        for i, rewritten in enumerate(examples['rewritten_intent'])
    ]
    codes = examples['snippet']

    # encode the inputs, prepending the task prefix to every text
    inputs = [args.prefix + text for text in texts]
    model_inputs = tokenizer(inputs, max_length=args.max_input_length, padding="max_length", truncation=True)

    labels = tokenizer(codes, max_length=args.max_target_length, padding="max_length", truncation=True).input_ids

    # Use the tokenizer's own padding id. 0 (the previously hard-coded value) is
    # kept as a fallback for tokenizers that do not define one.
    pad_token_id = getattr(tokenizer, "pad_token_id", None)
    if pad_token_id is None:
        pad_token_id = 0

    # we need to replace the index of the padding tokens by -100
    # such that they are not taken into account by the CrossEntropyLoss
    model_inputs["labels"] = [
        [-100 if label == pad_token_id else label for label in labels_example]
        for labels_example in labels
    ]
    return model_inputs


def get_train_tfdataset(train_dataset, num_train_examples, args):
    """Wrap the tokenized training split in a distributed tf.data pipeline.

    Args:
        train_dataset: dataset object supporting ``set_format`` and iteration;
            its format is switched to TensorFlow for the three feature columns.
        num_train_examples: shuffle buffer size.
        args: object providing ``seed`` and ``train_batch_size``.

    Returns:
        The result of ``strategy.experimental_distribute_dataset`` applied to a
        repeated, shuffled, batched and prefetched dataset. ``strategy`` is the
        module-level global.
    """
    feature_names = ['input_ids', 'attention_mask', 'labels']
    train_dataset.set_format(type='tensorflow', columns=feature_names)

    # every feature is a variable-length int32 vector
    output_types = {name: tf.int32 for name in feature_names}
    output_shapes = {name: tf.TensorShape([None]) for name in feature_names}
    tf_dataset = tf.data.Dataset.from_generator(lambda: train_dataset, output_types, output_shapes)
    print("tf_dataset", tf_dataset)

    # turn off auto-sharding
    no_shard = tf.data.Options()
    no_shard.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_dataset = tf_dataset.with_options(no_shard)

    # repeat -> shuffle -> batch -> prefetch
    pipeline = tf_dataset.repeat()
    pipeline = pipeline.shuffle(num_train_examples, seed=args.seed)
    pipeline = pipeline.batch(args.train_batch_size)
    pipeline = pipeline.prefetch(tf.data.AUTOTUNE)

    # distribute dataset to devices
    return strategy.experimental_distribute_dataset(pipeline)

def get_validation_tfdataset(eval_dataset, num_validation_examples, args):
    """Wrap the tokenized validation split in a distributed tf.data pipeline.

    Args:
        eval_dataset: dataset object supporting ``set_format`` and iteration;
            its format is switched to TensorFlow for the three feature columns.
        num_validation_examples: accepted for symmetry with the training
            builder; not used in this function.
        args: object providing ``validation_batch_size``.

    Returns:
        The result of ``strategy.experimental_distribute_dataset`` applied to a
        repeated, batched and prefetched dataset (no shuffling). ``strategy``
        is the module-level global.
    """
    feature_names = ['input_ids', 'attention_mask', 'labels']
    eval_dataset.set_format(type='tensorflow', columns=feature_names)

    # every feature is a variable-length int32 vector
    output_types = {name: tf.int32 for name in feature_names}
    output_shapes = {name: tf.TensorShape([None]) for name in feature_names}
    tf_dataset = tf.data.Dataset.from_generator(lambda: eval_dataset, output_types, output_shapes)

    # turn off auto-sharding
    no_shard = tf.data.Options()
    no_shard.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    tf_dataset = tf_dataset.with_options(no_shard)

    # repeat -> batch -> prefetch
    pipeline = tf_dataset.repeat()
    pipeline = pipeline.batch(args.validation_batch_size)
    pipeline = pipeline.prefetch(tf.data.AUTOTUNE)

    # distribute dataset to devices
    return strategy.experimental_distribute_dataset(pipeline)

In [None]:
def fix_all_seeds(seed):
    """Export PYTHONHASHSEED and seed Python's `random` and TensorFlow with `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    # apply the same seed to each random number source in turn
    for seed_fn in (random.seed, tf.random.set_seed):
        seed_fn(seed)
    
def init_logger(log_file=None, log_file_level=logging.NOTSET):
    """Configure the root logger for console (and optionally file) output.

    Args:
        log_file: optional log file path (``str`` or ``pathlib.Path``). When
            given and non-empty, a ``FileHandler`` for it is added.
        log_file_level: level set on the file handler.

    Returns:
        The root logger, set to INFO, carrying exactly one freshly created
        console handler plus the optional file handler. Handlers that were
        attached before the call are removed and closed.
    """
    if isinstance(log_file, Path):
        log_file = str(log_file)
    log_format = logging.Formatter(
        fmt='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S'
    )
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Re-running this (common in a notebook) must not leak the previous log
    # file handle: detach and close whatever was attached before.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_format)
    logger.addHandler(console_handler)

    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_file_level)
        # NOTE(review): the file handler keeps logging's default format (message
        # only); call file_handler.setFormatter(log_format) if the file should
        # carry the same timestamps as the console.
        logger.addHandler(file_handler)
    return logger

class ProgressBar(object):
    """Single-line text progress bar redrawn in place with a carriage return.

    Args:
        n_total: number of steps that correspond to 100%.
        width: width of the bar in characters.
        desc: label printed in front of the step counter.
    """

    def __init__(self, n_total,width=30,desc = 'Training'):
        self.width = width
        self.n_total = n_total
        self.start_time = time.time()
        self.desc = desc

    def __call__(self, step, info=None):
        """Print the bar for the zero-based ``step``.

        Args:
            step: zero-based index of the step that just finished.
            info: optional mapping of metric name to numeric value. Values are
                printed with 4 decimals, except ``"learning_rate"`` which is
                printed with 8.
        """
        # `None` instead of a shared mutable `{}` default
        if info is None:
            info = {}

        now = time.time()
        current = step + 1
        # n_total == 0 would divide by zero; treat it as an already full bar
        recv_per = current / self.n_total if self.n_total else 1
        bar = f'[{self.desc}] {current}/{self.n_total} ['
        if recv_per >= 1:
            recv_per = 1
        prog_width = int(self.width * recv_per)
        if prog_width > 0:
            bar += '=' * (prog_width - 1)
            # arrow head while running, solid bar once finished
            if current< self.n_total:
                bar += ">"
            else:
                bar += '='
        bar += '.' * (self.width - prog_width)
        bar += ']'
        show_bar = f"\r{bar}"
        time_per_unit = (now - self.start_time) / current
        if current < self.n_total:
            # still running: show the estimated remaining time
            eta = time_per_unit * (self.n_total - current)
            if eta > 3600:
                eta_format = ('%d:%02d:%02d' %
                              (eta // 3600, (eta % 3600) // 60, eta % 60))
            elif eta > 60:
                eta_format = '%d:%02d' % (eta // 60, eta % 60)
            else:
                eta_format = '%ds' % eta
            time_info = f' - ETA: {eta_format}'
        else:
            # finished: show the average time per step in a readable unit
            if time_per_unit >= 1:
                time_info = f' {time_per_unit:.1f}s/step'
            elif time_per_unit >= 1e-3:
                time_info = f' {time_per_unit * 1e3:.1f}ms/step'
            else:
                time_info = f' {time_per_unit * 1e6:.1f}us/step'

        show_bar += time_info
        if info:
            show_info = f'{show_bar} ' + \
                        "-".join([f' {key}: {value:.4f} ' if key != "learning_rate" else f' {key}: {value:.8f} ' for key, value in info.items()])
            print(show_info, end='')
        else:
            print(show_bar, end='')

In [None]:
class Trainer:
    """Custom training / evaluation loop run under a tf.distribute strategy.

    Besides its constructor arguments the class relies on module-level names
    defined elsewhere in this notebook: ``strategy``, ``logger``,
    ``ProgressBar``, ``n_replicas`` and transformers' ``create_optimizer``.
    """

    def __init__(
        self, model, args, train_dataset, validation_dataset,
        num_train_examples, num_validation_examples
    ):
        """Store the model, the configuration and both datasets.

        Args:
            model: model called as ``model(input_ids=..., attention_mask=...,
                labels=..., training=...)``; the first element of its output
                is used as the loss.
            args: configuration object (batch sizes, epochs, optimizer
                settings, output directories).
            train_dataset: iterable of training batches (dicts containing
                ``'labels'``).
            validation_dataset: iterable of validation batches.
            num_train_examples: number of training examples, used to derive
                the steps per epoch.
            num_validation_examples: number of validation examples, used to
                derive the number of evaluation steps.
        """
        self.model = model
        self.args = args

        self.train_dataset = train_dataset
        self.num_train_examples = num_train_examples

        self.validation_dataset = validation_dataset
        self.num_validation_examples = num_validation_examples

        self.global_step = 0
        self.eval_loss = tf.keras.metrics.Sum()

    def create_optimizer_and_scheduler(self, num_training_steps):
        """Create ``self.optimizer`` and ``self.lr_scheduler``.

        Uses ``create_optimizer`` with a warmup phase covering
        ``args.warmup_ratio`` of ``num_training_steps`` (rounded up).
        """
        num_warmup_steps = math.ceil(num_training_steps * self.args.warmup_ratio)
        self.optimizer, self.lr_scheduler = create_optimizer(
            init_lr=self.args.learning_rate,
            num_train_steps=num_training_steps,
            num_warmup_steps=num_warmup_steps,
            weight_decay_rate=self.args.weight_decay,
            adam_epsilon=self.args.adam_epsilon
        )

    def evaluation_step(self, features, labels, nb_instances_in_global_batch):
        """Per-replica forward pass; adds the scaled batch loss to ``self.eval_loss``."""
        # forward pass (only the loss is needed, the logits are discarded)
        loss, _ = self.model(input_ids=features['input_ids'], attention_mask=features['attention_mask'], labels=labels, training=False)[:2]
        # loss scaling: divide by the number of non-ignored label tokens
        # NOTE(review): whether the model already averages its loss depends on
        # the installed transformers version - confirm this division is wanted.
        scaled_loss = loss / tf.cast(nb_instances_in_global_batch, dtype=loss.dtype)
        # add current batch loss
        self.eval_loss.update_state(scaled_loss)

    @tf.function
    def distributed_evaluation_steps(self, batch):
        """Run ``evaluation_step`` on one batch through ``strategy.run``."""
        features = {k: v for k, v in batch.items() if 'labels' not in k}
        labels = batch['labels']
        # number of label positions that are not masked out with -100
        nb_instances = tf.reduce_sum(tf.cast(labels != -100, dtype=tf.int32))
        # strategy.run() expects args to be a list or tuple
        inputs = (features, labels, nb_instances)
        # `run` replicates the provided computation and runs with the distributed input
        strategy.run(self.evaluation_step, inputs)

    def evaluate(self):
        """Run one pass over the validation data, showing a progress bar.

        Returns:
            dict whose ``"eval_loss"`` entry is the accumulated scaled loss
            divided by the number of evaluated batches; empty if the
            validation dataset yielded no batch.
        """
        # calculate total validation steps
        steps = math.ceil(self.num_validation_examples / self.args.validation_batch_size)
        # reset eval loss after every epoch
        self.eval_loss.reset_states()
        logs = {}
        pbar = ProgressBar(n_total=steps, desc='Evaluating')
        # iterate over validation dataset
        for step, batch in enumerate(self.validation_dataset):
            # distributed evaluation step
            self.distributed_evaluation_steps(batch)
            logs["eval_loss"] = self.eval_loss.result() / (step + 1)
            pbar(step=step, info=logs)
            # `>=` rather than `==`: the dataset built in this notebook repeats,
            # so the exit must not depend on hitting one exact index.
            if step >= steps - 1:
                break
        print("\n------------- validation result -----------------")
        return logs

    def apply_gradients(self, features, labels, nb_instances_in_global_batch):
        """Per-replica training step: forward pass, gradients, optimizer update."""
        # forward pass (only the loss is needed, the logits are discarded)
        loss, _ = self.model(input_ids=features['input_ids'], attention_mask=features['attention_mask'], labels=labels, training=True)[:2]
        # loss scaling: divide by the number of non-ignored label tokens
        scaled_loss = loss / tf.cast(nb_instances_in_global_batch, dtype=loss.dtype)
        # calculate gradients
        gradients = tf.gradients(scaled_loss, self.model.trainable_variables)
        # variables that did not contribute to the loss yield `None`; use zeros for them
        gradients = [g if g is not None else tf.zeros_like(v) for g, v in zip(gradients, self.model.trainable_variables)]
        # optimize the model
        self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables)))
        # add current batch loss
        self.train_loss.update_state(scaled_loss)

    @tf.function
    def distributed_training_steps(self, batch):
        """Run ``apply_gradients`` on one batch through ``strategy.run``."""
        with strategy.scope():
            features = {k: v for k, v in batch.items() if 'labels' not in k}
            labels = batch['labels']
            # number of label positions that are not masked out with -100
            nb_instances = tf.reduce_sum(tf.cast(labels != -100, dtype=tf.int32))
            # strategy.run() expects args to be a list or tuple
            inputs = (features, labels, nb_instances)
            # `run` replicates the provided computation and runs with the distributed input.
            strategy.run(self.apply_gradients, inputs)

    def train(self):
        """Train for ``args.epochs`` epochs.

        An epoch is ``num_train_examples // args.train_batch_size`` optimizer
        iterations. At the end of every epoch the validation loop is run and a
        checkpoint is written to ``<args.output_dir>/<args.checkpoint_dir>``
        (only the most recent one is kept).
        """
        # calculate total training steps; uses the Trainer's own configuration
        # rather than a module-level `args`
        num_updates_per_epoch = self.num_train_examples // self.args.train_batch_size
        self.steps_per_epoch = num_updates_per_epoch
        t_total = self.steps_per_epoch * self.args.epochs

        with strategy.scope():
            # optimizer, and checkpoint must be created under `strategy.scope`
            # create optimizer and scheduler
            self.create_optimizer_and_scheduler(num_training_steps=t_total)

            # create checkpoint manager
            folder = os.path.join(self.args.output_dir, self.args.checkpoint_dir)
            ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model)
            self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, folder, max_to_keep=1)
            iterations = self.optimizer.iterations

            logger.info("***** Running training *****")
            logger.info(f"  Num examples = {self.num_train_examples}")
            logger.info(f"  Num Epochs = {self.args.epochs}")
            logger.info(f"  Total train batch size (w. parallel & distributed) = {self.args.train_batch_size * n_replicas(strategy)}")
            logger.info(f"  Steps per epoch = {self.steps_per_epoch}")
            logger.info(f"  Total optimization steps = {t_total}")

            self.train_loss = tf.keras.metrics.Sum(name="training_loss")
            start_time = datetime.datetime.now()
            for epoch_iter in range(self.args.epochs):
                # training loop
                logger.info(f"Epoch {epoch_iter + 1}/{self.args.epochs}")

                pbar = ProgressBar(n_total=self.steps_per_epoch, desc='Training')
                # iterate over training dataset
                for step, batch in enumerate(self.train_dataset):
                    # distributed training step
                    self.distributed_training_steps(batch)

                    # the optimizer's iteration counter is the global step
                    self.global_step = iterations.numpy()
                    training_loss = self.train_loss.result() / (step + 1)

                    logs = {}
                    logs["training_loss"] = training_loss.numpy()
                    logs["learning_rate"] = self.lr_scheduler(self.global_step).numpy()
                    pbar(step=step, info=logs)

                    # end of epoch: evaluate, checkpoint, then leave the batch loop
                    if self.global_step % self.steps_per_epoch == 0:
                        print("\n------------- train result -----------------")
                        # call to evaluation loop
                        self.evaluate()
                        # save checkpoint
                        ckpt_save_path = self.model.ckpt_manager.save()
                        logger.info(f"Saving checkpoint at {ckpt_save_path}")
                        break

                # reset train loss after every epoch
                self.train_loss.reset_states()
            end_time = datetime.datetime.now()
            logger.info(f"Training took: {str(end_time - start_time)}")

In [None]:
def run(args):
    """Fine-tune the model on ``./conala-train.json`` and save the result.

    Loads the training JSON, holds out 10% of it (unshuffled) for validation,
    tokenizes both parts, trains with ``Trainer`` and finally writes the model
    and tokenizer to ``args.save_dir``.

    Relies on the module-level ``logger`` and ``strategy``. Returns nothing.

    Args:
        args: configuration object (see ``Args``).
    """
    logger.info(" Starting training / evaluation")

    dataset_path = "./conala-train.json"
    print("cache dir", args.cache_dir)
    print("dataset path ",dataset_path)

    logger.info(" Loading Data Files")
    dataset = load_dataset('json', data_files=dataset_path)
    print("dataset - ",dataset)

    # train / validation split: 10% held out, no shuffling
    dataset = dataset['train'].train_test_split(0.1, shuffle=False)
    print("dataset before converting to features", dataset)

    # The tokenizer is created once and reused for the features and the final save.
    logger.info(" Initializing Tokenizer")
    tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)

    logger.info(" Preparing Features")
    dataset = dataset.map(convert_examples_to_features, batched=True, fn_kwargs={"tokenizer":tokenizer, "args":args})
    print("dataset after converting to features", dataset)

    logger.info(" Intializing training and validation dataset ")
    train_dataset = dataset['train']
    num_train_examples = len(train_dataset)
    print("dataset for training ", train_dataset)
    # create tf train dataset
    print("creating tf train dataset")
    print(num_train_examples)
    tf_train_dataset = get_train_tfdataset(train_dataset, num_train_examples, args)

    # create tf validation dataset
    validation_dataset = dataset['test']
    num_validation_examples = len(validation_dataset)
    tf_validation_dataset = get_validation_tfdataset(validation_dataset, num_validation_examples, args)

    logger.info(f' Intializing model | {args.model_type.upper()} ')
    with strategy.scope():
        # model must be created under `strategy.scope`
        model = TFT5ForConditionalGeneration.from_pretrained(args.model_name_or_path, from_pt=True)

    # custom training loop
    trainer = Trainer(model, args, tf_train_dataset, tf_validation_dataset, num_train_examples, num_validation_examples)
    trainer.train()

    # save pretrained model and tokenizer
    logger.info(f" Saving model in {args.save_dir}")
    trainer.model.save_pretrained(args.save_dir)
    tokenizer.save_pretrained(args.save_dir)

In [None]:
# Standalone load of the training JSON. run() loads this same path on its own
# and does not read this module-level variable.
dataset = load_dataset('json', data_files="./conala-train.json") 



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-dbff8756dcc223b9/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-dbff8756dcc223b9/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
class Args:
    """Static configuration for fine-tuning and inference.

    All settings are class attributes. Defining the class also creates the
    output, logging and save directories if they do not exist yet.
    """

    # --- model ---
    model_type = 't5'
    tokenizer_name = 'Salesforce/codet5-base'
    model_name_or_path = 'Salesforce/codet5-base'

    # --- data ---
    train_batch_size = 8
    validation_batch_size = 8
    max_input_length = 48      # tokenizer max_length for the prefixed intent
    max_target_length = 128    # tokenizer max_length for the code snippet
    prefix = "Generate Python: "

    # --- optimizer ---
    learning_rate = 3e-4
    weight_decay = 1e-4
    warmup_ratio = 0.2         # fraction of all training steps used for warmup
    adam_epsilon = 1e-8

    # --- training ---
    seed = 2022
    epochs = 20

    # --- directories ---
    output_dir = "runs/"
    logging_dir = output_dir + "/logs/"
    checkpoint_dir = "checkpoint"
    save_dir = output_dir + "/saved_model/"
    cache_dir = output_dir + "/working/"

    # Create the directories that are written to (cache_dir is not created here).
    for _needed_dir in (output_dir, logging_dir, save_dir):
        Path(_needed_dir).mkdir(parents=True, exist_ok=True)
    del _needed_dir  # keep the loop variable out of the class namespace
    

# initialize training arguments
args = Args()
# initialize logger (one timestamped log file per run inside args.logging_dir)
logger = init_logger(log_file=os.path.join(args.logging_dir, f"{args.model_type}-{time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())}.log"))
# fix all seeds
fix_all_seeds(args.seed)

if __name__ == "__main__":
    # run training and evaluation
    # run() returns nothing, so its result is not bound to a name; binding it
    # to `dataset` would replace the loaded dataset with None.
    run(args)

11/28/2022 08:59:52 - INFO - root -    Starting training / evaluation
11/28/2022 08:59:52 - INFO - root -    Downloading Data Files


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-e210ff1f00908525/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-e210ff1f00908525/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id'],
        num_rows: 500
    })
})


Downloading:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

model_inputs  {'input_ids': [[1, 4625, 6600, 30, 1366, 279, 4277, 1375, 10420, 18, 18513, 27984, 21, 68, 358, 326, 783, 1207, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 4625, 6600, 30, 2495, 279, 3827, 533, 296, 24, 69, 24, 70, 24, 71, 11, 358, 6380, 17, 28, 18, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 4625, 6600, 30, 866, 309, 777, 2186, 316, 666, 1375, 4811, 682, 68, 854, 12529, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 4625, 6600, 30, 740, 1300, 434, 7292, 3086, 2064, 1375, 15774, 9191, 1375, 28288, 471, 1375, 58, 627, 31732, 68, 358, 506, 1375, 3462, 68, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 4625, 6600, 30, 9017, 358, 1765, 279, 533, 628, 5181, 17, 2138, 10593, 358, 6380, 17, 28, 35, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 4625, 6600, 30, 336, 1

11/28/2022 09:00:07 - INFO - root -    Loading Data Files


test dataset after tokenizing  DatasetDict({
    train: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})
cache dir runs//working/
dataset path  ./conala-train.json




  0%|          | 0/1 [00:00<?, ?it/s]

11/28/2022 09:00:08 - INFO - root -    Initializing Tokenizer


dataset -  DatasetDict({
    train: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id'],
        num_rows: 2379
    })
})
dataset before converting to features DatasetDict({
    train: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id'],
        num_rows: 2141
    })
    test: Dataset({
        features: ['intent', 'rewritten_intent', 'snippet', 'question_id'],
        num_rows: 238
    })
})


11/28/2022 09:00:09 - INFO - root -    Preparing Features


  0%|          | 0/3 [00:00<?, ?ba/s]

codes ['sum(d * 10 ** i for i, d in enumerate(x[::-1]))', "r = int(''.join(map(str, x)))", "datetime.strptime('2010-11-13 10:33:54.227806', '%Y-%m-%d %H:%M:%S.%f')", '[(i, sum(j) / len(j)) for i, j in list(d.items())]', 'zip([1, 2], [3, 4])', "['hello{0}'.format(i) for i in a]", "re.sub('(?<!\\\\S)((\\\\S+)(?:\\\\s+\\\\2))(?:\\\\s+\\\\2)+(?!\\\\S)', '\\\\1', s)", 'df.div(df.sum(axis=1), axis=0)', 'map(lambda t: (t[1], t[0]), mylist)', '[(t[1], t[0]) for t in mylist]', 'driver.find_element_by_xpath("//p[@id, \'one\']/following-sibling::p")', 're.findall(\'\\\\[[^\\\\]]*\\\\]|\\\\([^\\\\)]*\\\\)|"[^"]*"|\\\\S+\', strs)', 'print(list(itertools.combinations({1, 2, 3, 4}, 3)))', "df[['hour', 'weekday', 'weeknum']] = df.apply(lambdafunc, axis=1)", "soup.find_all('a', string='Elsie')", "my_datetime.strftime('%B %d, %Y')", "int(''.join(c for c in s if c.isdigit()))", "dic['Test'].update({'class': {'section': 5}})", "dict(map(int, x.split(':')) for x in s.split(','))", 'driver.find_element_by_x

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

11/28/2022 09:00:13 - INFO - root -    Intializing training and validation dataset 
11/28/2022 09:00:13 - INFO - root -    Intializing model | T5 


labels  [[1, 2180, 3292, 5296, 29489, 701, 63, 21, 30, 17, 21, 8009, 701, 18, 4939, 12, 2187, 16063, 9010, 12, 22353, 18, 6485, 2934, 689, 388, 12, 5659, 13, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 2180, 3292, 5296, 29489, 9010, 12, 22353, 18, 6485, 13, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 84, 18, 7048, 1435, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

11/28/2022 09:01:14 - INFO - root -   ***** Running training *****
11/28/2022 09:01:14 - INFO - root -     Num examples = 2141
11/28/2022 09:01:14 - INFO - root -     Num Epochs = 20
11/28/2022 09:01:14 - INFO - root -     Total train batch size (w. parallel & distributed) = 8
11/28/2022 09:01:14 - INFO - root -     Steps per epoch = 267
11/28/2022 09:01:14 - INFO - root -     Total optimization steps = 5340
11/28/2022 09:01:14 - INFO - root -   Epoch 1/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:04:18 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-1
11/28/2022 09:04:18 - INFO - root -   Epoch 2/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:06:12 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-2
11/28/2022 09:06:40 - INFO - root -   Epoch 3/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:08:34 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-3
11/28/2022 09:08:34 - INFO - root -   Epoch 4/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:10:29 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-4
11/28/2022 09:10:56 - INFO - root -   Epoch 5/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:12:51 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-5
11/28/2022 09:12:51 - INFO - root -   Epoch 6/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:14:45 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-6
11/28/2022 09:15:13 - INFO - root -   Epoch 7/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:17:07 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-7
11/28/2022 09:17:07 - INFO - root -   Epoch 8/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:19:02 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-8
11/28/2022 09:19:29 - INFO - root -   Epoch 9/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:21:24 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-9
11/28/2022 09:21:51 - INFO - root -   Epoch 10/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:23:46 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-10
11/28/2022 09:24:13 - INFO - root -   Epoch 11/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:26:08 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-11
11/28/2022 09:26:08 - INFO - root -   Epoch 12/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:28:03 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-12
11/28/2022 09:28:30 - INFO - root -   Epoch 13/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:30:25 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-13
11/28/2022 09:30:25 - INFO - root -   Epoch 14/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:32:20 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-14
11/28/2022 09:32:47 - INFO - root -   Epoch 15/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:34:42 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-15
11/28/2022 09:34:42 - INFO - root -   Epoch 16/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:36:36 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-16
11/28/2022 09:36:36 - INFO - root -   Epoch 17/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:38:31 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-17
11/28/2022 09:38:31 - INFO - root -   Epoch 18/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:40:25 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-18
11/28/2022 09:40:53 - INFO - root -   Epoch 19/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:42:48 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-19
11/28/2022 09:43:15 - INFO - root -   Epoch 20/20


------------- train result -----------------
------------- validation result -----------------


11/28/2022 09:45:09 - INFO - root -   Saving checkpoint at runs/checkpoint/ckpt-20
11/28/2022 09:45:09 - INFO - root -   Training took: 0:43:54.860200
11/28/2022 09:45:09 - INFO - root -    Saving model in runs//saved_model/


In [None]:
# (model, tokenizer) pairs keyed by checkpoint directory. Loading the weights
# from disk is by far the most expensive step, and run_predict is called once
# per example when scoring a dataset, so the pair is loaded only once.
# Call `_PREDICT_CACHE.clear()` (or re-run this cell) after saving a new model
# into a directory that has already been used for prediction.
_PREDICT_CACHE = {}


def run_predict(args, text):
    """Generate a code snippet for one natural-language query.

    Args:
        args: configuration object. The attributes read here are ``save_dir``
            (directory holding the fine-tuned model and tokenizer), ``prefix``
            (task prefix prepended to the query), ``max_input_length`` and
            ``max_target_length``.
        text: the query to translate into code.

    Returns:
        The first generated sequence, decoded to a string with special tokens
        stripped.
    """
    cache_key = str(args.save_dir)
    if cache_key not in _PREDICT_CACHE:
        _PREDICT_CACHE[cache_key] = (
            TFT5ForConditionalGeneration.from_pretrained(args.save_dir),
            RobertaTokenizer.from_pretrained(args.save_dir),
        )
    model, tokenizer = _PREDICT_CACHE[cache_key]

    # Build the model input: task prefix followed by the query, padded or
    # truncated to a fixed length.
    query = args.prefix + text
    encoded_text = tokenizer(
        query,
        return_tensors='tf',
        padding='max_length',
        truncation=True,
        max_length=args.max_input_length,
    )

    # NOTE(review): top_p / top_k are sampling parameters; without
    # do_sample=True generate() presumably decodes greedily and ignores them.
    # Confirm against the installed transformers version before relying on them.
    generated_code = model.generate(
        encoded_text["input_ids"],
        attention_mask=encoded_text["attention_mask"],
        max_length=args.max_target_length,
        top_p=0.95,
        top_k=50,
        repetition_penalty=2.0,
        num_return_sequences=1,
    )

    # Only one sequence is requested, so decode row 0 of the output batch.
    decoded_code = tokenizer.decode(generated_code.numpy()[0], skip_special_tokens=True)
    return decoded_code

def predict_from_dataset(args, test_dataset_path="./conala-test.json", verbose=True):
    """Run the fine-tuned model over every example of a JSON test file.

    Each example must provide the keys ``rewritten_intent``, ``intent`` and
    ``snippet``. The rewritten intent is used as the query; when it is
    ``None`` the raw ``intent`` is used instead.

    Args:
        args: configuration object forwarded unchanged to ``run_predict``.
        test_dataset_path: path of the JSON file to evaluate on.
        verbose: when true, print the dataset summary and, for every example,
            the query, the reference snippet and the generated snippet.

    Returns:
        A ``(ground_truth, generated_snippet)`` pair of equally long lists:
        the reference snippets and the model outputs, in dataset order.
    """
    dataset = load_dataset('json', data_files=test_dataset_path)
    # The loaded file is read from the 'train' split of the returned dataset.
    test_dataset = dataset['train']
    if verbose:
        print(test_dataset)

    ground_truth = []
    generated_snippet = []
    banner = "#" * 25
    for example in test_dataset:
        text = example['rewritten_intent']
        if text is None:
            # Not every example has a rewritten intent; fall back to the raw one.
            text = example['intent']
        code = example['snippet']

        decoded_code = run_predict(args, text)

        if verbose:
            print(banner)
            print("QUERY: ", text)
            print()
            print(banner)
            print("ORIGINAL: ")
            print("\n", code)
            print()
            print(banner)
            print("GENERATED: ")
            print("\n", decoded_code)

        ground_truth.append(code)
        generated_snippet.append(decoded_code)
    return ground_truth, generated_snippet
def predict_from_text(args, text):
    """Generate code for a single query and print the query and the result.

    Args:
        args: configuration object forwarded unchanged to ``run_predict``.
        text: the natural-language query.

    Nothing is returned; the query and the generated snippet are only printed.
    """
    prediction = run_predict(args, text)
    banner = "#" * 25

    print(banner)
    print("QUERY: ", text)
    print()
    print(banner)
    print("GENERATED: ")
    print("\n", prediction)

In [None]:
# Generate a snippet for every test example; keep the reference snippets and
# the generated ones side by side so they can be scored afterwards.
ground_truth, generated_snippet = predict_from_dataset(args)



  0%|          | 0/1 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

#########################
ORIGINAL: 

 [''.join(str(d) for d in x) for x in L]

#########################
GENERATED: 

 sum(d * 10 ** i for d in L)
#########################
QUERY:  convert a list of lists `L` to list of integers

#########################
ORIGINAL: 

 L = [int(''.join([str(y) for y in x])) for x in L]

#########################
GENERATED: 

 [int(x) for x in L]
#########################
QUERY:  write the elements of list `lines` concatenated by special character '\n' to file `myfile`

#########################
ORIGINAL: 

 myfile.write('\n'.join(lines))

#########################
GENERATED: 

 myfile.write('\n'.join(lines))
#########################
QUERY:  removing an element from a list based on a predicate 'X' or 'N'

#########################
ORIGINAL: 

 [x for x in ['AAT', 'XAC', 'ANT', 'TTA'] if 'X' not in x and 'N' not in x]

#########################
GENERATED: 

 [x for x in my_list if not X.e

In [None]:
import re
from nltk.translate.bleu_score import sentence_bleu
import warnings
warnings.filterwarnings("ignore")

def tokenize_for_bleu_eval(code):
    """Split a code snippet into tokens for BLEU scoring.

    This is the tokenizer used for code submissions, from Wang Ling et al.,
    Latent Predictor Networks for Code Generation (2016).

    Steps, in order: put spaces around every character that is not a letter,
    digit or underscore; split a lowercase letter from an uppercase letter
    that directly follows it; collapse whitespace runs; map both quote
    characters to a backtick; split on spaces and drop empty pieces.

    @param code: string containing a code snippet
    @return: list of code tokens
    """
    rewrites = (
        (r'([^A-Za-z0-9_])', r' \1 '),
        (r'([a-z])([A-Z])', r'\1 \2'),
        (r'\s+', ' '),
    )
    for pattern, replacement in rewrites:
        code = re.sub(pattern, replacement, code)

    for quote in ('"', '\''):
        code = code.replace(quote, '`')

    return list(filter(None, code.split(' ')))

def evaluate_bleu(reference_list, hypothesis_list):
    """Score hypotheses against references using BLEU.

    Each reference/hypothesis pair is tokenized with
    ``tokenize_for_bleu_eval`` and scored with ``compute_bleu``; the
    per-sample scores are averaged and multiplied by 100.

    @param reference_list:  list of ground truth samples
    @param hypothesis_list: list of predictions that a model generates,
                            aligned index-by-index with ``reference_list``
    @return: average bleu_score of all the data samples, scaled by 100;
             0.0 when there are no samples
    @raise ValueError: if the two lists differ in length
    """
    number_of_samples = len(reference_list)
    if number_of_samples != len(hypothesis_list):
        # A mismatch means the pairs are misaligned; scoring them anyway would
        # either fail part-way through or silently ignore some predictions.
        raise ValueError(
            "reference_list and hypothesis_list must have the same length "
            "(got %d and %d)" % (number_of_samples, len(hypothesis_list))
        )
    if number_of_samples == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    bleu_score = 0
    for reference, hypothesis in zip(reference_list, hypothesis_list):
        reference_tokens = tokenize_for_bleu_eval(reference)
        hypothesis_tokens = tokenize_for_bleu_eval(hypothesis)
        # compute_bleu takes a list of references per hypothesis; there is
        # exactly one reference per sample here.
        bleu_score += compute_bleu([reference_tokens], hypothesis_tokens)
    return (bleu_score / number_of_samples) * 100

def compute_bleu(reference, candidate):
    """Return the sentence-level BLEU of one tokenized candidate.

    @param reference: list of tokenized references (each a list of tokens)
    @param candidate: tokenized hypothesis (list of tokens)
    @return: the value returned by ``sentence_bleu`` for these inputs
    """
    # All weight on 1-grams: higher-order n-gram precisions are not counted.
    unigram_only = (1.0, 0.0, 0.0, 0.0)
    score = sentence_bleu(reference, candidate, weights=unigram_only)
    return score

In [None]:
# Average unigram BLEU of the generated snippets against the references, scaled by 100.
evaluate_bleu(ground_truth, generated_snippet)

49.059505991204496

References:

https://www.kaggle.com/code/rhtsingh/text-to-code-generation-with-tensorflow-mbpp 

https://huggingface.co/Salesforce/codet5-base

