In [None]:
# ! pip install datasets transformers[sentencepiece] sacrebleu peft -q

## Imports

In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import evaluate
from tqdm.auto import tqdm

In [2]:
# Device the model is moved to right after it is loaded; hard-coded to the GPU.
device = "cuda"

In [3]:
# Load the full dataset; later cells read its "Text", "CodeList" and
# "ContainsCode" columns.
data = pd.read_json("../data/datafinal.json")

### Training the Labelizer

In [4]:
# Fit the binarizer on every "CodeList" entry so that `mlb.classes_` holds the
# label vocabulary reused by the tokenizer check and by `compute_metrics`.
# NOTE(review): the entries look like plain strings (a later cell calls
# `.replace` on each of them), in which case every class would be a single
# character — confirm this is intended.
mlb = MultiLabelBinarizer()
mlb.fit(data["CodeList"])

In [26]:
# Keep only the rows whose "ContainsCode" value is a real bool, and work on a
# copy so the original `data` frame is left untouched.
has_bool_flag = data["ContainsCode"].apply(lambda value: isinstance(value, bool))
train_data = data[has_bool_flag].copy()

### removing bad rows from eda notebook

In [27]:
# Index labels of the rows flagged as bad in the EDA notebook; they are dropped
# from the training frame in the next cell.
bad_rows = [54, 150, 162, 600, 712, 1599, 1603, 1609, 1611, 1636, 1640, 1718, 1870, 1876, 1879, 1880, 160, 716, 1716, 442, 436, 718]

In [28]:
# Drop the flagged rows by index label. Re-running this cell on its own raises
# KeyError, because the labels are already gone after the first run.
train_data = train_data.drop(bad_rows)

In [29]:
# model_checkpoint = "Salesforce/codet5-base"
# Salesforce/codet5p-220m
# Salesforce/codegen2-1B
# Salesforce/codet5p-770m

### Adding a BLEU metric alongside the given one to get a better analysis of the results

In [30]:
# Checkpoint that is fine-tuned below, and the Google-BLEU metric that
# `compute_metrics` reports next to the binarizer-based score.
model_checkpoint = "Salesforce/codet5p-220m"
metric = evaluate.load("google_bleu")

In [31]:
# Tokenizer matching the chosen checkpoint; used for preprocessing, batching
# and decoding the predictions in `compute_metrics`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

### Checking that all the required tokens are present in the tokenizer

In [32]:
# Round-trip every binarizer class through the tokenizer and report (once) if
# any of them does not decode back to itself. `any` stops at the first failure.
round_trip_failed = any(
    label != tokenizer.decode(tokenizer(label)["input_ids"], skip_special_tokens=True)
    for label in mlb.classes_
)
if round_trip_failed:
    print("tokenizer not supported")

In [33]:
# Shuffle the rows before the train/validation split. The shuffle is seeded so
# that the split — and therefore every metric reported below — is reproducible
# across kernel restarts.
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
# Replace the literal two-character sequences backslash-n / backslash-t in the
# code strings with real newline / tab characters.
train_data["CodeList"] = train_data["CodeList"].apply(
    lambda code: code.replace('\\n', '\n').replace('\\t', '\t')
)

In [34]:
# The first 1200 shuffled rows are used for training, the rest for validation.
train_data, valid_data = train_data.iloc[:1200], train_data.iloc[1200:]

In [36]:
# Wrap both pandas splits as Hugging Face datasets, keeping the "Text" and
# "CodeList" columns (plus the column that reset_index() adds).
dataset_columns = ["Text", "CodeList"]
train_dataset = Dataset.from_pandas(train_data[dataset_columns].reset_index())
valid_dataset = Dataset.from_pandas(valid_data[dataset_columns].reset_index())

### preprocess function to create a Text - Code pair

In [37]:
def preprocess_function_real(examples):
    """Tokenize a batch into Text -> Code training pairs.

    Args:
        examples: batch mapping with "Text" and "CodeList" entries.

    Returns:
        The tokenizer output for "Text", with the token ids of "CodeList"
        stored under "labels".
    """
    encoded = tokenizer(examples["Text"])
    target_encoding = tokenizer(text_target=examples["CodeList"])
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [38]:
# Tokenize both splits as Text -> Code pairs (training split first).
train_tokenized_datasets, valid_tokenized_datasets = (
    split.map(preprocess_function_real, batched=True)
    for split in (train_dataset, valid_dataset)
)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

### preprocess function to create a code-code pair

In [39]:
def preprocess_function_experiment(examples):
    """Tokenize a batch into Code -> Code pairs (input and target are the same).

    Args:
        examples: batch mapping with a "CodeList" entry.

    Returns:
        The tokenizer output for "CodeList", with the token ids of the same
        "CodeList" values stored under "labels".
    """
    code = examples["CodeList"]
    encoded = tokenizer(code)
    target_encoding = tokenizer(text_target=code)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [41]:
# Tokenize both splits as Code -> Code pairs (training split first).
experiment_train_tokenized_datasets, experiment_valid_tokenized_datasets = (
    split.map(preprocess_function_experiment, batched=True)
    for split in (train_dataset, valid_dataset)
)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/155 [00:00<?, ? examples/s]

In [43]:
# Load the pretrained seq2seq checkpoint and place it on the target device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

In [44]:
# Allow generated sequences of up to 512 tokens; generation is used during
# evaluation because the training arguments set predict_with_generate=True.
model.generation_config.max_length = 512

In [45]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of the model's parameters are trainable.

    Args:
        model: object exposing ``named_parameters()``; each parameter must
            provide ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string with the trainable count, the total count and the
        trainable percentage (two decimals).
    """
    sizes = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    total = sum(size for size, _ in sizes)
    trainable = sum(size for size, is_trainable in sizes if is_trainable)
    return (
        f"trainable model parameters: {trainable}\n"
        f"all model parameters: {total}\n"
        f"percentage of trainable model parameters: {100 * trainable / total:.2f}%"
    )

# The helper returns the summary as a string (despite its name), so print it.
print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 222882048
all model parameters: 222882048
percentage of trainable model parameters: 100.00%


## applying lora config for flan training to get max throughput

In [46]:
# from peft import LoraConfig, get_peft_model, TaskType

# lora_config = LoraConfig(
#     r=768, # Rank
#     lora_alpha=1536,
#     target_modules=["q", "v"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
# )

In [47]:
# model = get_peft_model(model, lora_config)
# print(print_number_of_trainable_model_parameters(model))

In [48]:
# Per-device training batch size; evaluation uses four times this value.
batch_size = 4

In [49]:
# Training configuration for the first (single-epoch) run.
# NOTE(review): no metric_for_best_model is set, so with
# load_best_model_at_end=True the "best" checkpoint is presumably chosen by
# validation loss rather than by mlb_score — confirm this is intended.
args = Seq2SeqTrainingArguments(
    "new_model",  # output directory for checkpoints
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,  # evaluate on generated sequences
    load_best_model_at_end=True,
    eval_steps=300,  # evaluate and save on the same 300-step schedule
    save_steps=300,
    fp16=True,
    report_to="none",
)

In [50]:
# Collator that batches the tokenized examples for the seq2seq model; it is
# shared by both trainers below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Creating the compute-metrics function with the competition eval metric and BLEU score

In [51]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of one-element lists, each holding one stripped
        reference.
    """
    stripped_preds = []
    for prediction in preds:
        stripped_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Score generated sequences with Google-BLEU and the binarizer accuracy.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays handed
            over by the trainer; ``predictions`` may itself be a tuple, in
            which case its first element is used.

    Returns:
        dict with ``"bleu"`` (the ``google_bleu`` value) and ``"mlb_score"``
        (accuracy between the binarized references and predictions), both
        rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded, so swap it for
    # the pad token. Predictions get the same treatment because generated
    # batches can also be padded with -100 when they are gathered.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace; each reference comes back wrapped in a one-element list.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["google_bleu"]}

    # The binarizer was fitted on the raw column, before newlines/tabs were
    # unescaped for training, so re-escape them before transforming.
    escaped_preds = [
        text.replace('\n', '\\n').replace('\t', '\\t') for text in decoded_preds
    ]
    escaped_refs = [
        refs[0].replace('\n', '\\n').replace('\t', '\\t') for refs in decoded_labels
    ]

    pred_matrix = mlb.transform(escaped_preds)
    ref_matrix = mlb.transform(escaped_refs)

    # accuracy_score takes (y_true, y_pred).
    result["mlb_score"] = accuracy_score(ref_matrix, pred_matrix)

    return {name: round(value, 4) for name, value in result.items()}

### First Training where model learns to replicate code

In [None]:
# First run: train and evaluate on the Code -> Code datasets.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=experiment_train_tokenized_datasets,
    eval_dataset=experiment_valid_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [54]:
# Run the first training pass (Code -> Code); the returned TrainOutput is
# displayed as the cell result.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bleu,Mlb Score
300,No log,0.002004,0.9916,1.0


TrainOutput(global_step=300, training_loss=0.1088137690226237, metrics={'train_runtime': 96.4105, 'train_samples_per_second': 12.447, 'train_steps_per_second': 3.112, 'total_flos': 33663953387520.0, 'train_loss': 0.1088137690226237, 'epoch': 1.0})

### Second Training where the model is trained to extract code

In [55]:
# Training configuration for the second (10-epoch) run.
# NOTE(review): this reuses the "new_model" output directory of the first run,
# so checkpoints with the same step number are presumably overwritten — confirm
# that is acceptable. Also, no metric_for_best_model is set, so the "best"
# checkpoint is presumably chosen by validation loss rather than mlb_score.
args = Seq2SeqTrainingArguments(
    "new_model",  # output directory for checkpoints
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,  # evaluate on generated sequences
    load_best_model_at_end=True,
    eval_steps=300,  # evaluate and save on the same 300-step schedule
    save_steps=300,
    fp16=True,
    report_to="none",
)

In [None]:
# Second run: train and evaluate on the Text -> Code datasets, continuing from
# the same `model` object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=valid_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [57]:
# Run the second training pass (Text -> Code); the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Mlb Score
300,No log,0.08291,0.5798,0.9484
600,0.098100,0.094837,0.9061,0.9419
900,0.098100,0.068409,0.856,0.9355
1200,0.035100,0.060057,0.6674,0.9484
1500,0.012400,0.060649,0.9056,0.9226
1800,0.012400,0.061253,0.9298,0.9484
2100,0.008300,0.063732,0.9401,0.9613
2400,0.008300,0.056599,0.9467,0.9548
2700,0.002600,0.060052,0.9467,0.9548
3000,0.000900,0.060681,0.9467,0.9548


TrainOutput(global_step=3000, training_loss=0.026242080877224603, metrics={'train_runtime': 860.9734, 'train_samples_per_second': 13.938, 'train_steps_per_second': 3.484, 'total_flos': 2128888240496640.0, 'train_loss': 0.026242080877224603, 'epoch': 10.0})

In [45]:
# Persist the tokenizer together with the fine-tuned weights so the directory
# can be reloaded on its own with `from_pretrained`. The model is saved last so
# the cell still produces no output.
output_dir = "./t5_experiment_1"
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)