In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline, logging, T5ForConditionalGeneration, T5Tokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
import torch

In [2]:
MODEL_NAME = "google/flan-t5-base"

quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=False)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, device_map="auto", quantization_config=quant_config)

In [3]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

In [4]:
model = get_peft_model(model, peft_params)
model.print_trainable_parameters()

trainable params: 7,077,888 || all params: 254,655,744 || trainable%: 2.779394601050114


In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

llama_3_data = pd.read_parquet("data/data_cleaned.parquet", engine="pyarrow")
train_df, test_df = train_test_split(llama_3_data, test_size=0.2, random_state=42)
train_df

Unnamed: 0,filename,notebook_data,line_count,char_count,question,answer
162,1000910.ipynb,# This Python 3 environment comes with many he...,14,698,Summarize the following code in two to three s...,"The code imports various libraries, including ..."
1001,1006787.ipynb,# This R environment comes with all of CRAN pr...,18,923,Summarize the following code in two to three s...,The code intends to load necessary packages fo...
1718,1011203.ipynb,# This R environment comes with all of CRAN pr...,18,1227,Summarize the following code in two to three s...,The code intends to load and analyze a dataset...
1003,1006791.ipynb,# This R environment comes with all of CRAN pr...,58,2139,Summarize the following code in two to three s...,The code intends to explore and preprocess a c...
1233,1008495.ipynb,from sklearn.ensemble import RandomForestRegre...,73,2750,Summarize the following code in two to three s...,The code is building a random forest regressio...
...,...,...,...,...,...,...
1095,1007446.ipynb,- [IBU][1]: The International Bittering Units...,61,3410,Summarize the following code in two to three s...,The code aims to analyze the relationship betw...
1130,1007799.ipynb,# This Python 3 environment comes with many he...,27,1091,Summarize the following code in two to three s...,The code intends to train a Support Vector Mac...
1294,1008781.ipynb,import pandas as pd\nimport seaborn as sns\nim...,25,1310,Summarize the following code in two to three s...,The code intends to analyze and visualize glob...
860,1005684.ipynb,## This notebook demos Python data visualizati...,91,5436,Summarize the following code in two to three s...,The code intends to demonstrate various data v...


In [6]:
from datasets import Dataset, DatasetDict
train_dataset = Dataset.from_dict(train_df)
test_dataset = Dataset.from_dict(test_df)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['filename', 'notebook_data', 'line_count', 'char_count', 'question', 'answer'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['filename', 'notebook_data', 'line_count', 'char_count', 'question', 'answer'],
        num_rows: 700
    })
})

In [7]:
def tokenize_function(example):
    example['input_ids'] = tokenizer(example['question'], padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["answer"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['filename', 'notebook_data', 'line_count', 'char_count', 'question', 'answer', 'input_ids', 'labels'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['filename', 'notebook_data', 'line_count', 'char_count', 'question', 'answer', 'input_ids', 'labels'],
        num_rows: 700
    })
})

In [9]:
print(tokenized_dataset['train']['labels'][0])

[37, 1081, 4830, 7, 796, 12256, 6, 379, 1174, 51, 345, 63, 11, 28248, 7, 6, 21, 331, 3026, 11, 1693, 5, 94, 92, 4830, 7, 8, 1510, 7, 9434, 291, 29, 3595, 6, 15495, 24, 8, 1081, 8286, 7, 12, 1912, 1437, 1036, 4145, 5, 37, 1081, 3475, 12, 36, 356, 95, 12, 2174, 331, 2073, 16, 8, 96, 5, 5, 87, 77, 2562, 87, 121, 8174, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
tokenized_dataset = tokenized_dataset.remove_columns(dataset["train"].column_names)

In [11]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 700
    })
})

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    # disable_tqdm=False,
    # evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3
)

def formatting_func(example):
    text = f"### Question: {example['question']}\n ### Answer: {example['answer']}"
    # text = f"Question: {example['question']}\nAnswer: {example['answer']}"
    return text

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset['train'],
    peft_config=peft_params,
    # dataset_text_field="text",
    formatting_func=formatting_func,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
    # packing=True,
)

In [13]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=525, training_loss=0.35078078497023807, metrics={'train_runtime': 3933.7726, 'train_samples_per_second': 2.135, 'train_steps_per_second': 0.133, 'total_flos': 5934605244825600.0, 'train_loss': 0.35078078497023807, 'epoch': 3.0})

In [14]:
new_model = "flan-t5-base-fine-tuned"

trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('flan-t5-base-fine-tuned\\tokenizer_config.json',
 'flan-t5-base-fine-tuned\\special_tokens_map.json',
 'flan-t5-base-fine-tuned\\spiece.model',
 'flan-t5-base-fine-tuned\\added_tokens.json',
 'flan-t5-base-fine-tuned\\tokenizer.json')