In [1]:
#install packages
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
#load datasets
from datasets import load_dataset
import pandas as pd
from datasets import Dataset

In [3]:
#get dataset
raw_data = load_dataset('xiyuez/red-dot-design-award-product-description')
#print to see format
print(raw_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['product', 'category', 'description', 'text'],
        num_rows: 21183
    })
})


In [4]:
#convert to dataframe
raw_df = pd.DataFrame(raw_data['train'])
#print few rows
raw_df.head()

Unnamed: 0,product,category,description,text
0,Biamp Rack Products,Digital Audio Processors,"“High recognition value, uniform aesthetics an...",Product Name: Biamp Rack Products;\n\nProduct ...
1,V33,Video Camera,The V33 livestreaming video camera ensures hig...,Product Name: V33;\n\nProduct Category: Video ...
2,HP LaserJet 5000-6000 and E700-E800 Series MFPs,Multi-Function Printers,The HP LaserJet 5000 to 6000 Series and E700 t...,Product Name: HP LaserJet 5000-6000 and E700-E...
3,Meaco Arete One 20L Dehumidifier,Heating and Air Conditioning Technology,The Meaco Arete One Dehumidifier is characteri...,Product Name: Meaco Arete One 20L Dehumidifier...
4,théATRE Glass Container for Loose Leaf Tea,Food Containers,The design and colouring of the théATRE Glass ...,Product Name: théATRE Glass Container for Loos...


In [5]:
#add an instruction column to the dataset
raw_df['instruction'] = 'Create a description for the product: ' + raw_df['product'] + 'belonging to category' + raw_df['category']
#change data
raw_df = raw_df[['instruction', 'description']]
#print data
raw_df.head()

Unnamed: 0,instruction,description
0,Create a description for the product: Biamp Ra...,"“High recognition value, uniform aesthetics an..."
1,Create a description for the product: V33belon...,The V33 livestreaming video camera ensures hig...
2,Create a description for the product: HP Laser...,The HP LaserJet 5000 to 6000 Series and E700 t...
3,Create a description for the product: Meaco Ar...,The Meaco Arete One Dehumidifier is characteri...
4,Create a description for the product: théATRE ...,The design and colouring of the théATRE Glass ...


In [6]:
#convert data to a prompt-response
template = """ Given below is an instruction. You are to follow the instruction.
               Instruction : {}
               Response:
"""
raw_df['prompt'] = raw_df['instruction'].apply(lambda x: template.format(x))
raw_df.rename(columns = {'description' : 'response'}, inplace = True)
#combine prompt and response to text
raw_df['text'] = raw_df['prompt'] + raw_df['response']
raw_df.drop(columns = ['prompt', 'response'], inplace = True)
raw_df.head()

Unnamed: 0,instruction,text
0,Create a description for the product: Biamp Ra...,Given below is an instruction. You are to fol...
1,Create a description for the product: V33belon...,Given below is an instruction. You are to fol...
2,Create a description for the product: HP Laser...,Given below is an instruction. You are to fol...
3,Create a description for the product: Meaco Ar...,Given below is an instruction. You are to fol...
4,Create a description for the product: théATRE ...,Given below is an instruction. You are to fol...


In [7]:
#choose a random sample
raw_df = raw_df.sample(100)
print(len(raw_df))

100


In [8]:
#generate an example to see why we need fine tuning
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
#instantiate model
checkpoint = 'facebook/opt-6.7b'
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    load_in_8bit = True,
    device_map = 'auto'
)
tokeniser = AutoTokenizer.from_pretrained(checkpoint)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
#showcase result on an example
prompt = raw_df.iloc[0]['instruction']
print(prompt)
input_ids = tokeniser(prompt, return_tensors = 'pt').input_ids
output = model.generate(
    input_ids = input_ids,
    max_new_tokens = 45
)
output = tokeniser.decode(output[0])
print(output)

Create a description for the product: Headband Headlampbelonging to categoryLighting Device




</s>Create a description for the product: Headband Headlampbelonging to categoryLighting Device

Create a description for the product: Headband Headlampbelonging to categoryLighting Device

Create a description for the product: Headband Headlampbelonging to categoryLighting Device



In [10]:
#convert to huggingface data
dataset = Dataset.from_pandas(raw_df)
print(dataset)

Dataset({
    features: ['instruction', 'text', '__index_level_0__'],
    num_rows: 100
})


In [11]:
#tokenize text in dataset
dataset = dataset.map(lambda x : tokeniser(x['text']), batched = True)
print(dataset)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 100
})


In [12]:
#add lora weights
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r = 16,
    lora_alpha = 32,
    target_modules = ['q_proj', 'v_proj'],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = 'CAUSAL_LM'
)
model = get_peft_model(model, config)

In [13]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
trainer = Trainer(
    model = model,
    train_dataset = dataset,
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_steps = 1,
        max_steps = 1,
        learning_rate = 2e-4,
        fp16 = True,
        logging_steps = 1,
        output_dir = 'outputs'
    ),
    data_collator = DataCollatorForLanguageModeling(tokeniser, mlm=False)
)

In [14]:
model.config.use_cache = False
trainer.train()



Step,Training Loss
1,2.621


TrainOutput(global_step=1, training_loss=2.620983362197876, metrics={'train_runtime': 2.3263, 'train_samples_per_second': 0.43, 'train_steps_per_second': 0.43, 'total_flos': 6310595543040.0, 'train_loss': 2.620983362197876, 'epoch': 0.01})

In [15]:
#showcase result on the old example
prompt = raw_df.iloc[0]['instruction']
print(prompt)
input_ids = tokeniser(prompt, return_tensors = 'pt').input_ids
output = model.generate(
    input_ids = input_ids,
    max_new_tokens = 45
)
output = tokeniser.decode(output[0])
print(output)

Create a description for the product: Headband Headlampbelonging to categoryLighting Device




</s>Create a description for the product: Headband Headlampbelonging to categoryLighting Device

Description

The headband headlamp is a great accessory for your bike. It is a compact and lightweight headlamp that is easy to use and fits comfortably on your head. The headband headl
