In [2]:
#@title installs
!pip install -q peft transformers datasets

In [3]:
#@title drive connect
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#@title imports

import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from transformers import GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling

from datasets import Dataset

from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# from transformers import GPT2LMHeadModel
from peft import PeftModel, PeftConfig


In [5]:
#@title CSV FILE PATH  variable definition
df = pd.read_csv("/content/drive/MyDrive/MusicProject/data_clean_en_3.csv")

In [6]:
#@title Data reading from csv
lyrics = df["lyrics_clean_with_newline"].to_list()
descriptions = df["description"].to_list()

data = {
    "lyrics": lyrics,
    "description": descriptions,
}

# Create a Hugging Face dataset
dataset = Dataset.from_dict(data)

dataset = dataset.train_test_split(test_size=0.2)

In [7]:
#@title tokenizer init
max_length = 512
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", max_length=max_length, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
#@title data pre processing

def preprocess_function(examples):
  inputs = '<|startoftext|> '+examples['description']+'. Lyrics according to the description :  '+examples['lyrics']+'<|endoftext|>'
  model_inputs = tokenizer(inputs, truncation=True, max_length=512, padding="max_length")
  return model_inputs

tokenized_webnlg_train = dataset["train"].map(preprocess_function, batched=False, remove_columns=dataset["train"].column_names)

tokenized_webnlg_test = dataset["test"].map(preprocess_function, batched=False, remove_columns=dataset["test"].column_names)

# tokenized_webnlg_test = dataset_test.map(preprocess_function, batched=False, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/2046 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

In [9]:
#@title model initialization
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

model.resize_token_embeddings(len(tokenizer))


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50259. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


trainable params: 1,572,864 || all params: 1,317,148,672 || trainable%: 0.11941431012580485


Embedding(50259, 2048)

In [None]:
#@title training
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/MusicProject/notebooks/model_trained_2/",
    evaluation_strategy="epoch",
    num_train_epochs=10.0,
    learning_rate=2e-5,
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    per_gpu_train_batch_size = 1,
    per_gpu_eval_batch_size = 1,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=len(tokenized_webnlg_train),
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_webnlg_train,
    eval_dataset=tokenized_webnlg_test,
    data_collator=data_collator,
)

trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Epoch,Training Loss,Validation Loss
1,2.2222,2.175347
2,2.0416,2.157179
3,1.9659,2.15223
4,1.908,2.154097


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_de

In [None]:
MODEL_PATH = "/content/drive/MyDrive/MusicProject/notebooks/models_trained"


In [None]:
# trainer.save_model(MODEL_PATH)

In [13]:
len(tokenized_webnlg_train)

2046

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
 [2046/2046 36:53, Epoch 1/1]
Epoch	Training Loss	Validation Loss
1	2.295100	2.253794
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
TrainOutput(global_step=2046, training_loss=2.437497602296831, metrics={'train_runtime': 2217.8099, 'train_samples_per_second': 0.923, 'train_steps_per_second': 0.923, 'total_flos': 7605403709865984.0, 'train_loss': 2.437497602296831, 'epoch': 1.0})

## **Inference**

In [None]:
MODEL_PATH = "/content/drive/MyDrive/MusicProject/notebooks/models_trained"

In [None]:
config = PeftConfig.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, MODEL_PATH)

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", max_length=512, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
prompt = 'A song of ice and fire => '

inputs = tokenizer(prompt, return_tensors="pt")

In [None]:
outputs = model.generate(**inputs, max_length=512)

x = tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
print(x[0])

A song of ice and fire => 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of ice and fire are burning bright, 
The flames of

## Debug code:

In [14]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", max_length=512, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
prompt = 'A song of ice and fire => '

inputs = tokenizer(prompt, return_tensors="pt")

In [16]:
outputs = model.generate(**inputs, max_length=512)

x = tokenizer.batch_decode(outputs, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [17]:
print(x[0])

A song of ice and fire => 



In [18]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoForCausalLM(
      (transformer): GPTNeoModel(
        (wte): Embedding(50259, 2048)
        (wpe): Embedding(2048, 2048)
        (drop): Dropout(p=0.0, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPTNeoBlock(
            (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attn): GPTNeoAttention(
              (attention): GPTNeoSelfAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
                (resid_dropout): Dropout(p=0.0, inplace=False)
                (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
                (v_proj): Linear(
                  in_features=2048, out_features=2048, bias=False
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=2048, out_features=