# Building a model for our application



## Import libraries

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, get_cosine_schedule_with_warmup
from datasets import load_dataset
import pathlib
import torch

In [9]:
dataset = load_dataset("csv", data_files="./data/spotify_millsongdata.csv")['train']
dataset = dataset.filter(lambda example: 'abba' in example["artist"].lower())
dataset = dataset.remove_columns(["artist", "song", "link"])
dataset = dataset.shuffle(seed=123)
dataset

Using custom data configuration default-a497b4a7fe246336
Found cached dataset csv (/Users/renaef/.cache/huggingface/datasets/csv/default-a497b4a7fe246336/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/renaef/.cache/huggingface/datasets/csv/default-a497b4a7fe246336/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-60ed97895a9cb5f3.arrow
Loading cached shuffled indices for dataset at /Users/renaef/.cache/huggingface/datasets/csv/default-a497b4a7fe246336/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-14a16ae9c8240c30.arrow


Dataset({
    features: ['text'],
    num_rows: 269
})

In [10]:
dataset_dict = dataset.train_test_split(test_size=0.1)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 242
    })
    test: Dataset({
        features: ['text'],
        num_rows: 27
    })
})

In [11]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2', cache_dir=pathlib.Path('cache').resolve())

def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True, num_proc=1, remove_columns=["text"])
block_size = tokenizer.model_max_length

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
output_dir = "output"
seed = 123
LEARNING_RATE = 1.372e-4
num_train_epochs = 4

training_args = TrainingArguments(
    output_dir,
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=num_train_epochs,
    save_total_limit=10,
    save_strategy='epoch',
    save_steps=1,
    report_to=None,
    seed=seed,
    logging_steps=5,
    do_eval=True,
    eval_steps=1,
    load_best_model_at_end=True
    # disable_tqdm=True
    # load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    # tokenizer=tokenizer,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"]
)

train_dataloader = trainer.get_train_dataloader()
num_train_steps = len(train_dataloader)
trainer.create_optimizer_and_scheduler(num_train_steps)
trainer.lr_scheduler = get_cosine_schedule_with_warmup(
    trainer.optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)

trainer.train()
evaluation = trainer.evaluate()

***** Running training *****
  Num examples = 97
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 52
  Number of trainable parameters = 124439808


  0%|          | 0/52 [00:00<?, ?it/s]

{'loss': 3.4212, 'learning_rate': 9.292589525111794e-05, 'epoch': 0.38}
{'loss': 2.6389, 'learning_rate': 1.725216267546246e-05, 'epoch': 0.77}


***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


  0%|          | 0/2 [00:00<?, ?it/s]

Saving model checkpoint to output/checkpoint-13
Configuration saved in output/checkpoint-13/config.json


{'eval_loss': 2.2880771160125732, 'eval_runtime': 9.4581, 'eval_samples_per_second': 1.057, 'eval_steps_per_second': 0.211, 'epoch': 1.0}


Model weights saved in output/checkpoint-13/pytorch_model.bin


{'loss': 2.5624, 'learning_rate': 7.857716640189785e-06, 'epoch': 1.15}
{'loss': 2.4878, 'learning_rate': 7.686881626551516e-05, 'epoch': 1.54}
{'loss': 2.3779, 'learning_rate': 0.00013520660867542716, 'epoch': 1.92}


***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


  0%|          | 0/2 [00:00<?, ?it/s]

Saving model checkpoint to output/checkpoint-26
Configuration saved in output/checkpoint-26/config.json


{'eval_loss': 2.011009693145752, 'eval_runtime': 9.1364, 'eval_samples_per_second': 1.095, 'eval_steps_per_second': 0.219, 'epoch': 2.0}


Model weights saved in output/checkpoint-26/pytorch_model.bin


{'loss': 2.0717, 'learning_rate': 0.00010756924162575734, 'epoch': 2.31}
{'loss': 1.9569, 'learning_rate': 2.9630758374242683e-05, 'epoch': 2.69}


***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


  0%|          | 0/2 [00:00<?, ?it/s]

Saving model checkpoint to output/checkpoint-39
Configuration saved in output/checkpoint-39/config.json


{'eval_loss': 1.8960673809051514, 'eval_runtime': 9.3112, 'eval_samples_per_second': 1.074, 'eval_steps_per_second': 0.215, 'epoch': 3.0}


Model weights saved in output/checkpoint-39/pytorch_model.bin


{'loss': 1.9482, 'learning_rate': 1.9933913245728396e-06, 'epoch': 3.08}
{'loss': 1.9393, 'learning_rate': 6.033118373448485e-05, 'epoch': 3.46}
{'loss': 1.8375, 'learning_rate': 0.00012934228335981018, 'epoch': 3.85}


***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


  0%|          | 0/2 [00:00<?, ?it/s]

Saving model checkpoint to output/checkpoint-52
Configuration saved in output/checkpoint-52/config.json


{'eval_loss': 1.8812570571899414, 'eval_runtime': 9.1952, 'eval_samples_per_second': 1.088, 'eval_steps_per_second': 0.218, 'epoch': 4.0}


Model weights saved in output/checkpoint-52/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from output/checkpoint-52 (score: 1.8812570571899414).
***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


{'train_runtime': 2272.3501, 'train_samples_per_second': 0.171, 'train_steps_per_second': 0.023, 'train_loss': 2.3043749240728526, 'epoch': 4.0}


  0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
start = "You"
num_sequences =  10
min_length =  100
max_length =   160
temperature = 1
top_p = 0.95
top_k = 50
repetition_penalty =  1.0

encoded_prompt = tokenizer(start, add_special_tokens=False, return_tensors="pt").input_ids
encoded_prompt = encoded_prompt.to(trainer.model.device)

# prediction
output_sequences = trainer.model.generate(
    input_ids=encoded_prompt,
    max_length=max_length,
    min_length=min_length,
    temperature=float(temperature),
    top_p=float(top_p),
    top_k=int(top_k),
    do_sample=True,
    repetition_penalty=repetition_penalty,
    num_return_sequences=num_sequences)

def post_process(output_sequences):
    generated_sequences = []

    max_repeat = 2

    # decode prediction
    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        generated_sequence = generated_sequence.tolist()
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True, skip_special_tokens=True)
        generated_sequences.append(text.strip())
                    
    return generated_sequences

post_process(output_sequences)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['You be careful of him you\'re gonna think you\'re gonna save me, I\'ll burn the sun, and I\'ll fight you, baby boy!"\n  \r\nThat\'s what I think\n  \r\nDangerous things are always like that  \r\nDangerous things are always like that  \r\nDon\'t get fooled by fools  \r\n  \r\nOh, but it ain\'t gonna happen  \r\nWell then I just wanna take my chances, then  \r\nYeah, yeah.  \r\n  \r\nHey hey don\'t touch me  \r\n  \r\nDon\'t touch me girl  \r\nDon\'t touch me girl girl  \r\nDon\'t touch me girl',
 "You was running late for a run, I think she said something funny and so I thought something funny. The feeling I had is gone now, I think. I took her by the hand, smiled in front of her eyes and told her you were right. I'm so proud of you I'm almost crying. I'll never forget the feeling I had. It's so wrong and I want you to know I'm proud of you, I am.  \r\nIn the face of all the power, in the face of death, in a blind alley, it's all a dream.  \r\nWalking through the streets of New York, 

# Sources

- Chapter 7: Deep Learning for Text, Deep Learning in Python by François Chollet
- GPT-2, Accessed at: https://huggingface.co/gpt2
- OpenAPI GPT-2, Accessed at: https://huggingface.co/docs/transformers/main/en/model_doc/gpt2
- Fine-tune a pretrained model, Accessed at: https://huggingface.co/docs/transformers/training#train-in-native-pytorch
- HuggingArtists - Train a model to generate lyrics, Accessed at: https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb
- HuggingTweets - Train a model to generate tweets, Accessed at: https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb
- Spotify Million Song Dataset, Accessed at: https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset
- Datasets, Accessed at: https://huggingface.co/docs/datasets/index