# Import Required Libraries

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

from getpass import getpass
os.environ["HF_TOKEN"] = getpass("Enter the Huggingface Token: ")

In [2]:
from datasets import load_dataset
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM)
from transformers import (Trainer, 
                         TrainingArguments)

from IPython.display import Markdown, display

In [19]:
import torch
if torch.cuda.is_available():
    print(f"GPU Name : {torch.cuda.get_device_name(0)}")
    print(f"GPU Version : {torch.version.cuda}")


device = "cuda" if torch.cuda.is_available() else "cpu"

GPU Name : Tesla T4
GPU Version : 12.6


# Load the huggingface dataset

In [3]:
dataset = load_dataset(
    path = "lamini/taylor_swift", 
)

# We are dropping these columns and we recreate them as per our use case
dataset = dataset.remove_columns(["input_ids", "attention_mask", "labels"])
dataset

README.md:   0%|          | 0.00/573 [00:00<?, ?B/s]

data/train-00000-of-00001-54dd04266a81db(…):   0%|          | 0.00/257k [00:00<?, ?B/s]

data/test-00000-of-00001-185d72ed4b72e46(…):   0%|          | 0.00/46.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/783 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/87 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 783
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 87
    })
})

In [4]:
def questionAnswerPairs(example):
    """
        We will be using prompt template for our modelling

    """

    prompt_template = f"""
        Question : 
                {example["question"]}

        Answer : 
                {example["answer"]}
    
    """

    example["prompt"] = prompt_template
 
    return example

# Generate the prompt 
dataset= dataset.map(questionAnswerPairs)

Map:   0%|          | 0/783 [00:00<?, ? examples/s]

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

In [5]:
display(Markdown(dataset["train"][0]["prompt"]))


        Question : 
                What is the controversy surrounding Taylor Swift's music and how has it impacted her career?

        Answer : 
                Taylor Swift has been involved in several controversies throughout her career, including her feud with Kanye West and Kim Kardashian, her lawsuit against a radio DJ who allegedly groped her, and her recent feud with Scooter Braun. These controversies have impacted her career in several ways. First, they have made her a more polarizing figure in the music industry, with some fans supporting her and others criticizing her. Second, they have led to a decrease in her popularity among some listeners, particularly those who do not agree with her political views or her actions in the feuds. Finally, they have led to a decrease of her music being played on some radio stations, which has impacted her ability to reach new audiences
    
    

# 2. Tokenization - Instruction Style Question Answer Pairs

In [39]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

In [40]:
tokenizer.eos_token_id

0

In [9]:
def tokenize_instruction_qa(example, tokenizer, max_len = 2048):
    # Build prompt
    prompt = (
        f"Question: {example['question']} \n"
        f"Answer:"
    )

    answer = example["answer"]

    # Tokenize prompt and answer
    tok_prompt = tokenizer(prompt, add_special_tokens = True)
    tok_answer = tokenizer(answer, add_special_tokens = True)

    input_ids = tok_prompt["input_ids"] + tok_answer["input_ids"] + [tokenizer.eos_token_id]
    attention_mask = [1]*(len(input_ids))

    # Prepare labels (mask prompt tokens)
    labels = [-100] * len(tok_prompt["input_ids"]) + tok_answer["input_ids"] + [tokenizer.eos_token_id]


    # Truncate if needed
    if len(input_ids) > max_len:
        input_ids = input_ids[-max_len:]
        attention_mask = attention_mask[-max_len:]
        labels = labels[-max_len:]

    return {
        "input_ids": input_ids, 
        "attention_mask": attention_mask, 
        "labels": labels
    }


In [10]:
dataset = dataset.map(lambda x : tokenize_instruction_qa(example = x, tokenizer = tokenizer))

Map:   0%|          | 0/783 [00:00<?, ? examples/s]

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

# 3. Load the base Model 

In [45]:
base_model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-70m",
    device_map = "auto"
)

base_model.to(device) # load the base model to the gpu

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [42]:
base_model.config

GPTNeoXConfig {
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "dtype": "bfloat16",
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 8,
  "num_hidden_layers": 6,
  "partial_rotary_factor": 0.25,
  "rope_scaling": null,
  "rope_theta": 10000,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.3",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

In [None]:
def inference(test, model, tokenizer, max_input_tokens = 1000, max_output_tokens= 100):
    # Tokenize the inference text
    input_ids = tokenizer.encode(
        text, 
        return_tensors = "pt",
        truncation = True,
        max_length = max_input_tokens
    ).to(device)
    
    # Generate the Predictions / Next Token
    predicted_tokens =  model.generate(
        input_ids = input_ids, 
        max_length = max_output_tokens
    )

    # Decode the predicted tokens
    predicted_text =tokenizer.batch_decode(predicted_tokens, skip_special_tokens=True)

    return predicted_text


In [53]:
text = "What is the controversy surrounding Taylor Swift's music and how has it impacted her career?"
display(Markdown(inference(text, base_model, tokenizer)[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What is the controversy surrounding Taylor Swift's music and how has it impacted her career?

The controversy surrounding the song "The New York Times" is a great example of how the song is used to describe the song. The song is a song that is used to describe the song's lyrics. The song is a song that is used to describe the song's lyrics. The song is a song that is used to describe the song's lyrics. The song is a song that is used to describe

In [54]:
# We can clearly observe the response is getting repeated and not up to the mark, so lets how it's gonna answer post fine tuning the model ?

In [60]:
train_dataset, test_dataset = dataset["train"], dataset["test"]


# Just for the similicity we have divided the dataset into train, test dataset 
train_dataset = train_dataset.select_columns(["input_ids", "labels"])
test_dataset = test_dataset.select_columns(["input_ids", "labels"])


# 5. Define the Trainer Arguments

In [63]:
training_args = TrainingArguments(
    # Core training configuration
    seed = 40, # Random seed for initializing, ensuring reproduction
    optim = "adamw_torch", # Optimizer, here it's a AdamW implemented by pytorch
    max_steps = 300, # Number of maximum training steps 
    per_device_train_batch_size = 1, # Batch size per device during training 

    # Other training configuration
    learning_rate = 5e-05, # Initial learning rate for the optimizer
    weight_decay = 0, # Weight decay
    warmup_steps = 10, # Number of steps for the learning rate warmup phase 
    gradient_accumulation_steps = 2,  # Number of steps to accumulate gradients before updating
    bf16 = True, # Use brain float16 for training on supported hardware
    output_dir = "TalorSwiftFineTunedModel", # output dir location

    # logging configuration
    logging_steps = 3, # Frequency of logging training information
    report_to= None # Destination for logging (e.g.., WandB, TensorBoard)
)

In [64]:
# Initialize HuggingFace Trainer 

trainer = Trainer(
    base_model, 
    training_args, 
    train_dataset = train_dataset, 
    eval_dataset = test_dataset, 
)

trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.


Step,Training Loss
3,4.7454
6,5.091
9,3.7808
12,4.1449
15,4.7092
18,5.3805
21,3.8105
24,4.3509
27,4.3604
30,4.6457


TrainOutput(global_step=300, training_loss=3.772832309405009, metrics={'train_runtime': 42.1514, 'train_samples_per_second': 14.234, 'train_steps_per_second': 7.117, 'total_flos': 10420577255424.0, 'train_loss': 3.772832309405009, 'epoch': 0.7662835249042146})

# Save Model Locally

In [65]:
save_dir = 'TaylorShiftFineTunedModel/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: TaylorShiftFineTunedModel/final


In [94]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, 
                                                            dtype = torch.bfloat16,
                                                            local_files_only=True)

finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [76]:
def memory_info():
    " Function to check the total memory available"
    total_memory = torch.cuda.get_device_properties(device).total_memory # Total memory of the GPUs
    allocated_memory = torch.cuda.memory_allocated(device) # currently allocated memory by tensors
    reserved_memory = torch.cuda.memory_reserved(device) # memory reserved by the caching allocator 

    # Free memory (within reserved)
    free_mem = reserved_memory - allocated_memory

    print(f"Total GPU Memory: {total_memory/ 1024**3:.2f} GB")
    print(f"Allocated GPU Memory: {allocated_memory / 1024**3:.2f} GB")
    print(f"Reserved Memory: {reserved_memory / 1024**3:.2f} GB")
    print(f"Free (within reserved): {free_mem / 1024**3:.2f} GB")

memory_info()

Total GPU Memory: 14.74 GB
Allocated GPU Memory: 0.55 GB
Reserved Memory: 1.68 GB
Free (within reserved): 1.13 GB


In [80]:
finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [95]:
text = "What is the controversy surrounding Taylor Swift's music and how has it impacted her career?"
display(Markdown(inference(text, finetuned_slightly_model, tokenizer)[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


tensor([[ 1276,   310,   253, 16305,  8704, 11276, 24619,   434,  3440,   285,
           849,   556,   352, 27857,   617,  5249,    32]], device='cuda:0')
tensor([[ 1276,   310,   253, 16305,  8704, 11276, 24619,   434,  3440,   285,
           849,   556,   352, 27857,   617,  5249,    32,     0]],
       device='cuda:0')


What is the controversy surrounding Taylor Swift's music and how has it impacted her career?

In [90]:
def inference(test, model, tokenizer, max_input_tokens = 1000, max_output_tokens= 100):
    # Tokenize the inference text
    input_ids = tokenizer.encode(
        text, 
        return_tensors = "pt",
        truncation = True,
        max_length = max_input_tokens
    ).to(device)
    
    print(input_ids)
    # Generate the Predictions / Next Token
    predicted_tokens =  model.generate(
        input_ids = input_ids, 
        max_length = max_output_tokens
    )

    print(predicted_tokens)
    # Decode the predicted tokens
    predicted_text =tokenizer.batch_decode(predicted_tokens, skip_special_tokens=True)

    return predicted_text
