In [93]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
import torch

In [94]:
# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [95]:
def load_and_process_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    dialogues = [line.strip().split('|||') for line in lines if line.strip()]
    # Process each dialogue pair
    processed_examples = []
    for dialogue in dialogues:
        customer, agent = dialogue
        combined_text = customer + tokenizer.eos_token + agent + tokenizer.eos_token
        tokenized = tokenizer(combined_text, truncation=True, max_length=512, padding='max_length')
        # Ensure the 'labels' are correctly set for loss computation
        tokenized['labels'] = tokenized['input_ids'].copy()
        processed_examples.append(tokenized)

    # Creating a DataFrame
    df = pd.DataFrame(processed_examples)

    train_df, eval_df = train_test_split(df, test_size=0.1)
    train_dataset = Dataset.from_pandas(train_df)
    eval_dataset = Dataset.from_pandas(eval_df)

    return train_dataset, eval_dataset

In [96]:
# Custom collate function to ensure dynamic padding
def custom_collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    labels = [item['labels'] for item in batch]

    input_ids_padded = pad_sequence([torch.tensor(seq) for seq in input_ids], 
                                    batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence([torch.tensor(seq) for seq in labels], 
                                 batch_first=True, padding_value=tokenizer.pad_token_id)

    return {'input_ids': input_ids_padded, 'labels': labels_padded}


In [97]:
# Load and process the dataset
train_dataset, eval_dataset = load_and_process_data('nsp_fine_tuning_dataset.txt')


In [98]:
import distutils
print("Distutils module is found!")


Distutils module is found!


In [99]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=custom_collate_fn,  # Use the custom collate function
)


In [100]:
import os
os.environ["WANDB_NOTEBOOK_NAME"] = "gpt2_nsp_trainer.ipynb"


In [101]:
import wandb
wandb.login(key="a9433891c8b370b61ad36f3b5c379d41d79ff4dd")




True

In [102]:
# Train the model
trainer.train()

  2%|▏         | 10/540 [01:24<1:14:24,  8.42s/it]

{'loss': 12.043, 'grad_norm': 243.45953369140625, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.06}


  4%|▎         | 20/540 [02:50<1:15:16,  8.69s/it]

{'loss': 8.8652, 'grad_norm': 243.82521057128906, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.11}


  6%|▌         | 30/540 [04:19<1:14:09,  8.72s/it]

{'loss': 3.2441, 'grad_norm': 68.19136047363281, 'learning_rate': 3e-06, 'epoch': 0.17}


  7%|▋         | 40/540 [05:46<1:11:58,  8.64s/it]

{'loss': 0.767, 'grad_norm': 10.732428550720215, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.22}


  9%|▉         | 50/540 [07:15<1:11:20,  8.74s/it]

{'loss': 0.3361, 'grad_norm': 3.365427017211914, 'learning_rate': 5e-06, 'epoch': 0.28}


 11%|█         | 60/540 [08:44<1:10:29,  8.81s/it]

{'loss': 0.2211, 'grad_norm': 1.7395522594451904, 'learning_rate': 6e-06, 'epoch': 0.33}


 13%|█▎        | 70/540 [10:27<1:20:15, 10.25s/it]

{'loss': 0.1922, 'grad_norm': 1.655815601348877, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.39}


 15%|█▍        | 80/540 [11:59<1:10:37,  9.21s/it]

{'loss': 0.1839, 'grad_norm': 1.5169758796691895, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.44}


 17%|█▋        | 90/540 [13:29<1:07:49,  9.04s/it]

{'loss': 0.1534, 'grad_norm': 1.647887945175171, 'learning_rate': 9e-06, 'epoch': 0.5}


 19%|█▊        | 100/540 [15:25<1:28:48, 12.11s/it]

{'loss': 0.1428, 'grad_norm': 1.6191424131393433, 'learning_rate': 1e-05, 'epoch': 0.56}


 20%|██        | 110/540 [18:01<2:08:00, 17.86s/it]

{'loss': 0.1329, 'grad_norm': 1.8863918781280518, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.61}


 22%|██▏       | 120/540 [20:10<1:22:36, 11.80s/it]

{'loss': 0.131, 'grad_norm': 1.7237969636917114, 'learning_rate': 1.2e-05, 'epoch': 0.67}


 24%|██▍       | 130/540 [23:07<1:59:44, 17.52s/it]

{'loss': 0.1093, 'grad_norm': 1.3728928565979004, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.72}


 26%|██▌       | 140/540 [25:45<1:43:16, 15.49s/it]

{'loss': 0.1099, 'grad_norm': 1.3631051778793335, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.78}


 28%|██▊       | 150/540 [27:16<58:46,  9.04s/it]  

{'loss': 0.1026, 'grad_norm': 1.3798378705978394, 'learning_rate': 1.5e-05, 'epoch': 0.83}


 30%|██▉       | 160/540 [28:52<59:35,  9.41s/it]  

{'loss': 0.1061, 'grad_norm': 1.5488201379776, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.89}


 31%|███▏      | 170/540 [30:18<53:50,  8.73s/it]

{'loss': 0.1084, 'grad_norm': 1.915779709815979, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.94}


 33%|███▎      | 180/540 [31:45<52:12,  8.70s/it]

{'loss': 0.0987, 'grad_norm': 1.4600424766540527, 'learning_rate': 1.8e-05, 'epoch': 1.0}


                                                 
 33%|███▎      | 180/540 [32:52<52:12,  8.70s/it]

{'eval_loss': 0.08530990779399872, 'eval_runtime': 66.6474, 'eval_samples_per_second': 0.6, 'eval_steps_per_second': 0.3, 'epoch': 1.0}


 35%|███▌      | 190/540 [34:20<56:32,  9.69s/it]  

{'loss': 0.0961, 'grad_norm': 1.4941197633743286, 'learning_rate': 1.9e-05, 'epoch': 1.06}


 37%|███▋      | 200/540 [35:56<58:36, 10.34s/it]

{'loss': 0.0873, 'grad_norm': 1.4442427158355713, 'learning_rate': 2e-05, 'epoch': 1.11}


 39%|███▉      | 210/540 [37:39<55:49, 10.15s/it]

{'loss': 0.076, 'grad_norm': 1.3697291612625122, 'learning_rate': 2.1e-05, 'epoch': 1.17}


 41%|████      | 220/540 [39:01<43:04,  8.08s/it]

{'loss': 0.0811, 'grad_norm': 1.6251477003097534, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.22}


 43%|████▎     | 230/540 [40:24<42:38,  8.25s/it]

{'loss': 0.074, 'grad_norm': 1.5778028964996338, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.28}


 44%|████▍     | 240/540 [41:45<40:48,  8.16s/it]

{'loss': 0.0753, 'grad_norm': 1.3190491199493408, 'learning_rate': 2.4e-05, 'epoch': 1.33}


 46%|████▋     | 250/540 [43:06<40:14,  8.33s/it]

{'loss': 0.0865, 'grad_norm': 1.6062391996383667, 'learning_rate': 2.5e-05, 'epoch': 1.39}


 48%|████▊     | 260/540 [44:31<40:11,  8.61s/it]

{'loss': 0.0933, 'grad_norm': 1.540124773979187, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.44}


 50%|█████     | 270/540 [45:57<39:19,  8.74s/it]

{'loss': 0.0791, 'grad_norm': 1.3585317134857178, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.5}


 52%|█████▏    | 280/540 [47:25<37:44,  8.71s/it]

{'loss': 0.0838, 'grad_norm': 1.5323821306228638, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.56}


 54%|█████▎    | 290/540 [48:51<35:51,  8.61s/it]

{'loss': 0.0769, 'grad_norm': 1.6705513000488281, 'learning_rate': 2.9e-05, 'epoch': 1.61}


 56%|█████▌    | 300/540 [50:18<34:45,  8.69s/it]

{'loss': 0.0767, 'grad_norm': 1.788601040840149, 'learning_rate': 3e-05, 'epoch': 1.67}


 57%|█████▋    | 310/540 [51:47<33:48,  8.82s/it]

{'loss': 0.0796, 'grad_norm': 1.3999173641204834, 'learning_rate': 3.1e-05, 'epoch': 1.72}


 59%|█████▉    | 320/540 [53:20<34:55,  9.52s/it]

{'loss': 0.0754, 'grad_norm': 1.6627240180969238, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.78}


 61%|██████    | 330/540 [54:48<30:42,  8.77s/it]

{'loss': 0.0877, 'grad_norm': 1.4971076250076294, 'learning_rate': 3.3e-05, 'epoch': 1.83}


 63%|██████▎   | 340/540 [56:18<29:52,  8.96s/it]

{'loss': 0.074, 'grad_norm': 1.6685205698013306, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.89}


 65%|██████▍   | 350/540 [57:47<28:07,  8.88s/it]

{'loss': 0.0809, 'grad_norm': 1.5145922899246216, 'learning_rate': 3.5e-05, 'epoch': 1.94}


 67%|██████▋   | 360/540 [59:18<27:10,  9.06s/it]

{'loss': 0.0708, 'grad_norm': 1.331672191619873, 'learning_rate': 3.6e-05, 'epoch': 2.0}


                                                 
 67%|██████▋   | 360/540 [1:00:25<27:10,  9.06s/it]

{'eval_loss': 0.06848088651895523, 'eval_runtime': 66.9663, 'eval_samples_per_second': 0.597, 'eval_steps_per_second': 0.299, 'epoch': 2.0}


 69%|██████▊   | 370/540 [1:02:00<28:53, 10.20s/it]  

{'loss': 0.064, 'grad_norm': 1.562493085861206, 'learning_rate': 3.7e-05, 'epoch': 2.06}


 70%|███████   | 380/540 [1:03:28<23:32,  8.83s/it]

{'loss': 0.0645, 'grad_norm': 1.2548935413360596, 'learning_rate': 3.8e-05, 'epoch': 2.11}


 72%|███████▏  | 390/540 [1:04:57<22:18,  8.92s/it]

{'loss': 0.0763, 'grad_norm': 1.248025894165039, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.17}


 74%|███████▍  | 400/540 [1:06:25<20:42,  8.88s/it]

{'loss': 0.0612, 'grad_norm': 1.4639606475830078, 'learning_rate': 4e-05, 'epoch': 2.22}


 76%|███████▌  | 410/540 [1:07:54<19:25,  8.96s/it]

{'loss': 0.0641, 'grad_norm': 1.306800365447998, 'learning_rate': 4.1e-05, 'epoch': 2.28}


 78%|███████▊  | 420/540 [1:09:23<17:10,  8.58s/it]

{'loss': 0.0684, 'grad_norm': 1.316096305847168, 'learning_rate': 4.2e-05, 'epoch': 2.33}


 80%|███████▉  | 430/540 [1:10:51<16:08,  8.80s/it]

{'loss': 0.06, 'grad_norm': 1.5085415840148926, 'learning_rate': 4.3e-05, 'epoch': 2.39}


 81%|████████▏ | 440/540 [1:12:18<14:26,  8.67s/it]

{'loss': 0.0734, 'grad_norm': 1.4878418445587158, 'learning_rate': 4.4000000000000006e-05, 'epoch': 2.44}


 83%|████████▎ | 450/540 [1:13:47<13:12,  8.80s/it]

{'loss': 0.0713, 'grad_norm': 1.7157407999038696, 'learning_rate': 4.5e-05, 'epoch': 2.5}


 85%|████████▌ | 460/540 [1:15:17<11:45,  8.82s/it]

{'loss': 0.0637, 'grad_norm': 1.7784125804901123, 'learning_rate': 4.600000000000001e-05, 'epoch': 2.56}


 87%|████████▋ | 470/540 [1:16:46<10:13,  8.76s/it]

{'loss': 0.0665, 'grad_norm': 1.3821359872817993, 'learning_rate': 4.7e-05, 'epoch': 2.61}


 89%|████████▉ | 480/540 [1:18:15<08:54,  8.91s/it]

{'loss': 0.0648, 'grad_norm': 1.3174395561218262, 'learning_rate': 4.8e-05, 'epoch': 2.67}


 91%|█████████ | 490/540 [1:19:44<07:20,  8.82s/it]

{'loss': 0.0678, 'grad_norm': 1.6454460620880127, 'learning_rate': 4.9e-05, 'epoch': 2.72}


 93%|█████████▎| 500/540 [1:21:13<05:56,  8.90s/it]

{'loss': 0.058, 'grad_norm': 1.4554704427719116, 'learning_rate': 5e-05, 'epoch': 2.78}


 94%|█████████▍| 510/540 [1:22:45<04:26,  8.87s/it]

{'loss': 0.0601, 'grad_norm': 1.1444480419158936, 'learning_rate': 3.7500000000000003e-05, 'epoch': 2.83}


 96%|█████████▋| 520/540 [1:24:13<02:54,  8.71s/it]

{'loss': 0.0643, 'grad_norm': 1.5912991762161255, 'learning_rate': 2.5e-05, 'epoch': 2.89}


 98%|█████████▊| 530/540 [1:25:41<01:27,  8.79s/it]

{'loss': 0.0598, 'grad_norm': 1.4543975591659546, 'learning_rate': 1.25e-05, 'epoch': 2.94}


100%|██████████| 540/540 [1:27:10<00:00,  8.75s/it]

{'loss': 0.0543, 'grad_norm': 1.0225006341934204, 'learning_rate': 0.0, 'epoch': 3.0}


                                                   
100%|██████████| 540/540 [1:28:15<00:00,  9.81s/it]

{'eval_loss': 0.06480729579925537, 'eval_runtime': 65.5515, 'eval_samples_per_second': 0.61, 'eval_steps_per_second': 0.305, 'epoch': 3.0}
{'train_runtime': 5295.6435, 'train_samples_per_second': 0.204, 'train_steps_per_second': 0.102, 'train_loss': 0.549343987637096, 'epoch': 3.0}





TrainOutput(global_step=540, training_loss=0.549343987637096, metrics={'train_runtime': 5295.6435, 'train_samples_per_second': 0.204, 'train_steps_per_second': 0.102, 'train_loss': 0.549343987637096, 'epoch': 3.0})

In [103]:
# Save the fine-tuned model
model_path = './fine_tuned_gpt2'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('./fine_tuned_gpt2\\tokenizer_config.json',
 './fine_tuned_gpt2\\special_tokens_map.json',
 './fine_tuned_gpt2\\vocab.json',
 './fine_tuned_gpt2\\merges.txt',
 './fine_tuned_gpt2\\added_tokens.json')