In [1]:
from datasets import load_dataset

dataset  = load_dataset("csv",data_files="./Datasets/dataset_set1/job_postings.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['job_link', 'last_processed_time', 'last_status', 'got_summary', 'got_ner', 'is_being_worked', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'search_position', 'job_level', 'job_type'],
        num_rows: 12217
    })
})

In [3]:
from transformers import MT5Tokenizer

In [4]:
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
import torch

In [6]:
print(torch.__version__)

2.4.1+cu121


In [7]:
def tokenize(data):
    input_text = [
        f"Job Title: {job_title} | "
        f"Company: {company} | "
        f"Location: {job_location} | "
        f"First Seen: {first_seen} | "
        f"Search City: {search_city} | "
        f"Search Country: {search_country} | "
        f"Search Position: {search_position} | "
        f"Job Level: {job_level} | "
        f"Job Type: {job_type}"
        for job_title, company, job_location, first_seen, search_city, search_country, search_position, job_level, job_type
        in zip(data['job_title'], data['company'], data['job_location'], data['first_seen'], data['search_city'], data['search_country'], data['search_position'], data['job_level'], data['job_type'])
    ]
    
    # Tokenize input text
    input_feature = tokenizer(input_text, truncation=True, max_length=512, padding="max_length")
    
    # Tokenize labels
    labels = tokenizer(data['job_title'], truncation=True, max_length=512, padding="max_length")
    
    # Return the tokenized inputs and labels
    return {
        "input_ids": input_feature['input_ids'],
        "attention_mask": input_feature['attention_mask'],
        "labels": labels['input_ids']
    }

In [8]:
tokenized_dataset = dataset.map(tokenize,batched=True)

In [9]:
from transformers import (
    MT5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    MT5Config
)

In [10]:
device  = torch.device("cuda")

In [11]:
config = MT5Config.from_pretrained(
    "google/mt5-small",
    max_length = 512,
    length_penalty = 0.6,
    no_repeat_ngram_size = 2,
    num_beams = 15
)

In [12]:
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small",config=config).to(device)

In [13]:
for param in model.parameters():
    param.data = param.data.contiguous()

In [14]:
from transformers import DataCollatorForSeq2Seq

In [15]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors='pt',
    padding=True
)

In [16]:
training_args = Seq2SeqTrainingArguments(
  output_dir = "jobposting",
  log_level = "error",
  num_train_epochs = 7,
  learning_rate = 5e-5,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adafactor",
  weight_decay = 0.01,
  per_device_train_batch_size = 2,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  eval_strategy = "no",
  predict_with_generate=True,
  generation_max_length = 128,
  save_steps = 500,
  logging_steps = 10,
  push_to_hub = False,
  max_steps=850
 )

In [17]:
trainer = Seq2SeqTrainer(
  model = model,
  args = training_args,
  data_collator = data_collator,
  train_dataset = tokenized_dataset["train"],
  tokenizer = tokenizer
 )

trainer.train()

  1%|          | 10/850 [01:34<2:11:33,  9.40s/it]

{'loss': 66.1439, 'grad_norm': 915.1886596679688, 'learning_rate': 5.555555555555556e-06, 'epoch': 0.03}


  2%|▏         | 20/850 [03:10<2:14:04,  9.69s/it]

{'loss': 65.5645, 'grad_norm': 589.9074096679688, 'learning_rate': 1.1111111111111112e-05, 'epoch': 0.05}


  4%|▎         | 30/850 [04:46<2:11:41,  9.64s/it]

{'loss': 64.5476, 'grad_norm': 774.1900634765625, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.08}


  5%|▍         | 40/850 [06:23<2:10:26,  9.66s/it]

{'loss': 62.1484, 'grad_norm': 1202.516845703125, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.1}


  6%|▌         | 50/850 [08:00<2:10:28,  9.79s/it]

{'loss': 59.5994, 'grad_norm': 783.637451171875, 'learning_rate': 2.777777777777778e-05, 'epoch': 0.13}


  7%|▋         | 60/850 [09:40<2:12:47, 10.09s/it]

{'loss': 56.3958, 'grad_norm': 483.0676574707031, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.16}


  8%|▊         | 70/850 [11:22<2:12:06, 10.16s/it]

{'loss': 53.2672, 'grad_norm': 588.841796875, 'learning_rate': 3.888888888888889e-05, 'epoch': 0.18}


  9%|▉         | 80/850 [13:03<2:08:51, 10.04s/it]

{'loss': 49.7284, 'grad_norm': 760.266845703125, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.21}


 11%|█         | 90/850 [15:25<1:56:02,  9.16s/it]

{'loss': 46.0993, 'grad_norm': 1228.015625, 'learning_rate': 5e-05, 'epoch': 0.24}


 12%|█▏        | 100/850 [16:11<1:07:04,  5.37s/it]

{'loss': 42.9863, 'grad_norm': 2190.841552734375, 'learning_rate': 4.9342105263157894e-05, 'epoch': 0.26}


 13%|█▎        | 110/850 [17:16<1:20:29,  6.53s/it]

{'loss': 39.087, 'grad_norm': 823.7040405273438, 'learning_rate': 4.868421052631579e-05, 'epoch': 0.29}


 14%|█▍        | 120/850 [18:22<1:20:16,  6.60s/it]

{'loss': 35.0388, 'grad_norm': 997.8302612304688, 'learning_rate': 4.802631578947368e-05, 'epoch': 0.31}


 15%|█▌        | 130/850 [19:29<1:19:01,  6.59s/it]

{'loss': 32.19, 'grad_norm': 356.2626037597656, 'learning_rate': 4.736842105263158e-05, 'epoch': 0.34}


 16%|█▋        | 140/850 [20:36<1:19:23,  6.71s/it]

{'loss': 29.1273, 'grad_norm': 635.9467163085938, 'learning_rate': 4.671052631578948e-05, 'epoch': 0.37}


 18%|█▊        | 150/850 [21:43<1:17:14,  6.62s/it]

{'loss': 27.0262, 'grad_norm': 706.646484375, 'learning_rate': 4.605263157894737e-05, 'epoch': 0.39}


 19%|█▉        | 160/850 [22:50<1:18:02,  6.79s/it]

{'loss': 25.329, 'grad_norm': 504.3063049316406, 'learning_rate': 4.539473684210527e-05, 'epoch': 0.42}


 20%|██        | 170/850 [23:56<1:14:20,  6.56s/it]

{'loss': 23.5899, 'grad_norm': 292.9106750488281, 'learning_rate': 4.473684210526316e-05, 'epoch': 0.45}


 21%|██        | 180/850 [25:02<1:14:40,  6.69s/it]

{'loss': 22.0603, 'grad_norm': 3482.97412109375, 'learning_rate': 4.407894736842105e-05, 'epoch': 0.47}


 22%|██▏       | 190/850 [26:08<1:12:24,  6.58s/it]

{'loss': 20.9318, 'grad_norm': 754.2952880859375, 'learning_rate': 4.342105263157895e-05, 'epoch': 0.5}


 24%|██▎       | 200/850 [27:13<1:09:38,  6.43s/it]

{'loss': 19.9111, 'grad_norm': 813.4114990234375, 'learning_rate': 4.2763157894736847e-05, 'epoch': 0.52}


 25%|██▍       | 210/850 [28:19<1:09:35,  6.52s/it]

{'loss': 19.0155, 'grad_norm': 512.49267578125, 'learning_rate': 4.210526315789474e-05, 'epoch': 0.55}


 26%|██▌       | 220/850 [29:26<1:10:57,  6.76s/it]

{'loss': 17.9561, 'grad_norm': 250.10504150390625, 'learning_rate': 4.1447368421052636e-05, 'epoch': 0.58}


 27%|██▋       | 230/850 [30:31<1:07:53,  6.57s/it]

{'loss': 16.8271, 'grad_norm': 410.2787780761719, 'learning_rate': 4.078947368421053e-05, 'epoch': 0.6}


 28%|██▊       | 240/850 [31:36<1:04:14,  6.32s/it]

{'loss': 16.1589, 'grad_norm': 672.1450805664062, 'learning_rate': 4.0131578947368425e-05, 'epoch': 0.63}


 29%|██▉       | 250/850 [32:30<52:27,  5.25s/it]  

{'loss': 15.3644, 'grad_norm': 404.4461975097656, 'learning_rate': 3.9473684210526316e-05, 'epoch': 0.65}


 31%|███       | 260/850 [33:22<50:58,  5.18s/it]

{'loss': 14.5038, 'grad_norm': 331.5111083984375, 'learning_rate': 3.8815789473684214e-05, 'epoch': 0.68}


 32%|███▏      | 270/850 [34:14<50:04,  5.18s/it]

{'loss': 13.9698, 'grad_norm': 339.63897705078125, 'learning_rate': 3.815789473684211e-05, 'epoch': 0.71}


 33%|███▎      | 280/850 [35:06<49:11,  5.18s/it]

{'loss': 13.3671, 'grad_norm': 475.5340270996094, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.73}


 34%|███▍      | 290/850 [35:58<48:33,  5.20s/it]

{'loss': 12.5037, 'grad_norm': 160.60617065429688, 'learning_rate': 3.6842105263157895e-05, 'epoch': 0.76}


 35%|███▌      | 300/850 [36:50<47:39,  5.20s/it]

{'loss': 11.7422, 'grad_norm': 249.01382446289062, 'learning_rate': 3.618421052631579e-05, 'epoch': 0.79}


 36%|███▋      | 310/850 [37:50<56:59,  6.33s/it]

{'loss': 11.4701, 'grad_norm': 318.69427490234375, 'learning_rate': 3.5526315789473684e-05, 'epoch': 0.81}


 38%|███▊      | 320/850 [38:56<58:49,  6.66s/it]

{'loss': 11.1191, 'grad_norm': 511.43206787109375, 'learning_rate': 3.4868421052631575e-05, 'epoch': 0.84}


 39%|███▉      | 330/850 [40:00<56:12,  6.49s/it]

{'loss': 10.7724, 'grad_norm': 765.2689819335938, 'learning_rate': 3.421052631578947e-05, 'epoch': 0.86}


 40%|████      | 340/850 [41:06<56:44,  6.68s/it]

{'loss': 10.1541, 'grad_norm': 367.3535461425781, 'learning_rate': 3.355263157894737e-05, 'epoch': 0.89}


 41%|████      | 350/850 [42:12<54:05,  6.49s/it]

{'loss': 9.9019, 'grad_norm': 279.69781494140625, 'learning_rate': 3.289473684210527e-05, 'epoch': 0.92}


 42%|████▏     | 360/850 [43:17<52:56,  6.48s/it]

{'loss': 9.4106, 'grad_norm': 410.31927490234375, 'learning_rate': 3.223684210526316e-05, 'epoch': 0.94}


 44%|████▎     | 370/850 [44:23<52:54,  6.61s/it]

{'loss': 9.1088, 'grad_norm': 125.10287475585938, 'learning_rate': 3.157894736842105e-05, 'epoch': 0.97}


 45%|████▍     | 380/850 [45:30<51:37,  6.59s/it]

{'loss': 8.7213, 'grad_norm': 233.211181640625, 'learning_rate': 3.092105263157895e-05, 'epoch': 1.0}


 46%|████▌     | 390/850 [47:35<1:43:46, 13.54s/it]

{'loss': 8.3762, 'grad_norm': 204.47799682617188, 'learning_rate': 3.0263157894736844e-05, 'epoch': 1.02}


 47%|████▋     | 400/850 [49:56<1:46:06, 14.15s/it]

{'loss': 8.0527, 'grad_norm': 350.72857666015625, 'learning_rate': 2.9605263157894735e-05, 'epoch': 1.05}


 48%|████▊     | 410/850 [1:05:24<11:33:28, 94.56s/it] 

{'loss': 7.6647, 'grad_norm': 270.46539306640625, 'learning_rate': 2.8947368421052634e-05, 'epoch': 1.07}


 49%|████▉     | 420/850 [1:07:29<1:46:05, 14.80s/it] 

{'loss': 7.4362, 'grad_norm': 176.18353271484375, 'learning_rate': 2.8289473684210528e-05, 'epoch': 1.1}


 51%|█████     | 430/850 [1:09:34<1:27:53, 12.56s/it]

{'loss': 7.162, 'grad_norm': 123.52598571777344, 'learning_rate': 2.7631578947368426e-05, 'epoch': 1.13}


 52%|█████▏    | 440/850 [1:14:53<2:19:54, 20.47s/it]

{'loss': 7.0132, 'grad_norm': 195.9721221923828, 'learning_rate': 2.6973684210526317e-05, 'epoch': 1.15}


 53%|█████▎    | 450/850 [1:17:15<1:36:24, 14.46s/it]

{'loss': 6.7247, 'grad_norm': 536.0257568359375, 'learning_rate': 2.6315789473684212e-05, 'epoch': 1.18}


 54%|█████▍    | 460/850 [1:19:38<1:32:21, 14.21s/it]

{'loss': 6.5464, 'grad_norm': 203.81809997558594, 'learning_rate': 2.565789473684211e-05, 'epoch': 1.2}


 55%|█████▌    | 470/850 [2:11:20<23:54:43, 226.54s/it]

{'loss': 6.4049, 'grad_norm': 208.16595458984375, 'learning_rate': 2.5e-05, 'epoch': 1.23}


 56%|█████▋    | 480/850 [2:13:25<1:54:46, 18.61s/it]  

{'loss': 6.3727, 'grad_norm': 354.71563720703125, 'learning_rate': 2.4342105263157896e-05, 'epoch': 1.26}


 58%|█████▊    | 490/850 [2:15:32<1:16:29, 12.75s/it]

{'loss': 6.1926, 'grad_norm': 809.0048217773438, 'learning_rate': 2.368421052631579e-05, 'epoch': 1.28}


 59%|█████▉    | 500/850 [2:19:12<1:22:44, 14.19s/it]

{'loss': 5.9646, 'grad_norm': 261.3091125488281, 'learning_rate': 2.3026315789473685e-05, 'epoch': 1.31}


 60%|██████    | 510/850 [2:21:19<1:11:26, 12.61s/it]

{'loss': 5.8703, 'grad_norm': 180.155517578125, 'learning_rate': 2.236842105263158e-05, 'epoch': 1.34}


 61%|██████    | 520/850 [2:23:27<1:10:20, 12.79s/it]

{'loss': 5.839, 'grad_norm': 305.6832275390625, 'learning_rate': 2.1710526315789474e-05, 'epoch': 1.36}


 62%|██████▏   | 530/850 [2:25:33<1:07:16, 12.61s/it]

{'loss': 5.6965, 'grad_norm': 221.2583465576172, 'learning_rate': 2.105263157894737e-05, 'epoch': 1.39}


 64%|██████▎   | 540/850 [2:27:39<1:05:00, 12.58s/it]

{'loss': 5.7323, 'grad_norm': 92.04010772705078, 'learning_rate': 2.0394736842105264e-05, 'epoch': 1.41}


 65%|██████▍   | 550/850 [2:38:40<10:25:50, 125.17s/it]

{'loss': 5.5257, 'grad_norm': 128.06019592285156, 'learning_rate': 1.9736842105263158e-05, 'epoch': 1.44}


 66%|██████▌   | 560/850 [2:40:51<1:18:43, 16.29s/it]  

{'loss': 5.6393, 'grad_norm': 273.4971923828125, 'learning_rate': 1.9078947368421056e-05, 'epoch': 1.47}


 67%|██████▋   | 570/850 [2:43:04<1:02:50, 13.47s/it]

{'loss': 5.3247, 'grad_norm': 152.48362731933594, 'learning_rate': 1.8421052631578947e-05, 'epoch': 1.49}


 68%|██████▊   | 580/850 [2:45:25<1:03:31, 14.12s/it]

{'loss': 5.3534, 'grad_norm': 131.63577270507812, 'learning_rate': 1.7763157894736842e-05, 'epoch': 1.52}


 69%|██████▉   | 590/850 [2:47:47<1:02:27, 14.41s/it]

{'loss': 5.2199, 'grad_norm': 261.5735778808594, 'learning_rate': 1.7105263157894737e-05, 'epoch': 1.55}


 71%|███████   | 600/850 [2:50:09<58:32, 14.05s/it]  

{'loss': 5.1166, 'grad_norm': 121.87931060791016, 'learning_rate': 1.6447368421052635e-05, 'epoch': 1.57}


 72%|███████▏  | 610/850 [2:52:30<56:15, 14.07s/it]

{'loss': 5.1072, 'grad_norm': 1062.4971923828125, 'learning_rate': 1.5789473684210526e-05, 'epoch': 1.6}


 73%|███████▎  | 620/850 [2:54:49<53:37, 13.99s/it]

{'loss': 4.9669, 'grad_norm': 66.48320770263672, 'learning_rate': 1.5131578947368422e-05, 'epoch': 1.62}


 74%|███████▍  | 630/850 [2:57:10<51:44, 14.11s/it]

{'loss': 4.9835, 'grad_norm': 92.6869125366211, 'learning_rate': 1.4473684210526317e-05, 'epoch': 1.65}


 75%|███████▌  | 640/850 [2:59:32<49:43, 14.21s/it]

{'loss': 4.8705, 'grad_norm': 122.13125610351562, 'learning_rate': 1.3815789473684213e-05, 'epoch': 1.68}


 76%|███████▋  | 650/850 [3:01:56<48:08, 14.44s/it]

{'loss': 4.754, 'grad_norm': 104.89081573486328, 'learning_rate': 1.3157894736842106e-05, 'epoch': 1.7}


 78%|███████▊  | 660/850 [3:04:23<46:43, 14.75s/it]

{'loss': 4.7981, 'grad_norm': 70.44374084472656, 'learning_rate': 1.25e-05, 'epoch': 1.73}


 79%|███████▉  | 670/850 [3:06:57<46:13, 15.41s/it]

{'loss': 4.6433, 'grad_norm': 152.33326721191406, 'learning_rate': 1.1842105263157895e-05, 'epoch': 1.75}


 80%|████████  | 680/850 [3:09:30<43:29, 15.35s/it]

{'loss': 4.5633, 'grad_norm': 113.3302001953125, 'learning_rate': 1.118421052631579e-05, 'epoch': 1.78}


 81%|████████  | 690/850 [3:12:13<44:08, 16.55s/it]

{'loss': 4.5701, 'grad_norm': 147.31155395507812, 'learning_rate': 1.0526315789473684e-05, 'epoch': 1.81}


 82%|████████▏ | 700/850 [3:14:58<41:39, 16.66s/it]

{'loss': 4.573, 'grad_norm': 167.2513885498047, 'learning_rate': 9.868421052631579e-06, 'epoch': 1.83}


 84%|████████▎ | 710/850 [3:17:42<38:16, 16.40s/it]

{'loss': 4.4794, 'grad_norm': 697.8075561523438, 'learning_rate': 9.210526315789474e-06, 'epoch': 1.86}


 85%|████████▍ | 720/850 [3:20:27<35:31, 16.39s/it]

{'loss': 4.4488, 'grad_norm': 159.74681091308594, 'learning_rate': 8.552631578947368e-06, 'epoch': 1.89}


 86%|████████▌ | 730/850 [3:23:21<34:53, 17.44s/it]

{'loss': 4.4321, 'grad_norm': 99.47869873046875, 'learning_rate': 7.894736842105263e-06, 'epoch': 1.91}


 87%|████████▋ | 740/850 [3:26:14<31:13, 17.03s/it]

{'loss': 4.3322, 'grad_norm': 325.9442443847656, 'learning_rate': 7.236842105263158e-06, 'epoch': 1.94}


 88%|████████▊ | 750/850 [3:29:11<29:34, 17.75s/it]

{'loss': 4.3341, 'grad_norm': 844.70947265625, 'learning_rate': 6.578947368421053e-06, 'epoch': 1.96}


 89%|████████▉ | 760/850 [3:32:23<29:49, 19.88s/it]

{'loss': 4.3648, 'grad_norm': 113.9990005493164, 'learning_rate': 5.921052631578948e-06, 'epoch': 1.99}


 91%|█████████ | 770/850 [3:35:40<26:37, 19.97s/it]

{'loss': 4.3025, 'grad_norm': 141.80380249023438, 'learning_rate': 5.263157894736842e-06, 'epoch': 2.02}


 92%|█████████▏| 780/850 [3:38:18<15:53, 13.62s/it]

{'loss': 4.3368, 'grad_norm': 291.4134216308594, 'learning_rate': 4.605263157894737e-06, 'epoch': 2.04}


 93%|█████████▎| 790/850 [3:40:23<12:36, 12.61s/it]

{'loss': 4.365, 'grad_norm': 210.81004333496094, 'learning_rate': 3.9473684210526315e-06, 'epoch': 2.07}


 94%|█████████▍| 800/850 [3:42:47<13:04, 15.69s/it]

{'loss': 4.1888, 'grad_norm': 100.61246490478516, 'learning_rate': 3.2894736842105265e-06, 'epoch': 2.1}


 95%|█████████▌| 810/850 [3:45:05<09:20, 14.00s/it]

{'loss': 4.2454, 'grad_norm': 67.93109130859375, 'learning_rate': 2.631578947368421e-06, 'epoch': 2.12}


 96%|█████████▋| 820/850 [3:47:32<07:20, 14.68s/it]

{'loss': 4.1922, 'grad_norm': 445.71881103515625, 'learning_rate': 1.9736842105263157e-06, 'epoch': 2.15}


 98%|█████████▊| 830/850 [3:49:56<05:10, 15.50s/it]

{'loss': 4.1922, 'grad_norm': 204.5904083251953, 'learning_rate': 1.3157894736842106e-06, 'epoch': 2.17}


 99%|█████████▉| 840/850 [3:52:20<02:24, 14.41s/it]

{'loss': 4.212, 'grad_norm': 765.9496459960938, 'learning_rate': 6.578947368421053e-07, 'epoch': 2.2}


100%|██████████| 850/850 [3:54:44<00:00, 14.52s/it]

{'loss': 4.209, 'grad_norm': 140.33139038085938, 'learning_rate': 0.0, 'epoch': 2.23}


100%|██████████| 850/850 [3:54:53<00:00, 16.58s/it]

{'train_runtime': 14093.0813, 'train_samples_per_second': 1.93, 'train_steps_per_second': 0.06, 'train_loss': 15.582739186006433, 'epoch': 2.23}





TrainOutput(global_step=850, training_loss=15.582739186006433, metrics={'train_runtime': 14093.0813, 'train_samples_per_second': 1.93, 'train_steps_per_second': 0.06, 'total_flos': 1.438093409255424e+16, 'train_loss': 15.582739186006433, 'epoch': 2.2262236045179242})

In [18]:
model.save_pretrained("D:/new_finetuned_model")
tokenizer.save_pretrained("D:/new_finetuned_model")

('D:/new_finetuned_model\\tokenizer_config.json',
 'D:/new_finetuned_model\\special_tokens_map.json',
 'D:/new_finetuned_model\\spiece.model',
 'D:/new_finetuned_model\\added_tokens.json')