## Text Classification using Transformers and Hugging Face

### This project we're going to train a model to classify text data into predefined categories using hugging face's transformers library

In [1]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('ag_news')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)

def preprocess_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# Apply the tokenizer to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 120000/120000 [01:32<00:00, 1293.84 examples/s]
Map: 100%|██████████| 7600/7600 [00:05<00:00, 1299.69 examples/s]


In [3]:
# Define training arguments with reduced batch size
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,  # Reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

# Train the model
trainer.train()


  1%|          | 500/45000 [01:59<2:53:45,  4.27it/s]

{'loss': 0.465, 'grad_norm': 8.757072448730469, 'learning_rate': 1.977777777777778e-05, 'epoch': 0.03}


  2%|▏         | 1000/45000 [04:04<3:00:47,  4.06it/s]

{'loss': 0.3499, 'grad_norm': 0.38300976157188416, 'learning_rate': 1.9555555555555557e-05, 'epoch': 0.07}


  3%|▎         | 1500/45000 [06:10<2:53:46,  4.17it/s] 

{'loss': 0.3179, 'grad_norm': 14.141056060791016, 'learning_rate': 1.9333333333333333e-05, 'epoch': 0.1}


  4%|▍         | 2000/45000 [08:15<2:44:00,  4.37it/s] 

{'loss': 0.2989, 'grad_norm': 0.12236113101243973, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.13}


  6%|▌         | 2500/45000 [10:20<2:43:41,  4.33it/s] 

{'loss': 0.2849, 'grad_norm': 12.095422744750977, 'learning_rate': 1.888888888888889e-05, 'epoch': 0.17}


  7%|▋         | 3000/45000 [12:27<2:50:56,  4.09it/s] 

{'loss': 0.2726, 'grad_norm': 4.906160831451416, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.2}


  8%|▊         | 3500/45000 [14:30<2:49:19,  4.08it/s]

{'loss': 0.2719, 'grad_norm': 14.890785217285156, 'learning_rate': 1.8444444444444448e-05, 'epoch': 0.23}


  9%|▉         | 4000/45000 [16:31<2:37:43,  4.33it/s]

{'loss': 0.2471, 'grad_norm': 5.306872844696045, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.27}


 10%|█         | 4500/45000 [18:29<2:42:14,  4.16it/s]

{'loss': 0.2816, 'grad_norm': 0.19968682527542114, 'learning_rate': 1.8e-05, 'epoch': 0.3}


 11%|█         | 5000/45000 [20:30<2:40:08,  4.16it/s]

{'loss': 0.2587, 'grad_norm': 0.2609948217868805, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.33}


 12%|█▏        | 5500/45000 [22:31<2:36:19,  4.21it/s]

{'loss': 0.2801, 'grad_norm': 9.425701141357422, 'learning_rate': 1.7555555555555556e-05, 'epoch': 0.37}


 13%|█▎        | 6000/45000 [24:32<2:30:49,  4.31it/s]

{'loss': 0.2803, 'grad_norm': 37.63977813720703, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}


 14%|█▍        | 6500/45000 [26:29<2:49:55,  3.78it/s]

{'loss': 0.2197, 'grad_norm': 2.0378081798553467, 'learning_rate': 1.7111111111111112e-05, 'epoch': 0.43}


 16%|█▌        | 7000/45000 [28:32<2:37:13,  4.03it/s]

{'loss': 0.2585, 'grad_norm': 4.7588419914245605, 'learning_rate': 1.688888888888889e-05, 'epoch': 0.47}


 17%|█▋        | 7500/45000 [30:40<2:32:31,  4.10it/s]

{'loss': 0.2344, 'grad_norm': 2.234173536300659, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.5}


 18%|█▊        | 8000/45000 [32:46<2:23:19,  4.30it/s]

{'loss': 0.2538, 'grad_norm': 13.381269454956055, 'learning_rate': 1.6444444444444444e-05, 'epoch': 0.53}


 19%|█▉        | 8500/45000 [34:43<2:24:57,  4.20it/s]

{'loss': 0.2364, 'grad_norm': 0.23759059607982635, 'learning_rate': 1.6222222222222223e-05, 'epoch': 0.57}


 20%|██        | 9000/45000 [36:41<2:19:23,  4.30it/s]

{'loss': 0.2432, 'grad_norm': 0.25458353757858276, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.6}


 21%|██        | 9500/45000 [38:37<2:05:32,  4.71it/s]

{'loss': 0.2235, 'grad_norm': 0.2344091385602951, 'learning_rate': 1.577777777777778e-05, 'epoch': 0.63}


 22%|██▏       | 10000/45000 [40:36<2:15:09,  4.32it/s]

{'loss': 0.2319, 'grad_norm': 3.503018856048584, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.67}


 23%|██▎       | 10500/45000 [42:34<2:06:22,  4.55it/s]

{'loss': 0.2268, 'grad_norm': 2.3066134452819824, 'learning_rate': 1.5333333333333334e-05, 'epoch': 0.7}


 24%|██▍       | 11000/45000 [44:26<2:04:33,  4.55it/s]

{'loss': 0.2338, 'grad_norm': 0.8676392436027527, 'learning_rate': 1.5111111111111112e-05, 'epoch': 0.73}


 26%|██▌       | 11500/45000 [46:17<2:03:16,  4.53it/s]

{'loss': 0.2325, 'grad_norm': 0.07894893735647202, 'learning_rate': 1.488888888888889e-05, 'epoch': 0.77}


 27%|██▋       | 12000/45000 [48:09<2:05:31,  4.38it/s]

{'loss': 0.2161, 'grad_norm': 0.32595717906951904, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}


 28%|██▊       | 12500/45000 [50:01<1:59:01,  4.55it/s]

{'loss': 0.2296, 'grad_norm': 0.6080320477485657, 'learning_rate': 1.4444444444444446e-05, 'epoch': 0.83}


 29%|██▉       | 13000/45000 [51:52<1:56:48,  4.57it/s]

{'loss': 0.2293, 'grad_norm': 10.558318138122559, 'learning_rate': 1.4222222222222224e-05, 'epoch': 0.87}


 30%|███       | 13500/45000 [53:44<1:59:20,  4.40it/s]

{'loss': 0.2315, 'grad_norm': 9.02513313293457, 'learning_rate': 1.4e-05, 'epoch': 0.9}


 31%|███       | 14000/45000 [55:38<1:59:17,  4.33it/s]

{'loss': 0.198, 'grad_norm': 1.5388075113296509, 'learning_rate': 1.377777777777778e-05, 'epoch': 0.93}


 32%|███▏      | 14500/45000 [57:34<1:56:43,  4.35it/s]

{'loss': 0.2221, 'grad_norm': 0.11365290731191635, 'learning_rate': 1.3555555555555557e-05, 'epoch': 0.97}


 33%|███▎      | 15000/45000 [59:31<1:59:19,  4.19it/s]

{'loss': 0.2494, 'grad_norm': 4.64934778213501, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


                                                       
 33%|███▎      | 15000/45000 [1:00:39<1:59:19,  4.19it/s]

{'eval_loss': 0.20673400163650513, 'eval_runtime': 66.7841, 'eval_samples_per_second': 113.8, 'eval_steps_per_second': 14.225, 'epoch': 1.0}


 34%|███▍      | 15500/45000 [1:02:34<1:54:19,  4.30it/s]  

{'loss': 0.1596, 'grad_norm': 0.14244814217090607, 'learning_rate': 1.3111111111111113e-05, 'epoch': 1.03}


 36%|███▌      | 16000/45000 [1:04:30<1:50:41,  4.37it/s]

{'loss': 0.1782, 'grad_norm': 7.563416481018066, 'learning_rate': 1.288888888888889e-05, 'epoch': 1.07}


 37%|███▋      | 16500/45000 [1:06:24<1:44:43,  4.54it/s]

{'loss': 0.16, 'grad_norm': 0.23003074526786804, 'learning_rate': 1.2666666666666667e-05, 'epoch': 1.1}


 38%|███▊      | 17000/45000 [1:08:19<1:46:06,  4.40it/s]

{'loss': 0.1847, 'grad_norm': 5.528477191925049, 'learning_rate': 1.2444444444444446e-05, 'epoch': 1.13}


 39%|███▉      | 17500/45000 [1:10:16<1:44:38,  4.38it/s]

{'loss': 0.1516, 'grad_norm': 0.0313040092587471, 'learning_rate': 1.2222222222222224e-05, 'epoch': 1.17}


 40%|████      | 18000/45000 [1:12:15<1:44:38,  4.30it/s]

{'loss': 0.165, 'grad_norm': 8.997964859008789, 'learning_rate': 1.2e-05, 'epoch': 1.2}


 41%|████      | 18500/45000 [1:14:13<1:44:12,  4.24it/s]

{'loss': 0.1659, 'grad_norm': 1.8736704587936401, 'learning_rate': 1.177777777777778e-05, 'epoch': 1.23}


RuntimeError: [enforce fail at inline_container.cc:595] . unexpected pos 394155520 vs 394155412

In [None]:
# from transformers import EarlyStoppingCallback

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy="epoch",
#     logging_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     load_best_model_at_end=True,
#     metric_for_best_model="accuracy",
# )

# # Initialize Trainer with early stopping
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset['train'],
#     eval_dataset=tokenized_dataset['test'],
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
# )

# # Train the model
# trainer.train()


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  1%|          | 220/22500 [21:55<235:08:11, 37.99s/it]

KeyboardInterrupt: 

In [None]:
# Evaluate the model
results = trainer.evaluate()

print(f"Evaluation results: {results}")
