In [2]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

In [3]:
# Hugging Face dataset identifier used for both splits below.
case = 'ag_news'

# The 'train' split is loaded as train_and_valid because it is divided into
# train/validation subsets further down; 'test' is kept aside for final evaluation.
train_and_valid = load_dataset(case, split='train')
test = load_dataset(case, split='test')

# Sanity-check the sizes of what was loaded.
print(f"Train + validation dataset shape: {train_and_valid.shape}")
print(f"Test dataset shape: {test.shape}")

Using custom data configuration default
Reusing dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)
Using custom data configuration default
Reusing dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


Train + validation dataset shape: (120000, 2)
Test dataset shape: (7600, 2)


## Convert to pandas

In [4]:
def _to_text_labels_frame(dataset):
    """Build a two-column DataFrame ('text', 'labels') from ``dataset``.

    Only the 'text' and 'label' columns are kept, and 'label' is renamed to
    'labels' per simpletransformers format.
    """
    frame = pd.DataFrame.from_dict(dataset)[['text', 'label']]
    return frame.rename(columns={'label': 'labels'})


train_and_valid_df = _to_text_labels_frame(train_and_valid)
test_df = _to_text_labels_frame(test)

In [6]:
# Hold out 10% of the labelled data for validation. The split is stratified on
# the 'labels' column and uses a fixed random_state so it is repeatable.
split_options = dict(
    test_size=0.1,
    stratify=train_and_valid_df['labels'],
    random_state=345246,
)
train_df, valid_df = train_test_split(train_and_valid_df, **split_options)

# Report the size of every frame used from here on.
for split_name, frame in [
    ("Train + valid", train_and_valid_df),
    ("Train", train_df),
    ("Valid", valid_df),
    ("Test", test_df),
]:
    print(f"{split_name} dataset shape: {frame.shape}")

Train + valid dataset shape: (120000, 2)
Train dataset shape: (108000, 2)
Valid dataset shape: (12000, 2)
Test dataset shape: (7600, 2)


In [7]:
# Peek at the first five test rows to confirm the text/labels layout.
test_df.head(n=5)

Unnamed: 0,text,labels
0,Fears for T N pension after talks Unions repre...,2
1,The Race is On: Second Private Team Sets Launc...,3
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3


In [8]:
from simpletransformers.classification import ClassificationModel

# Use the GPU when one is visible and fall back to CPU otherwise, instead of
# hard-coding use_cuda=True (which fails on a machine without CUDA).
use_cuda = torch.cuda.is_available()

# Training configuration passed to simpletransformers.
args = {
    # Mixed precision: 16-bit floats instead of the default 32-bit, trading a
    # little numeric precision for speed and memory. Only enabled on GPU.
    "fp16": use_cuda,
    "fp16_opt_level": "O1",
    # Start small and push up to 512 if possible. If memory becomes a problem,
    # reduce the number of samples instead of truncating (e.g. BERT at 256).
    "max_seq_length": 512,
    # Try increasing, but not too much (interacts with padding and sequence length).
    "train_batch_size": 16,
    # Accumulate gradients over this many batches before each optimizer step.
    # Raising it to 2-4 enlarges the effective batch size without a bigger
    # per-step batch, which may help test accuracy.
    "gradient_accumulation_steps": 1,
    "eval_batch_size": 16,
    "num_train_epochs": 1,
    "weight_decay": 0.01,  # L2 regularization
    "learning_rate": 2e-5,  # keep the learning rate low, similar to base BERT best practices
    "adam_epsilon": 1e-8,
    # NOTE(review): with warmup_steps left at 0 the warmup length is expected to
    # be derived from warmup_ratio -- confirm against the installed version.
    "warmup_ratio": 0.1,
    "warmup_steps": 0,
    "max_grad_norm": 1.0,
    "do_lower_case": False,  # the checkpoint below is a cased model
    "sliding_window": False,
    "evaluate_during_training": True,
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "save_model_every_epoch": True,
    "save_optimizer_and_scheduler": True,
}

# Create a ClassificationModel.
# To load a previously saved model instead of a default model, change the
# model name to the path of a directory which contains a saved model.
model = ClassificationModel(
    "distilbert",
    "distilbert-base-cased",
    num_labels=4,
    args=args,
    use_cuda=use_cuda,
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

In [9]:
# Fine-tune on the training split; valid_df is supplied for the periodic
# evaluation enabled by 'evaluate_during_training' in args.
training_summary = model.train_model(
    train_df=train_df,
    eval_df=valid_df,
    show_running_loss=True,
)
# Display what train_model returned as the cell output.
training_summary

HBox(children=(FloatProgress(value=0.0, max=108000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=6750.0, style=ProgressStyle(de…







(6750,
 {'eval_loss': [0.25048831548541783,
   0.20446868675698837,
   0.19322797400504352,
   0.18813817842894545],
  'global_step': [2000, 4000, 6000, 6750],
  'mcc': [0.8891867885790828,
   0.9121390415740205,
   0.9186021077474907,
   0.9208254658351251],
  'train_loss': [0.516880214214325,
   0.02032960206270218,
   0.5880517363548279,
   0.24262532591819763]})

In [10]:
# Try reloading the saved model from the 'outputs/' directory
# (presumably where the training run above wrote it -- verify the output_dir).
# Use the GPU when one is visible and fall back to CPU otherwise, so the
# reload also works on a machine without CUDA.
model = ClassificationModel(
    'distilbert',
    'outputs/',
    use_cuda=torch.cuda.is_available(),
    args=args,
)

In [11]:
# Smoke-test the reloaded model on a single hand-written headline.
sample_texts = ["Chemicals company showcases new scientific breakthroughs."]
predictions, raw_outputs = model.predict(sample_texts)

# Show the predicted class ids next to the raw per-class model outputs.
print(predictions, raw_outputs)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


[3] [[ 0.12524414 -3.85351562  1.62207031  1.72753906]]


In [12]:
# Evaluate on the held-out test split. The third return value is bound to a
# descriptive name instead of `_`: in an IPython kernel `_` holds the last
# cell output, and assigning to it stops IPython from updating it.
result, model_outputs, wrong_predictions = model.eval_model(test_df)
print(result, model_outputs)

HBox(children=(FloatProgress(value=0.0, max=7600.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=475.0, style=ProgressStyle(descr…


{'mcc': 0.9167094559044499, 'eval_loss': 0.20068686818017772} [[-0.18005371 -3.09765625  4.6328125  -1.66015625]
 [-1.02929688 -3.02734375 -1.9375      4.70703125]
 [-0.91601562 -2.31445312 -1.99609375  4.16796875]
 ...
 [-1.60644531  5.57421875 -1.75097656 -2.61523438]
 [ 0.85400391 -2.98242188  3.47851562 -1.60351562]
 [-2.75976562 -4.0234375   2.25390625  2.89257812]]


In [14]:
# Persist the fine-tuned model object next to the notebook.
model_save_name = 'distilbert_ag-news_120k.pt'
path = "./" + model_save_name

# NOTE(review): torch.save on a whole object pickles it, so loading the file
# later needs the same class definitions importable. Consider saving only the
# underlying network's state_dict if portability matters.
torch.save(model, path)