In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
df_train = pd.read_csv('data/drugsComTest_train.csv')
df_test = pd.read_csv('data/drugsComTest_test.csv')

In [3]:
#can experiment with more processing
import re
df_train['review'] = df_train['review'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x.lower()))
df_test['review'] = df_test['review'].apply(lambda x: re.sub(r'[^A-Za-z0-9 ]+', '', x.lower()))

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_y = le.fit_transform(df_train['condition'].values)
test_y = le.transform(df_test['condition'].values)

In [5]:
df_train['text'] = df_train['review'] 
df_train['label'] = train_y
df_train = df_train[['text', 'label']]
df_test['text'] = df_test['review'] 
df_test['label'] = test_y
df_test = df_test[['text', 'label']]

In [6]:
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [8]:
train_dataset[0]

{'text': 'i039ve been taking it for a few years so far it039s done what it039s supposed to only side effect i039ve experienced is an increase in yeast infections about one a year since i started taking it',
 'label': 3}

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [11]:
ds_train = train_dataset.map(preprocess_function, batched = True)
ds_test = test_dataset.map(preprocess_function, batched = True)

100%|██████████| 16/16 [00:01<00:00,  8.79ba/s]
100%|██████████| 4/4 [00:00<00:00,  9.48ba/s]


In [12]:
n_output = 6

In [13]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=n_output)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [14]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [15]:
from transformers import Trainer, TrainingArguments
batch_size = 32
training_args = TrainingArguments(
    output_dir='./ourput',
    num_train_epochs=3,
    learning_rate=2e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False
)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    tokenizer=tokenizer
)

2022-11-13 17:02:13.584249: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-13 17:02:13.804334: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-13 17:02:14.803218: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-11-13 17:02:14.803367: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 15982
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1500
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5339,1.530867,0.477597,0.308743
2,1.515,1.518974,0.477597,0.308743
3,1.5116,1.517979,0.477597,0.308743


Saving model checkpoint to ./ourput/checkpoint-500
Configuration saved in ./ourput/checkpoint-500/config.json
Model weights saved in ./ourput/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./ourput/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./ourput/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3995
  Batch size = 32
Saving model checkpoint to ./ourput/checkpoint-1000
Configuration saved in ./ourput/checkpoint-1000/config.json
Model weights saved in ./ourput/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./ourput/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./ourput/checkpoint-1000/special_tokens_map.j

TrainOutput(global_step=1500, training_loss=1.5201751302083333, metrics={'train_runtime': 867.6814, 'train_samples_per_second': 55.258, 'train_steps_per_second': 1.729, 'total_flos': 2327035012984584.0, 'train_loss': 1.5201751302083333, 'epoch': 3.0})

In [17]:
preds_output = trainer.predict(ds_test)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3995
  Batch size = 32


In [18]:
preds_output.metrics

{'test_loss': 1.517978549003601,
 'test_accuracy': 0.47759699624530666,
 'test_f1': 0.3087430353501762,
 'test_runtime': 22.0143,
 'test_samples_per_second': 181.473,
 'test_steps_per_second': 5.678}