Transformers. Updated versions, 2, 3, and 4 with weight balancing, focal loss, and SMOTE



In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset

data_path = './data/diabetes-dataset/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(data_path)

print("Column Names:", df.columns)

X = df.drop(columns=['Diabetes_012']) 
y = df[['Diabetes_012']] 


print("Target Values Distribution:", y['Diabetes_012'].value_counts())


  from .autonotebook import tqdm as notebook_tqdm


Column Names: Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
Target Values Distribution: Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64


checking columns names

In [2]:
print(X.columns)
print(y.columns)


Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
Index(['Diabetes_012'], dtype='object')


preprocess data

In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

data_path = './data/diabetes-dataset/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(data_path)

print("Column Names:", df.columns)

X = df.drop(columns=['Diabetes_012'])
y = df[['Diabetes_012']]  

print("Target Values Distribution:", y['Diabetes_012'].value_counts())

tokenizer = DistilBertTokenizerFast.from_pretrained('./saved_model')

class DiabetesDataset(Dataset):
    def __init__(self, features_df, target_df, tokenizer, target_col):
        self.features_df = features_df
        self.labels = torch.tensor(target_df[target_col].values)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.features_df)

    def __getitem__(self, idx):
        features = self.features_df.iloc[idx]
        text = " ".join([str(val) for val in features])
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        label = self.labels[idx].item()
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

target_col = 'Diabetes_012'
diabetes_dataset = DiabetesDataset(X, y, tokenizer, target_col)

print(diabetes_dataset[0])

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained('./saved_model', num_labels=3).to(device)

classes = np.array([0, 1, 2])
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y[target_col])
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir='./results-v2',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    save_total_limit=2 
)

train_size = int(0.9 * len(X))
test_size = len(X) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(diabetes_dataset, [train_size, test_size])

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)




Column Names: Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
Target Values Distribution: Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64
{'input_ids': tensor([ 101, 1015, 1012, 1014, 1015, 1012, 1014, 1015, 1012, 1014, 2871, 1012,
        1014, 1015, 1012, 1014, 1014, 1012, 1014, 1014, 1012, 1014, 1014, 1012,
        1014, 1014, 1012, 1014, 1015, 1012, 1014, 1014, 1012, 1014, 1015, 1012,
        1014, 1014, 1012, 1014, 1019, 1012, 1014, 2324, 1012, 1014, 2321, 1012,
        1014, 1015, 1012, 1014, 1014, 1012, 1014, 1023, 1012, 1014, 1018, 1012,
        1014, 1017, 1012, 1014,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,



train model

In [12]:
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")
print(f"Using device: {device}")

trainer.train()

trainer.save_model('./saved_model-v2')
tokenizer.save_pretrained('./saved_model-v2')

Number of training samples: 228312
Number of test samples: 25368
Using device: cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                     
  1%|          | 501/42810 [01:31<2:13:05,  5.30it/s]

{'loss': 0.9977, 'grad_norm': 1.7325440645217896, 'learning_rate': 1.976640971735576e-05, 'epoch': 0.04}


                                                      
  2%|▏         | 1001/42810 [03:03<2:11:35,  5.30it/s]

{'loss': 0.9435, 'grad_norm': 3.1131138801574707, 'learning_rate': 1.9532819434711517e-05, 'epoch': 0.07}


                                                      
  4%|▎         | 1501/42810 [04:36<2:12:22,  5.20it/s]

{'loss': 0.9648, 'grad_norm': 1.6499594449996948, 'learning_rate': 1.9299229152067275e-05, 'epoch': 0.11}


                                                      
  5%|▍         | 2001/42810 [06:10<2:10:51,  5.20it/s]

{'loss': 0.9653, 'grad_norm': 7.570612907409668, 'learning_rate': 1.9065638869423033e-05, 'epoch': 0.14}


                                                      
  6%|▌         | 2501/42810 [07:44<2:07:45,  5.26it/s]

{'loss': 0.9551, 'grad_norm': 3.0307888984680176, 'learning_rate': 1.883204858677879e-05, 'epoch': 0.18}


                                                      
  7%|▋         | 3001/42810 [09:19<2:07:16,  5.21it/s]

{'loss': 0.9874, 'grad_norm': 1.7163199186325073, 'learning_rate': 1.859845830413455e-05, 'epoch': 0.21}


                                                      
  8%|▊         | 3501/42810 [10:54<2:06:32,  5.18it/s]

{'loss': 0.978, 'grad_norm': 8.728795051574707, 'learning_rate': 1.8364868021490307e-05, 'epoch': 0.25}


                                                      
  9%|▉         | 4001/42810 [12:29<2:03:53,  5.22it/s]

{'loss': 0.9437, 'grad_norm': 3.3595025539398193, 'learning_rate': 1.8131277738846065e-05, 'epoch': 0.28}


                                                      
 11%|█         | 4501/42810 [14:04<2:05:39,  5.08it/s]

{'loss': 0.9577, 'grad_norm': 2.7693777084350586, 'learning_rate': 1.7897687456201826e-05, 'epoch': 0.32}


                                                      
 12%|█▏        | 5001/42810 [15:40<2:01:39,  5.18it/s]

{'loss': 0.977, 'grad_norm': 4.756241798400879, 'learning_rate': 1.766409717355758e-05, 'epoch': 0.35}


                                                      
 13%|█▎        | 5501/42810 [17:15<2:02:19,  5.08it/s]

{'loss': 0.9577, 'grad_norm': 1.7599831819534302, 'learning_rate': 1.7430506890913338e-05, 'epoch': 0.39}


                                                      
 14%|█▍        | 6001/42810 [18:51<1:58:55,  5.16it/s]

{'loss': 0.983, 'grad_norm': 3.5723483562469482, 'learning_rate': 1.7196916608269096e-05, 'epoch': 0.42}


                                                      
 15%|█▌        | 6501/42810 [20:27<2:00:57,  5.00it/s]

{'loss': 0.9579, 'grad_norm': 5.195213317871094, 'learning_rate': 1.6963326325624857e-05, 'epoch': 0.46}


                                                      
 16%|█▋        | 7001/42810 [22:03<1:56:41,  5.11it/s]

{'loss': 0.9383, 'grad_norm': 9.231348991394043, 'learning_rate': 1.6729736042980612e-05, 'epoch': 0.49}


                                                      
 18%|█▊        | 7501/42810 [23:39<1:56:05,  5.07it/s]

{'loss': 1.0151, 'grad_norm': 3.2308695316314697, 'learning_rate': 1.649614576033637e-05, 'epoch': 0.53}


                                                      
 19%|█▊        | 8001/42810 [25:16<1:53:26,  5.11it/s]

{'loss': 0.945, 'grad_norm': 2.990084648132324, 'learning_rate': 1.626255547769213e-05, 'epoch': 0.56}


                                                      
 20%|█▉        | 8501/42810 [26:52<1:51:13,  5.14it/s]

{'loss': 0.9694, 'grad_norm': 4.1546311378479, 'learning_rate': 1.602896519504789e-05, 'epoch': 0.6}


                                                      
 21%|██        | 9001/42810 [28:28<1:49:35,  5.14it/s]

{'loss': 0.942, 'grad_norm': 1.4323540925979614, 'learning_rate': 1.5795374912403643e-05, 'epoch': 0.63}


                                                      
 22%|██▏       | 9501/42810 [30:04<1:48:04,  5.14it/s]

{'loss': 0.986, 'grad_norm': 3.265820264816284, 'learning_rate': 1.55617846297594e-05, 'epoch': 0.67}


                                                       
 23%|██▎       | 10001/42810 [31:40<1:46:34,  5.13it/s]

{'loss': 0.9817, 'grad_norm': 1.8039106130599976, 'learning_rate': 1.5328194347115163e-05, 'epoch': 0.7}


                                                       
 25%|██▍       | 10501/42810 [33:16<1:44:29,  5.15it/s]

{'loss': 0.9525, 'grad_norm': 9.566300392150879, 'learning_rate': 1.5094604064470919e-05, 'epoch': 0.74}


                                                       
 26%|██▌       | 11001/42810 [34:53<1:42:39,  5.16it/s]

{'loss': 1.0038, 'grad_norm': 7.188937664031982, 'learning_rate': 1.4861013781826677e-05, 'epoch': 0.77}


                                                       
 27%|██▋       | 11501/42810 [36:29<1:41:33,  5.14it/s]

{'loss': 0.9571, 'grad_norm': 2.256838798522949, 'learning_rate': 1.4627423499182436e-05, 'epoch': 0.81}


                                                       
 28%|██▊       | 12001/42810 [38:05<1:39:06,  5.18it/s]

{'loss': 0.9612, 'grad_norm': 3.73210072517395, 'learning_rate': 1.4393833216538194e-05, 'epoch': 0.84}


                                                       
 29%|██▉       | 12501/42810 [39:40<1:38:04,  5.15it/s]

{'loss': 0.9622, 'grad_norm': 8.999156951904297, 'learning_rate': 1.416024293389395e-05, 'epoch': 0.88}


                                                       
 30%|███       | 13001/42810 [41:16<1:36:54,  5.13it/s]

{'loss': 0.9495, 'grad_norm': 2.4548707008361816, 'learning_rate': 1.392665265124971e-05, 'epoch': 0.91}


                                                       
 32%|███▏      | 13501/42810 [42:52<1:34:26,  5.17it/s]

{'loss': 0.9836, 'grad_norm': 2.7676970958709717, 'learning_rate': 1.3693062368605468e-05, 'epoch': 0.95}


                                                       
 33%|███▎      | 14001/42810 [44:28<1:34:24,  5.09it/s]

{'loss': 0.9841, 'grad_norm': 4.530956268310547, 'learning_rate': 1.3459472085961226e-05, 'epoch': 0.98}


 33%|███▎      | 14270/42810 [45:20<1:21:43,  5.82it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.9468518495559692, 'eval_runtime': 93.8479, 'eval_samples_per_second': 270.31, 'eval_steps_per_second': 16.9, 'epoch': 1.0}


                                                         
 34%|███▍      | 14501/42810 [47:39<1:31:18,  5.17it/s]

{'loss': 0.9249, 'grad_norm': 1.474007248878479, 'learning_rate': 1.3225881803316982e-05, 'epoch': 1.02}


                                                       
 35%|███▌      | 15001/42810 [49:15<1:29:29,  5.18it/s]

{'loss': 0.9829, 'grad_norm': 3.959028720855713, 'learning_rate': 1.2992291520672741e-05, 'epoch': 1.05}


                                                       
 36%|███▌      | 15501/42810 [50:51<1:30:24,  5.03it/s]

{'loss': 0.973, 'grad_norm': 2.4034388065338135, 'learning_rate': 1.27587012380285e-05, 'epoch': 1.09}


                                                       
 37%|███▋      | 16001/42810 [52:26<1:26:19,  5.18it/s]

{'loss': 0.9051, 'grad_norm': 6.03923225402832, 'learning_rate': 1.2525110955384257e-05, 'epoch': 1.12}


                                                       
 39%|███▊      | 16501/42810 [54:02<1:26:16,  5.08it/s]

{'loss': 0.972, 'grad_norm': 1.6774896383285522, 'learning_rate': 1.2291520672740017e-05, 'epoch': 1.16}


                                                       
 40%|███▉      | 17001/42810 [55:38<1:24:01,  5.12it/s]

{'loss': 0.9906, 'grad_norm': 10.895459175109863, 'learning_rate': 1.2057930390095773e-05, 'epoch': 1.19}


                                                       
 41%|████      | 17501/42810 [57:14<1:21:31,  5.17it/s]

{'loss': 0.9837, 'grad_norm': 8.608818054199219, 'learning_rate': 1.182434010745153e-05, 'epoch': 1.23}


                                                       
 42%|████▏     | 18001/42810 [58:49<1:20:02,  5.17it/s]

{'loss': 0.9932, 'grad_norm': 1.2875953912734985, 'learning_rate': 1.1590749824807289e-05, 'epoch': 1.26}


                                                         
 43%|████▎     | 18501/42810 [1:00:25<1:18:18,  5.17it/s]

{'loss': 0.969, 'grad_norm': 6.241311073303223, 'learning_rate': 1.1357159542163048e-05, 'epoch': 1.3}


                                                         
 44%|████▍     | 19001/42810 [1:02:00<1:16:37,  5.18it/s]

{'loss': 0.9444, 'grad_norm': 8.91116714477539, 'learning_rate': 1.1123569259518804e-05, 'epoch': 1.33}


                                                         
 46%|████▌     | 19501/42810 [1:03:36<1:14:58,  5.18it/s]

{'loss': 1.0207, 'grad_norm': 1.9705525636672974, 'learning_rate': 1.0889978976874562e-05, 'epoch': 1.37}


                                                         
 47%|████▋     | 20001/42810 [1:05:11<1:13:33,  5.17it/s]

{'loss': 0.9223, 'grad_norm': 1.8674474954605103, 'learning_rate': 1.0656388694230322e-05, 'epoch': 1.4}


                                                         
 48%|████▊     | 20501/42810 [1:06:47<1:12:04,  5.16it/s]

{'loss': 0.944, 'grad_norm': 4.996191501617432, 'learning_rate': 1.042279841158608e-05, 'epoch': 1.44}


                                                         
 49%|████▉     | 21001/42810 [1:08:22<1:10:17,  5.17it/s]

{'loss': 0.983, 'grad_norm': 2.16463041305542, 'learning_rate': 1.0189208128941836e-05, 'epoch': 1.47}


                                                         
 50%|█████     | 21501/42810 [1:09:58<1:09:06,  5.14it/s]

{'loss': 0.9615, 'grad_norm': 8.11898136138916, 'learning_rate': 9.955617846297595e-06, 'epoch': 1.51}


                                                         
 51%|█████▏    | 22001/42810 [1:11:33<1:07:01,  5.17it/s]

{'loss': 0.932, 'grad_norm': 9.510753631591797, 'learning_rate': 9.722027563653352e-06, 'epoch': 1.54}


                                                         
 53%|█████▎    | 22501/42810 [1:13:09<1:06:57,  5.06it/s]

{'loss': 0.9697, 'grad_norm': 1.4239226579666138, 'learning_rate': 9.488437281009111e-06, 'epoch': 1.58}


                                                         
 54%|█████▎    | 23001/42810 [1:14:44<1:03:46,  5.18it/s]

{'loss': 0.9687, 'grad_norm': 1.254603624343872, 'learning_rate': 9.254846998364869e-06, 'epoch': 1.61}


                                                         
 55%|█████▍    | 23501/42810 [1:16:20<1:02:11,  5.17it/s]

{'loss': 0.969, 'grad_norm': 8.242960929870605, 'learning_rate': 9.021256715720627e-06, 'epoch': 1.65}


                                                         
 56%|█████▌    | 24001/42810 [1:17:55<1:00:43,  5.16it/s]

{'loss': 0.9362, 'grad_norm': 8.384076118469238, 'learning_rate': 8.787666433076385e-06, 'epoch': 1.68}


                                                         
 57%|█████▋    | 24501/42810 [1:19:30<59:10,  5.16it/s]

{'loss': 0.9478, 'grad_norm': 2.198798179626465, 'learning_rate': 8.554076150432143e-06, 'epoch': 1.72}


                                                       
 58%|█████▊    | 25001/42810 [1:21:06<57:06,  5.20it/s]

{'loss': 0.9427, 'grad_norm': 4.194502353668213, 'learning_rate': 8.3204858677879e-06, 'epoch': 1.75}


                                                       
 60%|█████▉    | 25501/42810 [1:22:41<55:43,  5.18it/s]

{'loss': 0.9933, 'grad_norm': 8.927908897399902, 'learning_rate': 8.086895585143658e-06, 'epoch': 1.79}


                                                       
 61%|██████    | 26001/42810 [1:24:16<55:42,  5.03it/s]

{'loss': 0.9903, 'grad_norm': 1.7967095375061035, 'learning_rate': 7.853305302499416e-06, 'epoch': 1.82}


                                                       
 62%|██████▏   | 26501/42810 [1:25:51<52:24,  5.19it/s]

{'loss': 0.9539, 'grad_norm': 8.417523384094238, 'learning_rate': 7.619715019855175e-06, 'epoch': 1.86}


                                                       
 63%|██████▎   | 27001/42810 [1:27:26<51:01,  5.16it/s]

{'loss': 0.9808, 'grad_norm': 1.434808373451233, 'learning_rate': 7.386124737210932e-06, 'epoch': 1.89}


                                                       
 64%|██████▍   | 27501/42810 [1:29:02<49:44,  5.13it/s]

{'loss': 0.9618, 'grad_norm': 2.8829078674316406, 'learning_rate': 7.152534454566691e-06, 'epoch': 1.93}


                                                       
 65%|██████▌   | 28001/42810 [1:30:38<47:49,  5.16it/s]

{'loss': 0.9219, 'grad_norm': 16.635196685791016, 'learning_rate': 6.918944171922448e-06, 'epoch': 1.96}


                                                       
 67%|██████▋   | 28501/42810 [1:32:14<47:06,  5.06it/s]

{'loss': 0.9664, 'grad_norm': 6.517238140106201, 'learning_rate': 6.685353889278207e-06, 'epoch': 2.0}


 67%|██████▋   | 28540/42810 [1:32:21<40:27,  5.88it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.9499430656433105, 'eval_runtime': 94.5028, 'eval_samples_per_second': 268.436, 'eval_steps_per_second': 16.783, 'epoch': 2.0}


                                                           
 68%|██████▊   | 29001/42810 [1:35:26<45:23,  5.07it/s]

{'loss': 0.9284, 'grad_norm': 1.9740209579467773, 'learning_rate': 6.451763606633965e-06, 'epoch': 2.03}


                                                       
 69%|██████▉   | 29501/42810 [1:37:02<43:15,  5.13it/s]

{'loss': 0.9322, 'grad_norm': 4.8087382316589355, 'learning_rate': 6.218173323989722e-06, 'epoch': 2.07}


                                                       
 70%|███████   | 30001/42810 [1:38:37<41:11,  5.18it/s]

{'loss': 0.9621, 'grad_norm': 2.1256163120269775, 'learning_rate': 5.984583041345481e-06, 'epoch': 2.1}


                                                       
 71%|███████   | 30501/42810 [1:40:13<40:09,  5.11it/s]

{'loss': 0.9253, 'grad_norm': 8.071327209472656, 'learning_rate': 5.750992758701238e-06, 'epoch': 2.14}


                                                       
 72%|███████▏  | 31001/42810 [1:41:48<37:56,  5.19it/s]

{'loss': 0.9837, 'grad_norm': 1.98247492313385, 'learning_rate': 5.517402476056997e-06, 'epoch': 2.17}


                                                       
 74%|███████▎  | 31501/42810 [1:43:24<37:01,  5.09it/s]

{'loss': 0.9082, 'grad_norm': 5.064408779144287, 'learning_rate': 5.283812193412754e-06, 'epoch': 2.21}


                                                       
 75%|███████▍  | 32001/42810 [1:44:59<35:01,  5.14it/s]

{'loss': 0.9402, 'grad_norm': 2.9581384658813477, 'learning_rate': 5.050221910768513e-06, 'epoch': 2.24}


                                                       
 76%|███████▌  | 32501/42810 [1:46:35<33:16,  5.16it/s]

{'loss': 1.0017, 'grad_norm': 1.8103324174880981, 'learning_rate': 4.8166316281242705e-06, 'epoch': 2.28}


                                                       
 77%|███████▋  | 33001/42810 [1:48:11<31:38,  5.17it/s]

{'loss': 0.9933, 'grad_norm': 7.494729518890381, 'learning_rate': 4.583041345480028e-06, 'epoch': 2.31}


                                                       
 78%|███████▊  | 33501/42810 [1:49:46<30:02,  5.17it/s]

{'loss': 0.9336, 'grad_norm': 1.0788880586624146, 'learning_rate': 4.349451062835787e-06, 'epoch': 2.35}


                                                       
 79%|███████▉  | 34001/42810 [1:51:22<28:40,  5.12it/s]

{'loss': 0.943, 'grad_norm': 1.4429479837417603, 'learning_rate': 4.115860780191545e-06, 'epoch': 2.38}


                                                       
 81%|████████  | 34501/42810 [1:52:58<26:55,  5.14it/s]

{'loss': 0.9856, 'grad_norm': 6.526536464691162, 'learning_rate': 3.882270497547303e-06, 'epoch': 2.42}


                                                       
 82%|████████▏ | 35001/42810 [1:54:34<25:22,  5.13it/s]

{'loss': 0.9489, 'grad_norm': 8.105226516723633, 'learning_rate': 3.6486802149030603e-06, 'epoch': 2.45}


                                                       
 83%|████████▎ | 35501/42810 [1:56:09<23:39,  5.15it/s]

{'loss': 0.9703, 'grad_norm': 1.2616782188415527, 'learning_rate': 3.415089932258818e-06, 'epoch': 2.49}


                                                       
 84%|████████▍ | 36001/42810 [1:57:45<21:58,  5.16it/s]

{'loss': 0.9404, 'grad_norm': 3.5631933212280273, 'learning_rate': 3.181499649614576e-06, 'epoch': 2.52}


                                                       
 85%|████████▌ | 36501/42810 [1:59:21<20:20,  5.17it/s]

{'loss': 0.9654, 'grad_norm': 1.3971985578536987, 'learning_rate': 2.947909366970334e-06, 'epoch': 2.56}


                                                       
 86%|████████▋ | 37001/42810 [2:00:57<18:54,  5.12it/s]

{'loss': 0.9397, 'grad_norm': 7.86318826675415, 'learning_rate': 2.7143190843260926e-06, 'epoch': 2.59}


                                                       
 88%|████████▊ | 37501/42810 [2:02:32<17:06,  5.17it/s]

{'loss': 0.9366, 'grad_norm': 1.6446701288223267, 'learning_rate': 2.4807288016818505e-06, 'epoch': 2.63}


                                                       
 89%|████████▉ | 38001/42810 [2:04:08<15:33,  5.15it/s]

{'loss': 0.9594, 'grad_norm': 1.913921594619751, 'learning_rate': 2.2471385190376084e-06, 'epoch': 2.66}


                                                       
 90%|████████▉ | 38501/42810 [2:05:43<13:53,  5.17it/s]

{'loss': 0.95, 'grad_norm': 8.254220008850098, 'learning_rate': 2.0135482363933663e-06, 'epoch': 2.7}


                                                       
 91%|█████████ | 39001/42810 [2:07:19<12:18,  5.15it/s]

{'loss': 0.9343, 'grad_norm': 2.840151309967041, 'learning_rate': 1.7799579537491241e-06, 'epoch': 2.73}


                                                       
 92%|█████████▏| 39501/42810 [2:08:55<10:41,  5.15it/s]

{'loss': 0.9354, 'grad_norm': 2.5326297283172607, 'learning_rate': 1.5463676711048822e-06, 'epoch': 2.77}


                                                       
 93%|█████████▎| 40001/42810 [2:10:31<09:04,  5.16it/s]

{'loss': 0.9978, 'grad_norm': 1.7430317401885986, 'learning_rate': 1.3127773884606401e-06, 'epoch': 2.8}


                                                       
 95%|█████████▍| 40501/42810 [2:12:07<07:26,  5.17it/s]

{'loss': 0.9548, 'grad_norm': 6.501089096069336, 'learning_rate': 1.0791871058163982e-06, 'epoch': 2.84}


                                                       
 96%|█████████▌| 41001/42810 [2:13:43<05:50,  5.16it/s]

{'loss': 0.9335, 'grad_norm': 2.8025553226470947, 'learning_rate': 8.455968231721561e-07, 'epoch': 2.87}


                                                       
 97%|█████████▋| 41501/42810 [2:15:19<04:13,  5.16it/s]

{'loss': 0.9746, 'grad_norm': 6.794262409210205, 'learning_rate': 6.120065405279141e-07, 'epoch': 2.91}


                                                       
 98%|█████████▊| 42001/42810 [2:16:54<02:36,  5.18it/s]

{'loss': 0.9798, 'grad_norm': 1.4767439365386963, 'learning_rate': 3.7841625788367204e-07, 'epoch': 2.94}


                                                       
 99%|█████████▉| 42501/42810 [2:18:29<00:59,  5.17it/s]

{'loss': 0.982, 'grad_norm': 2.9283571243286133, 'learning_rate': 1.4482597523943005e-07, 'epoch': 2.98}


100%|██████████| 42810/42810 [2:19:28<00:00,  5.95it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.9457257986068726, 'eval_runtime': 93.6909, 'eval_samples_per_second': 270.763, 'eval_steps_per_second': 16.928, 'epoch': 3.0}


                                                       
100%|██████████| 42810/42810 [2:21:04<00:00,  5.06it/s]


{'train_runtime': 8464.3658, 'train_samples_per_second': 80.92, 'train_steps_per_second': 5.058, 'train_loss': 0.9615401191595586, 'epoch': 3.0}


('./saved_model-v2\\tokenizer_config.json',
 './saved_model-v2\\special_tokens_map.json',
 './saved_model-v2\\vocab.txt',
 './saved_model-v2\\added_tokens.json',
 './saved_model-v2\\tokenizer.json')

In [None]:

trainer.save_model('./saved_model-v2')
tokenizer.save_pretrained('./saved_model-v2')

{'input_ids': tensor([ 101, 1015, 1015, 1015, 2871, 1015, 1014, 1014, 1014, 1014, 1015, 1014,
        1015, 1014, 1019, 2324, 2321, 1015, 1014, 1023, 1018, 1017,  102,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0

checking all good before training

In [5]:
# Verify Data Loaders
print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of test samples: {len(test_dataset)}")
print(f"Using device: {device}")



Number of training samples: 228312
Number of test samples: 25368
Using device: cuda


evaluate model

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

report = classification_report(labels, preds, target_names=['No Diabetes', 'Pre-Diabetic', 'Diabetic'])
matrix = confusion_matrix(labels, preds)

print(report)
print(matrix)


100%|██████████| 1586/1586 [01:30<00:00, 17.59it/s]

              precision    recall  f1-score   support

 No Diabetes       0.94      0.76      0.84     21375
Pre-Diabetic       0.00      0.00      0.00       448
    Diabetic       0.33      0.75      0.46      3545

    accuracy                           0.74     25368
   macro avg       0.42      0.50      0.43     25368
weighted avg       0.83      0.74      0.77     25368

[[16237     0  5138]
 [  203     0   245]
 [  903     0  2642]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


train dataset report

In [18]:
train_predictions = trainer.predict(train_dataset)
train_preds = train_predictions.predictions.argmax(-1)
train_labels = train_predictions.label_ids

train_report = classification_report(train_labels, train_preds, target_names=['No Diabetes', 'Pre-Diabetic', 'Diabetic'])
train_matrix = confusion_matrix(train_labels, train_preds)

print(train_report)
print(train_matrix)


100%|██████████| 14270/14270 [14:11<00:00, 16.76it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

 No Diabetes       0.87      0.97      0.92    192328
Pre-Diabetic       0.00      0.00      0.00      4159
    Diabetic       0.55      0.23      0.32     31825

    accuracy                           0.85    228312
   macro avg       0.47      0.40      0.41    228312
weighted avg       0.81      0.85      0.82    228312

[[186932      0   5396]
 [  3663      0    496]
 [ 24644      0   7181]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

report = classification_report(labels, preds, target_names=['No Diabetes', 'Pre-Diabetic', 'Diabetic'], zero_division=0)
matrix = confusion_matrix(labels, preds)

micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(labels, preds, average='micro')

matrix_percentage = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis] * 100

print("Classification Report:\n", report)
print("Confusion Matrix (Counts):\n", matrix)
print("Confusion Matrix (Percentages):\n", matrix_percentage)
print(f"Micro-Average Precision: {micro_precision:.2f}")
print(f"Micro-Average Recall: {micro_recall:.2f}")
print(f"Micro-Average F1-Score: {micro_f1:.2f}")

#  matrix as percentages
categories = ['No Diabetes', 'Pre-Diabetic', 'Diabetic']
print("\nConfusion Matrix as Percentages:")
for i, category in enumerate(categories):
    print(f"\nActual: {category}")
    for j, predicted_category in enumerate(categories):
        print(f"Predicted as {predicted_category}: {matrix_percentage[i][j]:.2f}%")


Classification Report:
               precision    recall  f1-score   support

 No Diabetes       0.94      0.76      0.84     21375
Pre-Diabetic       0.00      0.00      0.00       448
    Diabetic       0.33      0.75      0.46      3545

    accuracy                           0.74     25368
   macro avg       0.42      0.50      0.43     25368
weighted avg       0.83      0.74      0.77     25368

Confusion Matrix (Counts):
 [[16237     0  5138]
 [  203     0   245]
 [  903     0  2642]]
Confusion Matrix (Percentages):
 [[75.9625731   0.         24.0374269 ]
 [45.3125      0.         54.6875    ]
 [25.47249647  0.         74.52750353]]
Micro-Average Precision: 0.74
Micro-Average Recall: 0.74
Micro-Average F1-Score: 0.74

Confusion Matrix as Percentages:

Actual: No Diabetes
Predicted as No Diabetes: 75.96%
Predicted as Pre-Diabetic: 0.00%
Predicted as Diabetic: 24.04%

Actual: Pre-Diabetic
Predicted as No Diabetes: 45.31%
Predicted as Pre-Diabetic: 0.00%
Predicted as Diabetic: 54.6

new evaluation, version 2

In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import numpy as np

tokenizer = DistilBertTokenizerFast.from_pretrained('./saved_model-v2')
model = DistilBertForSequenceClassification.from_pretrained('./saved_model-v2').to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

data_path = './data/diabetes-dataset/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(data_path)

X = df.drop(columns=['Diabetes_012'])
y = df[['Diabetes_012']]

class DiabetesDataset(Dataset):
    def __init__(self, features_df, target_df, tokenizer, target_col):
        self.features_df = features_df
        self.labels = torch.tensor(target_df[target_col].values)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.features_df)

    def __getitem__(self, idx):
        features = self.features_df.iloc[idx]
        text = " ".join([str(val) for val in features])
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        label = self.labels[idx].item()
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

target_col = 'Diabetes_012'
diabetes_dataset = DiabetesDataset(X, y, tokenizer, target_col)

train_size = int(0.9 * len(X))
test_size = len(X) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(diabetes_dataset, [train_size, test_size])

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

model.eval()
all_labels = []
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        
        all_labels.extend(labels.cpu().numpy())
        all_preds.extend(preds.cpu().numpy())

all_labels = np.array(all_labels)
all_preds = np.array(all_preds)

# Generate classification report and confusion matrix
report = classification_report(all_labels, all_preds, target_names=['No Diabetes', 'Pre-Diabetic', 'Diabetic'], zero_division=0)
matrix = confusion_matrix(all_labels, all_preds)
micro_precision, micro_recall, micro_f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
matrix_percentage = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis] * 100

print("Classification Report:\n", report)
print("Confusion Matrix (Counts):\n", matrix)
print("Confusion Matrix (Percentages):\n", matrix_percentage)
print(f"Micro-Average Precision: {micro_precision:.2f}")
print(f"Micro-Average Recall: {micro_recall:.2f}")
print(f"Micro-Average F1-Score: {micro_f1:.2f}")

# Confusion matrix as percentages
categories = ['No Diabetes', 'Pre-Diabetic', 'Diabetic']
print("\nConfusion Matrix as Percentages:")
for i, category in enumerate(categories):
    print(f"\nActual: {category}")
    for j, predicted_category in enumerate(categories):
        print(f"Predicted as {predicted_category}: {matrix_percentage[i][j]:.2f}%")


  from .autonotebook import tqdm as notebook_tqdm
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Classification Report:
               precision    recall  f1-score   support

 No Diabetes       0.94      0.77      0.84     21423
Pre-Diabetic       0.00      0.00      0.00       451
    Diabetic       0.33      0.75      0.46      3494

    accuracy                           0.75     25368
   macro avg       0.42      0.51      0.43     25368
weighted avg       0.84      0.75      0.78     25368

Confusion Matrix (Counts):
 [[16410     0  5013]
 [  196     0   255]
 [  875     0  2619]]
Confusion Matrix (Percentages):
 [[76.59991598  0.         23.40008402]
 [43.45898004  0.         56.54101996]
 [25.04293074  0.         74.95706926]]
Micro-Average Precision: 0.75
Micro-Average Recall: 0.75
Micro-Average F1-Score: 0.75

Confusion Matrix as Percentages:

Actual: No Diabetes
Predicted as No Diabetes: 76.60%
Predicted as Pre-Diabetic: 0.00%
Predicted as Diabetic: 23.40%

Actual: Pre-Diabetic
Predicted as No Diabetes: 43.46%
Predicted as Pre-Diabetic: 0.00%
Predicted as Diabetic: 56.5

Using focal loss to solve class imbalance, and creating saved_model-v3

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import numpy as np

# Focal Loss definition
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        device = inputs.device
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        if self.alpha is not None:
            self.alpha = self.alpha.to(device)  # Ensure alpha is on the same device
            alpha_t = self.alpha.gather(0, targets)
            F_loss = alpha_t * (1 - pt) ** self.gamma * BCE_loss
        else:
            F_loss = (1 - pt) ** self.gamma * BCE_loss
        if self.reduction == 'mean':
            return F_loss.mean()
        elif self.reduction == 'sum':
            return F_loss.sum()
        else:
            return F_loss

# Adjust the alpha parameter to give higher weights to underrepresented classes
# [No Diabetes, Pre-Diabetic, Diabetic]
alpha = torch.tensor([1, 5, 3], dtype=torch.float32)  # giving higher weight to "Pre-Diabetic" and "Diabetic"
focal_loss = FocalLoss(gamma=2.0, alpha=alpha)

# Load tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('./saved_model-v2')
model = DistilBertForSequenceClassification.from_pretrained('./saved_model-v2', num_labels=3)

# Load and prepare dataset
data_path = './data/diabetes-dataset/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(data_path)

# Partition data
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:], df[['Diabetes_012']], test_size=0.2, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

class DiabetesDataset(Dataset):
    def __init__(self, features_df, target_df, tokenizer, target_col):
        self.features_df = features_df
        self.labels = torch.tensor(target_df[target_col].values, dtype=torch.long)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.features_df)

    def __getitem__(self, idx):
        features = self.features_df.iloc[idx]
        text = " ".join([str(val) for val in features])
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        label = self.labels[idx].item()
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

target_col = 'Diabetes_012'
train_dataset = DiabetesDataset(X_train, y_train, tokenizer, target_col)
val_dataset = DiabetesDataset(X_val, y_val, tokenizer, target_col)
test_dataset = DiabetesDataset(X_test, y_test, tokenizer, target_col)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results-v3',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Custom loss function for the Trainer
def custom_loss(outputs, labels):
    return focal_loss(outputs.logits, labels)

# Trainer class
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = custom_loss(outputs, labels)
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # use validation dataset for evaluation
)





In [14]:
trainer.train()

trainer.save_model('./saved_model-v3')
tokenizer.save_pretrained('./saved_model-v3')


  0%|          | 0/30444 [02:10<?, ?it/s]
                                         
  0%|          | 0/30444 [02:46<?, ?it/s]            

{'loss': 0.4303, 'grad_norm': 0.9933412671089172, 'learning_rate': 4.917882012876101e-05, 'epoch': 0.05}


                                         
  0%|          | 0/30444 [04:22<?, ?it/s]             

{'loss': 0.4299, 'grad_norm': 0.9223166704177856, 'learning_rate': 4.8357640257522006e-05, 'epoch': 0.1}


                                         
  0%|          | 0/30444 [06:00<?, ?it/s]             

{'loss': 0.4477, 'grad_norm': 1.6941907405853271, 'learning_rate': 4.753646038628302e-05, 'epoch': 0.15}


                                         
  0%|          | 0/30444 [07:39<?, ?it/s]             

{'loss': 0.4156, 'grad_norm': 0.7587496042251587, 'learning_rate': 4.6715280515044016e-05, 'epoch': 0.2}


                                         
  0%|          | 0/30444 [09:23<?, ?it/s]             

{'loss': 0.3964, 'grad_norm': 2.737266778945923, 'learning_rate': 4.589410064380502e-05, 'epoch': 0.25}


                                         
  0%|          | 0/30444 [11:08<?, ?it/s]             

{'loss': 0.4096, 'grad_norm': 1.4546306133270264, 'learning_rate': 4.507292077256603e-05, 'epoch': 0.3}


                                         
  0%|          | 0/30444 [12:56<?, ?it/s]             

{'loss': 0.4103, 'grad_norm': 0.6476216316223145, 'learning_rate': 4.4251740901327025e-05, 'epoch': 0.34}


                                         
  0%|          | 0/30444 [14:44<?, ?it/s]             

{'loss': 0.4151, 'grad_norm': 1.1395244598388672, 'learning_rate': 4.343056103008804e-05, 'epoch': 0.39}


                                         
  0%|          | 0/30444 [16:30<?, ?it/s]             

{'loss': 0.4301, 'grad_norm': 11.032447814941406, 'learning_rate': 4.2609381158849036e-05, 'epoch': 0.44}


                                         
  0%|          | 0/30444 [18:19<?, ?it/s]             

{'loss': 0.3996, 'grad_norm': 3.843038320541382, 'learning_rate': 4.178820128761004e-05, 'epoch': 0.49}


                                         
  0%|          | 0/30444 [20:07<?, ?it/s]             

{'loss': 0.4193, 'grad_norm': 0.5971404314041138, 'learning_rate': 4.0967021416371046e-05, 'epoch': 0.54}


                                         
  0%|          | 0/30444 [21:52<?, ?it/s]             

{'loss': 0.4357, 'grad_norm': 0.911104679107666, 'learning_rate': 4.0145841545132044e-05, 'epoch': 0.59}


                                         
  0%|          | 0/30444 [23:37<?, ?it/s]             

{'loss': 0.4202, 'grad_norm': 1.2117100954055786, 'learning_rate': 3.9324661673893056e-05, 'epoch': 0.64}




KeyboardInterrupt: 

Training takes excessively long this way, following is a faster way to train it.

Threfore to make it run faster we will implement the following to lower the time.

Learning rate: set as before to 2e-5.

Gradient Accumulation: Accumulate gradients over 4 steps with gradient_accumulation_steps=4.

Mixed Precision Training: Enabled mixed precision training with fp16=True.

In [15]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import numpy as np

# Focal Loss definition
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        device = inputs.device
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        if self.alpha is not None:
            self.alpha = self.alpha.to(device)  # Ensure alpha is on the same device
            alpha_t = self.alpha.gather(0, targets)
            F_loss = alpha_t * (1 - pt) ** self.gamma * BCE_loss
        else:
            F_loss = (1 - pt) ** self.gamma * BCE_loss
        if self.reduction == 'mean':
            return F_loss.mean()
        elif self.reduction == 'sum':
            return F_loss.sum()
        else:
            return F_loss

# Adjust the alpha parameter to give higher weights to underrepresented classes
# [No Diabetes, Pre-Diabetic, Diabetic]
alpha = torch.tensor([1, 5, 3], dtype=torch.float32)  # giving higher weight to "Pre-Diabetic" and "Diabetic"
focal_loss = FocalLoss(gamma=2.0, alpha=alpha)

# Load tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('./saved_model-v2')
model = DistilBertForSequenceClassification.from_pretrained('./saved_model-v2', num_labels=3)

# Load and prepare dataset
data_path = './data/diabetes-dataset/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(data_path)

# Partition data
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:], df[['Diabetes_012']], test_size=0.2, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Define your dataset class
class DiabetesDataset(Dataset):
    def __init__(self, features_df, target_df, tokenizer, target_col):
        self.features_df = features_df
        self.labels = torch.tensor(target_df[target_col].values, dtype=torch.long)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.features_df)

    def __getitem__(self, idx):
        features = self.features_df.iloc[idx]
        text = " ".join([str(val) for val in features])
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        label = self.labels[idx].item()
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

target_col = 'Diabetes_012'
train_dataset = DiabetesDataset(X_train, y_train, tokenizer, target_col)
val_dataset = DiabetesDataset(X_val, y_val, tokenizer, target_col)
test_dataset = DiabetesDataset(X_test, y_test, tokenizer, target_col)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results-v3',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    fp16=True  # Enable mixed precision training
)

# Custom loss function for the Trainer
def custom_loss(outputs, labels):
    return focal_loss(outputs.logits, labels)

# Define Trainer class
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = custom_loss(outputs, labels)
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # use validation dataset for evaluation
)

trainer.train()

trainer.save_model('./saved_model-v3')
tokenizer.save_pretrained('./saved_model-v3')


  0%|          | 0/30444 [23:54<?, ?it/s]
                                                  
  7%|▋         | 500/7611 [02:26<34:50,  3.40it/s]    

{'loss': 1.5466, 'grad_norm': 3.0694215297698975, 'learning_rate': 1.8688739981605574e-05, 'epoch': 0.2}


                                                    
 13%|█▎        | 1000/7611 [04:56<31:50,  3.46it/s]   

{'loss': 1.4579, 'grad_norm': 2.710876226425171, 'learning_rate': 1.737485218762318e-05, 'epoch': 0.39}


                                                     
 20%|█▉        | 1500/7611 [07:25<30:09,  3.38it/s]   

{'loss': 1.5181, 'grad_norm': 2.882890224456787, 'learning_rate': 1.6060964393640784e-05, 'epoch': 0.59}


                                                     
 26%|██▋       | 2000/7611 [09:53<27:50,  3.36it/s]   

{'loss': 1.5262, 'grad_norm': 2.4798054695129395, 'learning_rate': 1.474707659965839e-05, 'epoch': 0.79}


                                                   
 33%|███▎      | 2500/7611 [12:23<25:25,  3.35it/s]   

{'loss': 1.5115, 'grad_norm': 3.7705302238464355, 'learning_rate': 1.3435816581263961e-05, 'epoch': 0.99}


 33%|███▎      | 2537/7611 [12:36<26:22,  3.21it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.3715057373046875, 'eval_runtime': 64.1514, 'eval_samples_per_second': 632.707, 'eval_steps_per_second': 39.547, 'epoch': 1.0}


                                                      
 39%|███▉      | 3000/7611 [15:58<22:37,  3.40it/s]   

{'loss': 1.4934, 'grad_norm': 3.4327468872070312, 'learning_rate': 1.2121928787281568e-05, 'epoch': 1.18}


                                                   
 46%|████▌     | 3500/7611 [18:27<19:44,  3.47it/s]   

{'loss': 1.4919, 'grad_norm': 3.0277788639068604, 'learning_rate': 1.0808040993299174e-05, 'epoch': 1.38}


                                                   
 53%|█████▎    | 4000/7611 [20:58<17:21,  3.47it/s]   

{'loss': 1.4881, 'grad_norm': 2.522569179534912, 'learning_rate': 9.494153199316779e-06, 'epoch': 1.58}


                                                   
 59%|█████▉    | 4500/7611 [23:29<15:26,  3.36it/s]   

{'loss': 1.4792, 'grad_norm': 3.907831907272339, 'learning_rate': 8.18289318092235e-06, 'epoch': 1.77}


                                                   
 66%|██████▌   | 5000/7611 [25:56<12:44,  3.42it/s]   

{'loss': 1.4988, 'grad_norm': 3.266434907913208, 'learning_rate': 6.869005386939956e-06, 'epoch': 1.97}


 67%|██████▋   | 5074/7611 [26:19<12:12,  3.46it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.3667605519294739, 'eval_runtime': 63.1227, 'eval_samples_per_second': 643.018, 'eval_steps_per_second': 40.192, 'epoch': 2.0}


                                                      
 72%|███████▏  | 5500/7611 [29:26<10:12,  3.45it/s]   

{'loss': 1.522, 'grad_norm': 3.4815475940704346, 'learning_rate': 5.555117592957562e-06, 'epoch': 2.17}


                                                   
 79%|███████▉  | 6000/7611 [37:53<33:12,  1.24s/it]   

{'loss': 1.49, 'grad_norm': 4.415505886077881, 'learning_rate': 4.241229798975168e-06, 'epoch': 2.36}


                                                   
 85%|████████▌ | 6500/7611 [48:12<22:48,  1.23s/it]     

{'loss': 1.4727, 'grad_norm': 6.850468635559082, 'learning_rate': 2.9299697805807385e-06, 'epoch': 2.56}


                                                   
 92%|█████████▏| 7000/7611 [58:31<12:34,  1.23s/it]     

{'loss': 1.469, 'grad_norm': 6.059680938720703, 'learning_rate': 1.6160819865983447e-06, 'epoch': 2.76}


                                                     
 99%|█████████▊| 7500/7611 [1:08:51<02:17,  1.24s/it]   

{'loss': 1.5014, 'grad_norm': 3.7993664741516113, 'learning_rate': 3.021941926159506e-07, 'epoch': 2.96}


100%|██████████| 7611/7611 [1:11:09<00:00,  1.19s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.36899298429489136, 'eval_runtime': 222.4804, 'eval_samples_per_second': 182.439, 'eval_steps_per_second': 11.403, 'epoch': 3.0}
{'train_runtime': 4493.3901, 'train_samples_per_second': 108.396, 'train_steps_per_second': 1.694, 'train_loss': 1.4982005675539771, 'epoch': 3.0}


('./saved_model-v3\\tokenizer_config.json',
 './saved_model-v3\\special_tokens_map.json',
 './saved_model-v3\\vocab.txt',
 './saved_model-v3\\added_tokens.json',
 './saved_model-v3\\tokenizer.json')

testing new model v3

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

report = classification_report(labels, preds, target_names=['No Diabetes', 'Pre-Diabetic', 'Diabetic'])
matrix = confusion_matrix(labels, preds)

print(report)
print(matrix)


100%|██████████| 3171/3171 [39:27:30<00:00, 44.80s/it]        


              precision    recall  f1-score   support

 No Diabetes       0.92      0.85      0.88     42795
Pre-Diabetic       0.00      0.00      0.00       944
    Diabetic       0.38      0.63      0.48      6997

    accuracy                           0.80     50736
   macro avg       0.43      0.49      0.45     50736
weighted avg       0.83      0.80      0.81     50736

[[36198     0  6597]
 [  532     0   412]
 [ 2621     0  4376]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
   ---------------------------------------- 0.0/258.3 kB ? eta -:--:--
   - -------------------------------------- 10.2/258.3 kB ? eta -:--:--
   ----------------------- ---------------- 153.6/258.3 kB 1.8 MB/s eta 0:00:01
   ---------------------------------------- 258.3/258.3 kB 2.6 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4


Using SMOTE

In [18]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import numpy as np

# Focal Loss definition
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, inputs, targets):
        device = inputs.device
        BCE_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-BCE_loss)
        if self.alpha is not None:
            self.alpha = self.alpha.to(device)  # Ensure alpha is on the same device
            alpha_t = self.alpha.gather(0, targets)
            F_loss = alpha_t * (1 - pt) ** self.gamma * BCE_loss
        else:
            F_loss = (1 - pt) ** self.gamma * BCE_loss
        if self.reduction == 'mean':
            return F_loss.mean()
        elif self.reduction == 'sum':
            return F_loss.sum()
        else:
            return F_loss

# Adjust the alpha parameter to give higher weights to underrepresented classes
# [No Diabetes, Pre-Diabetic, Diabetic]
alpha = torch.tensor([1, 5, 3], dtype=torch.float32)  # giving higher weight to "Pre-Diabetic" and "Diabetic"
focal_loss = FocalLoss(gamma=2.0, alpha=alpha)

# Load tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('./saved_model-v3')
model = DistilBertForSequenceClassification.from_pretrained('./saved_model-v3', num_labels=3)

# Load and prepare dataset
data_path = './data/diabetes-dataset/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(data_path)

# Partition data
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:], df[['Diabetes_012']], test_size=0.2, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define your dataset class
class DiabetesDataset(Dataset):
    def __init__(self, features_df, target_df, tokenizer, target_col):
        self.features_df = features_df
        self.labels = torch.tensor(target_df[target_col].values, dtype=torch.long)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.features_df)

    def __getitem__(self, idx):
        features = self.features_df.iloc[idx]
        text = " ".join([str(val) for val in features])
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128)
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        label = self.labels[idx].item()
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

target_col = 'Diabetes_012'
train_dataset = DiabetesDataset(X_train_smote, y_train_smote, tokenizer, target_col)
val_dataset = DiabetesDataset(X_val, y_val, tokenizer, target_col)
test_dataset = DiabetesDataset(X_test, y_test, tokenizer, target_col)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results-v4',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    fp16=True  # Enable mixed precision training
)

# Custom loss function for the Trainer
def custom_loss(outputs, labels):
    return focal_loss(outputs.logits, labels)

# Define Trainer class
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = custom_loss(outputs, labels)
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # use validation dataset for evaluation
)

trainer.train()

trainer.save_model('./saved_model-v4')
tokenizer.save_pretrained('./saved_model-v4')



  3%|▎         | 500/19224 [02:17<1:24:58,  3.67it/s]
  3%|▎         | 500/19224 [02:17<1:24:58,  3.67it/s]   

{'loss': 2.3902, 'grad_norm': 13.41244125366211, 'learning_rate': 1.9481897627965046e-05, 'epoch': 0.08}


  5%|▌         | 1000/19224 [04:36<1:24:23,  3.60it/s]
  5%|▌         | 1000/19224 [04:36<1:24:23,  3.60it/s]   

{'loss': 2.0319, 'grad_norm': 14.018302917480469, 'learning_rate': 1.8961714523512277e-05, 'epoch': 0.16}


  8%|▊         | 1500/19224 [06:57<1:22:09,  3.60it/s]
  8%|▊         | 1500/19224 [06:57<1:22:09,  3.60it/s]   

{'loss': 2.0091, 'grad_norm': 30.5577449798584, 'learning_rate': 1.844153141905951e-05, 'epoch': 0.23}


 10%|█         | 2000/19224 [09:23<1:25:27,  3.36it/s]
 10%|█         | 2000/19224 [09:23<1:25:27,  3.36it/s]   

{'loss': 1.979, 'grad_norm': 9.305522918701172, 'learning_rate': 1.7921348314606744e-05, 'epoch': 0.31}


 13%|█▎        | 2500/19224 [11:51<1:19:37,  3.50it/s]
 13%|█▎        | 2500/19224 [11:51<1:19:37,  3.50it/s]   

{'loss': 1.954, 'grad_norm': 10.4048433303833, 'learning_rate': 1.740324594257179e-05, 'epoch': 0.39}


 16%|█▌        | 3000/19224 [14:17<1:17:42,  3.48it/s]
 16%|█▌        | 3000/19224 [14:17<1:17:42,  3.48it/s]   

{'loss': 1.9369, 'grad_norm': 18.0920352935791, 'learning_rate': 1.688306283811902e-05, 'epoch': 0.47}


 18%|█▊        | 3500/19224 [16:42<1:15:57,  3.45it/s]
 18%|█▊        | 3500/19224 [16:42<1:15:57,  3.45it/s]   

{'loss': 1.9133, 'grad_norm': 32.38367462158203, 'learning_rate': 1.6362879733666252e-05, 'epoch': 0.55}


 21%|██        | 4000/19224 [19:07<1:11:00,  3.57it/s]
 21%|██        | 4000/19224 [19:07<1:11:00,  3.57it/s]   

{'loss': 1.9066, 'grad_norm': 8.304769515991211, 'learning_rate': 1.5842696629213484e-05, 'epoch': 0.62}


 23%|██▎       | 4500/19224 [21:28<1:08:27,  3.58it/s]
 23%|██▎       | 4500/19224 [21:28<1:08:27,  3.58it/s]   

{'loss': 1.8612, 'grad_norm': 11.333222389221191, 'learning_rate': 1.5323553890969622e-05, 'epoch': 0.7}


 26%|██▌       | 5000/19224 [23:49<1:06:18,  3.58it/s]
 26%|██▌       | 5000/19224 [23:49<1:06:18,  3.58it/s]   

{'loss': 1.8543, 'grad_norm': 34.99017333984375, 'learning_rate': 1.4803370786516855e-05, 'epoch': 0.78}


 29%|██▊       | 5500/19224 [26:10<1:03:30,  3.60it/s]
 29%|██▊       | 5500/19224 [26:10<1:03:30,  3.60it/s]   

{'loss': 1.8293, 'grad_norm': 14.827713966369629, 'learning_rate': 1.4283187682064087e-05, 'epoch': 0.86}


 31%|███       | 6000/19224 [28:31<1:01:32,  3.58it/s]
 31%|███       | 6000/19224 [28:31<1:01:32,  3.58it/s]   

{'loss': 1.8135, 'grad_norm': 12.113957405090332, 'learning_rate': 1.376300457761132e-05, 'epoch': 0.94}


 33%|███▎      | 6408/19224 [30:26<1:00:04,  3.56it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.3676312267780304, 'eval_runtime': 56.2045, 'eval_samples_per_second': 722.166, 'eval_steps_per_second': 45.139, 'epoch': 1.0}


 34%|███▍      | 6500/19224 [31:47<58:40,  3.61it/s]   
 34%|███▍      | 6500/19224 [31:47<58:40,  3.61it/s]     

{'loss': 1.7741, 'grad_norm': 9.482024192810059, 'learning_rate': 1.3243861839367459e-05, 'epoch': 1.01}


 36%|███▋      | 7000/19224 [34:08<56:35,  3.60it/s]  
 36%|███▋      | 7000/19224 [34:08<56:35,  3.60it/s]     

{'loss': 1.7436, 'grad_norm': 18.12093162536621, 'learning_rate': 1.272367873491469e-05, 'epoch': 1.09}


 39%|███▉      | 7500/19224 [36:29<54:22,  3.59it/s]  
 39%|███▉      | 7500/19224 [36:29<54:22,  3.59it/s]     

{'loss': 1.7299, 'grad_norm': 24.29546356201172, 'learning_rate': 1.2203495630461924e-05, 'epoch': 1.17}


 42%|████▏     | 8000/19224 [38:51<52:03,  3.59it/s]  
 42%|████▏     | 8000/19224 [38:51<52:03,  3.59it/s]     

{'loss': 1.6808, 'grad_norm': 7.62349796295166, 'learning_rate': 1.1683312526009156e-05, 'epoch': 1.25}


 44%|████▍     | 8500/19224 [41:12<49:39,  3.60it/s]  
 44%|████▍     | 8500/19224 [41:12<49:39,  3.60it/s]     

{'loss': 1.6912, 'grad_norm': 14.364747047424316, 'learning_rate': 1.116312942155639e-05, 'epoch': 1.33}


 47%|████▋     | 9000/19224 [43:32<47:31,  3.59it/s]  
 47%|████▋     | 9000/19224 [43:32<47:31,  3.59it/s]     

{'loss': 1.675, 'grad_norm': 26.476177215576172, 'learning_rate': 1.0643986683312526e-05, 'epoch': 1.4}


 49%|████▉     | 9500/19224 [45:53<45:07,  3.59it/s]  
 49%|████▉     | 9500/19224 [45:53<45:07,  3.59it/s]     

{'loss': 1.662, 'grad_norm': 25.698244094848633, 'learning_rate': 1.012380357885976e-05, 'epoch': 1.48}


 52%|█████▏    | 10000/19224 [48:14<42:42,  3.60it/s] 
 52%|█████▏    | 10000/19224 [48:14<42:42,  3.60it/s]    

{'loss': 1.6308, 'grad_norm': 16.25322723388672, 'learning_rate': 9.603620474406991e-06, 'epoch': 1.56}


 55%|█████▍    | 10500/19224 [50:34<40:30,  3.59it/s]  
 55%|█████▍    | 10500/19224 [50:34<40:30,  3.59it/s]    

{'loss': 1.5922, 'grad_norm': 15.065494537353516, 'learning_rate': 9.083437369954225e-06, 'epoch': 1.64}


 57%|█████▋    | 11000/19224 [52:55<38:07,  3.59it/s]  
 57%|█████▋    | 11000/19224 [52:55<38:07,  3.59it/s]    

{'loss': 1.6008, 'grad_norm': 29.312358856201172, 'learning_rate': 8.564294631710362e-06, 'epoch': 1.72}


 60%|█████▉    | 11500/19224 [55:16<35:48,  3.60it/s]  
 60%|█████▉    | 11500/19224 [55:16<35:48,  3.60it/s]    

{'loss': 1.6296, 'grad_norm': 35.26564025878906, 'learning_rate': 8.044111527257596e-06, 'epoch': 1.79}


 62%|██████▏   | 12000/19224 [57:37<33:26,  3.60it/s]  
 62%|██████▏   | 12000/19224 [57:37<33:26,  3.60it/s]    

{'loss': 1.561, 'grad_norm': 24.383808135986328, 'learning_rate': 7.523928422804828e-06, 'epoch': 1.87}


 65%|██████▌   | 12500/19224 [59:59<31:23,  3.57it/s]  
 65%|██████▌   | 12500/19224 [59:59<31:23,  3.57it/s]    

{'loss': 1.5541, 'grad_norm': 19.161354064941406, 'learning_rate': 7.003745318352061e-06, 'epoch': 1.95}


 67%|██████▋   | 12816/19224 [1:01:29<31:04,  3.44it/s]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

{'eval_loss': 0.36801645159721375, 'eval_runtime': 57.4152, 'eval_samples_per_second': 706.938, 'eval_steps_per_second': 44.187, 'epoch': 2.0}


 68%|██████▊   | 13000/19224 [1:03:18<29:03,  3.57it/s]   
 68%|██████▊   | 13000/19224 [1:03:18<29:03,  3.57it/s]  

{'loss': 1.5471, 'grad_norm': 35.916587829589844, 'learning_rate': 6.484602580108199e-06, 'epoch': 2.03}


 70%|███████   | 13500/19224 [1:05:41<26:35,  3.59it/s]  
 70%|███████   | 13500/19224 [1:05:41<26:35,  3.59it/s]  

{'loss': 1.4794, 'grad_norm': 37.097137451171875, 'learning_rate': 5.964419475655431e-06, 'epoch': 2.11}


 73%|███████▎  | 14000/19224 [1:08:03<24:23,  3.57it/s]  
 73%|███████▎  | 14000/19224 [1:08:03<24:23,  3.57it/s]  

{'loss': 1.4701, 'grad_norm': 11.921234130859375, 'learning_rate': 5.444236371202664e-06, 'epoch': 2.18}


 75%|███████▌  | 14500/19224 [1:10:25<22:01,  3.57it/s]  
 75%|███████▌  | 14500/19224 [1:10:25<22:01,  3.57it/s]  

{'loss': 1.4483, 'grad_norm': 27.994264602661133, 'learning_rate': 4.9240532667498966e-06, 'epoch': 2.26}


 78%|███████▊  | 15000/19224 [1:12:47<19:36,  3.59it/s]  
 78%|███████▊  | 15000/19224 [1:12:47<19:36,  3.59it/s]  

{'loss': 1.4493, 'grad_norm': 14.700908660888672, 'learning_rate': 4.4049105285060345e-06, 'epoch': 2.34}


 81%|████████  | 15500/19224 [1:15:08<17:17,  3.59it/s]
 81%|████████  | 15500/19224 [1:15:08<17:17,  3.59it/s]  

{'loss': 1.4253, 'grad_norm': 15.759981155395508, 'learning_rate': 3.884727424053267e-06, 'epoch': 2.42}


 83%|████████▎ | 16000/19224 [1:17:29<14:58,  3.59it/s]
 83%|████████▎ | 16000/19224 [1:17:29<14:58,  3.59it/s]  

{'loss': 1.3993, 'grad_norm': 22.33134651184082, 'learning_rate': 3.3645443196004994e-06, 'epoch': 2.5}


 86%|████████▌ | 16500/19224 [1:19:50<12:40,  3.58it/s]
 86%|████████▌ | 16500/19224 [1:19:50<12:40,  3.58it/s]  

{'loss': 1.4195, 'grad_norm': 29.72951889038086, 'learning_rate': 2.844361215147732e-06, 'epoch': 2.57}


 88%|████████▊ | 17000/19224 [1:22:11<10:19,  3.59it/s]
 88%|████████▊ | 17000/19224 [1:22:11<10:19,  3.59it/s]  

{'loss': 1.4195, 'grad_norm': 17.338481903076172, 'learning_rate': 2.324178110694965e-06, 'epoch': 2.65}


 91%|█████████ | 17500/19224 [1:24:31<08:01,  3.58it/s]
 91%|█████████ | 17500/19224 [1:24:32<08:01,  3.58it/s]  

{'loss': 1.3678, 'grad_norm': 20.542327880859375, 'learning_rate': 1.805035372451103e-06, 'epoch': 2.73}


 94%|█████████▎| 18000/19224 [1:26:52<05:41,  3.58it/s]
 94%|█████████▎| 18000/19224 [1:26:53<05:41,  3.58it/s]  

{'loss': 1.3874, 'grad_norm': 31.537296295166016, 'learning_rate': 1.2848522679983354e-06, 'epoch': 2.81}


 96%|█████████▌| 18500/19224 [1:29:14<03:23,  3.55it/s]
 96%|█████████▌| 18500/19224 [1:29:14<03:23,  3.55it/s]  

{'loss': 1.379, 'grad_norm': 14.20889949798584, 'learning_rate': 7.646691635455682e-07, 'epoch': 2.89}


 99%|█████████▉| 19000/19224 [1:31:30<01:00,  3.73it/s]
 99%|█████████▉| 19000/19224 [1:31:30<01:00,  3.73it/s]  

{'loss': 1.3727, 'grad_norm': 29.635896682739258, 'learning_rate': 2.444860590928007e-07, 'epoch': 2.96}


100%|██████████| 19224/19224 [1:32:33<00:00,  3.65it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


{'eval_loss': 0.3678590953350067, 'eval_runtime': 55.7158, 'eval_samples_per_second': 728.501, 'eval_steps_per_second': 45.535, 'epoch': 3.0}
{'train_runtime': 5611.827, 'train_samples_per_second': 219.241, 'train_steps_per_second': 3.426, 'train_loss': 1.6693153921088426, 'epoch': 3.0}


('./saved_model-v4\\tokenizer_config.json',
 './saved_model-v4\\special_tokens_map.json',
 './saved_model-v4\\vocab.txt',
 './saved_model-v4\\added_tokens.json',
 './saved_model-v4\\tokenizer.json')

In [None]:

from sklearn.metrics import classification_report, confusion_matrix

predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

report = classification_report(labels, preds, target_names=['No Diabetes', 'Pre-Diabetic', 'Diabetic'])
matrix = confusion_matrix(labels, preds)

print(report)
print(matrix)


100%|██████████| 3171/3171 [01:10<00:00, 44.79it/s]

              precision    recall  f1-score   support

 No Diabetes       0.92      0.85      0.88     42795
Pre-Diabetic       0.00      0.00      0.00       944
    Diabetic       0.38      0.61      0.47      6997

    accuracy                           0.80     50736
   macro avg       0.43      0.49      0.45     50736
weighted avg       0.83      0.80      0.81     50736

[[36291     0  6504]
 [  552     0   392]
 [ 2702     0  4295]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
