# Import

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.4f' % x)

PROJECT_DIR = os.path.join(os.path.dirname('bert_modeling.ipynb'), os.pardir) + '/..'

In [2]:
# Google Colab setup
from google.colab import drive, userdata
drive.mount('/content/drive/')
PROJECT_DIR = userdata.get('PROJECT_DIR')

Mounted at /content/drive/


In [3]:
# Reading the up-sampled dataset
up_train = pd.read_csv(f'{PROJECT_DIR}/data/processed/augmented_train.csv', usecols=['Description', 'Accident Level'])
test = pd.read_csv(f'{PROJECT_DIR}/data/processed/augmented_test.csv', usecols=['Description', 'Accident Level'])
up_train.head()

Unnamed: 0,Description,Accident Level
0,By manually moving a steel cabinet for disposa...,3
1,Once the mooring of the faneles in the detonat...,1
2,When performing cleaning activity of the area ...,3
3,The technician was doing the magnetometric sur...,1
4,The operator cleaned with spatula spear throug...,1


In [4]:
# Splitting features and targets
x_train = up_train['Description'].str.strip()
y_train = up_train['Accident Level']

x_test = test['Description'].str.strip()
y_test = test['Accident Level']

In [17]:
# Label encoding to ensure the labels start from 0
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Tokenization

In [None]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(texts, labels, max_length=128):
    input_ids, attention_masks = [], []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='tf',
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_masks), tf.convert_to_tensor(labels)


In [None]:
input_ids, attention_masks, labels = encode_data(x_train, y_train)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
dataset = tf.data.Dataset.from_tensor_slices((input_ids, attention_masks, labels))
dataset = dataset.shuffle(buffer_size=1024).batch(8)


In [None]:
dataset

<BatchDataset element_spec=(TensorSpec(shape=(None, 1, 128), dtype=tf.int32, name=None), TensorSpec(shape=(None, 1, 128), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
from transformers import TFBertModel
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

input_id = tf.keras.Input(shape=(1, 128,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.Input(shape=(1, 128,), dtype=tf.int32, name='attention_masks')

bert_output = bert_model(input_id, attention_mask=attention_mask)[0]
cls_token = bert_output[:, 0, :]
output = tf.keras.layers.Dense(5, activation='softmax')(cls_token)

model = tf.keras.Model(inputs=[input_id, attention_mask], outputs=output)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer "tf_bert_model_1" (type TFBertModel).

in user code:

    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\transformers\modeling_tf_utils.py", line 1183, in run_call_with_unpacked_inputs  *
        return func(self, **unpacked_inputs)
    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 1210, in call  *
        outputs = self.bert(
    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\suhai\AppData\Local\Temp\__autograph_generated_fileqoedp6rd.py", line 37, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "C:\Users\suhai\AppData\Local\Temp\__autograph_generated_fileq07un90_.py", line 76, in tf__call
        (batch_size, seq_length) = ag__.ld(input_shape)

    ValueError: Exception encountered when calling layer "bert" "                 f"(type TFBertMainLayer).
    
    in user code:
    
        File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\transformers\modeling_tf_utils.py", line 1183, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 874, in call  *
            batch_size, seq_length = input_shape
    
        ValueError: too many values to unpack (expected 2)
    
    
    Call arguments received by layer "bert" "                 f"(type TFBertMainLayer):
      • self=tf.Tensor(shape=(None, 1, 128), dtype=int32)
      • input_ids=None
      • attention_mask=tf.Tensor(shape=(None, 1, 128), dtype=int32)
      • token_type_ids=None
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • encoder_hidden_states=None
      • encoder_attention_mask=None
      • past_key_values=None
      • use_cache=True
      • output_attentions=False
      • output_hidden_states=False
      • return_dict=True
      • training=False


Call arguments received by layer "tf_bert_model_1" (type TFBertModel):
  • self=tf.Tensor(shape=(None, 1, 128), dtype=int32)
  • input_ids=None
  • attention_mask=tf.Tensor(shape=(None, 1, 128), dtype=int32)
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss='categorical_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])

history = model.fit([input_ids, attention_masks], epochs=3, verbose=1)


Epoch 1/3


ValueError: in user code:

    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\suhai\anaconda3\envs\gpu\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model" is incompatible with the layer: expected shape=(None, 128), found shape=(None, 1, 128)


In [None]:
dataset

<BatchDataset element_spec=(TensorSpec(shape=(None, 1, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None, 1, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

# Load your dataset
# Replace this with your actual dataset
data = pd.read_csv('your_dataset.csv')  # Assuming a CSV file with 'text' and 'label' columns

# Preprocessing
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['text'].tolist(),
    data['label'].tolist(),
    test_size=0.2,
    random_state=42
)

# Evaluate the model
results = model.evaluate(val_dataset)
print("Validation Loss:", results[0])
print("Validation Accuracy:", results[1])

# Make predictions
predictions = model.predict(val_dataset)
pred_labels = tf.argmax(predictions.logits, axis=1).numpy()

# Print classification report
print(classification_report(val_labels, pred_labels, target_names=label_encoder.classes_))


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(y_train)
val_labels = label_encoder.transform(y_test)

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization
def tokenize(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='tf')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
train_encodings = tokenize(x_train.to_list())
val_encodings = tokenize(x_test.to_list())

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

In [None]:
# Tokenize the datasets
train_dataset = train_dataset.map(lambda x, y: tokenize(x, y), num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(lambda x, y: tokenize(x, y), num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Batch and shuffle the datasets
batch_size = 8
train_dataset = train_dataset.shuffle(len(x_train.to_list())).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

In [None]:
# Convert the features into the expected input format for the model
def format_dataset(features, labels):
    return {
        'input_ids': features['input_ids'],
        'attention_mask': features['attention_mask'],
        'token_type_ids': features['token_type_ids']
    }, labels

train_dataset = train_dataset.map(lambda x, y: format_dataset(x, y), num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(lambda x, y: format_dataset(x, y), num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Load BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=model.compute_loss,  # Automatically compute the correct loss for classification
              metrics=['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model
model.fit(train_dataset, validation_data=val_dataset, epochs=3)

Epoch 1/3


AttributeError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1370, in run_step  *
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1706, in train_step  *
        loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/losses.py", line 143, in __call__  *
        losses = call_fn(y_true, y_pred)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/losses.py", line 270, in call  *
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_tf_utils.py", line 1588, in compute_loss  *
        return super().compute_loss(*args, **kwargs)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/training.py", line 1207, in compute_loss  *
        y, y_pred, sample_weight, regularization_losses=self.losses
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/compile_utils.py", line 275, in __call__  *
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "/usr/local/lib/python3.10/dist-packages/tf_keras/src/engine/compile_utils.py", line 854, in match_dtype_and_rank  *
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'


# Torch

In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.9.5-cp310-cp310-win_amd64.whl.metadata (7.7 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist>=1.1.1 (from 

In [None]:
!SET TORCH_USE_CUDA_DSA=1

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

# Load your dataset
# Replace this with your actual dataset
# data = pd.read_csv('your_dataset.csv')  # Assuming a CSV file with 'text' and 'label' columns

# # Preprocessing
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     data['text'].tolist(),
#     data['label'].tolist(),
#     test_size=0.2,
#     random_state=42
# )

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization
def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

train_encodings = tokenize_function(x_train.to_list())
val_encodings = tokenize_function(x_test.to_list())

# Convert to datasets
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': y_train.to_list()
})

val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': y_test.to_list()
})

# Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(y_train.to_list())))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions.argmax(-1))}
)

# Train model
trainer.train()

# Evaluate model
results = trainer.evaluate()

# Print results
print("Validation Accuracy:", results['eval_accuracy'])

# Make predictions
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

# Print classification report
print(classification_report(y_test.to_list(), pred_labels))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Sample data
texts = x_train.to_list()
labels = y_train.to_list()  # Binary labels for simplicity, replace with your own labels

# Hyperparameters
MAX_LEN = 32
BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 2e-5

# Tokenizer and encoding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')
    return encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels)

class TextDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Encode data
train_input_ids, train_attention_mask, train_labels = encode_data(train_texts, train_labels)
val_input_ids, val_attention_mask, val_labels = encode_data(val_texts, val_labels)

# Create DataLoader
train_dataset = TextDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = TextDataset(val_input_ids, val_attention_mask, val_labels)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

def eval_model(model, data_loader):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return accuracy_score(true_labels, preds)

# Train and evaluate
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer)
    print(f"Train loss: {train_loss}")
    val_accuracy = eval_model(model, val_loader)
    print(f"Validation accuracy: {val_accuracy}")

print("Training complete!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
os.environ['TORCH_USE_CUDA_DSA'] = str(1)

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Sample data
texts = x_train.to_list()
labels = y_train.to_list()  # Binary labels for simplicity

# Hyperparameters
MAX_LEN = 32
BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 2e-5

# Tokenizer and encoding
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(texts, labels):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')
    return encodings['input_ids'], encodings['attention_mask'], torch.tensor(labels)

class TextDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Encode data
train_input_ids, train_attention_mask, train_labels = encode_data(train_texts, train_labels)
val_input_ids, val_attention_mask, val_labels = encode_data(val_texts, val_labels)

# Create DataLoader
train_dataset = TextDataset(train_input_ids, train_attention_mask, train_labels)
val_dataset = TextDataset(val_input_ids, val_attention_mask, val_labels)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Model
num_labels = len(set(labels))
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    return total_loss / len(data_loader)

def eval_model(model, data_loader):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return accuracy_score(true_labels, preds)

# Training and evaluation
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    try:
        train_loss = train_epoch(model, train_loader, optimizer)
        print(f"Train loss: {train_loss}")
    except Exception as e:
        print(f"Error during training: {e}")

    try:
        val_accuracy = eval_model(model, val_loader)
        print(f"Validation accuracy: {val_accuracy}")
    except Exception as e:
        print(f"Error during evaluation: {e}")

print("Training complete!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training:   1%|          | 4/462 [00:09<17:28,  2.29s/it]


Error during training: Target 5 is out of bounds.


Evaluating: 100%|██████████| 116/116 [00:16<00:00,  7.21it/s]


Validation accuracy: 0.20346320346320346
Epoch 2/3


Training:   1%|          | 5/462 [00:07<10:40,  1.40s/it]


Error during training: Target 5 is out of bounds.


Evaluating: 100%|██████████| 116/116 [00:14<00:00,  7.89it/s]


Validation accuracy: 0.19480519480519481
Epoch 3/3


Training:   1%|          | 3/462 [00:03<09:21,  1.22s/it]


Error during training: Target 5 is out of bounds.


Evaluating: 100%|██████████| 116/116 [00:14<00:00,  7.97it/s]

Validation accuracy: 0.2077922077922078
Training complete!





In [None]:
!python3.10 setup.py build --with-cuda --torch_use_cuda_dsa

'python3.10' is not recognized as an internal or external command,
operable program or batch file.


# TF Hub

In [5]:
# A dependency of the preprocessing for BERT inputs
!pip install -U "tensorflow-text==2.15.*"

Collecting tensorflow-text==2.15.*
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-text
Successfully installed tensorflow-text-2.15.0


In [6]:
# To define AdamW Optimizer
!pip install "tf-models-official==2.15.*"

Collecting tf-models-official==2.15.*
  Downloading tf_models_official-2.15.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting sacrebleu (from tf-models-official==2.15.*)
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from tf-models-official==2.15.*)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorflow-model-optimization>=0.4.1 (from tf-models-official==2.15.*)
  Downloading tensorflow_model_optimization-0.8.0-py2.py3-none-any.whl.metadata (904 bytes)
Collecting portalocker (from sacrebleu->tf-models-official==2.15.*)
  Downloading

In [7]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [None]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [18]:
from tensorflow.data import Dataset
from tensorflow.keras.utils import to_categorical
# up_train.rename(columns={'Accident Level': 'labels'}, inplace=True)
# test.rename(columns={'Accident Level': 'labels'}, inplace=True)
y_train_ohe = to_categorical(y_train)
y_test_ohe = to_categorical(y_test)
train_ds = Dataset.from_tensor_slices((x_train, y_train_ohe))
test_ds = Dataset.from_tensor_slices((x_test, y_test_ohe))

In [10]:
from tensorflow import string
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow_hub import KerasLayer

In [26]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

def adamw():
    epochs = 5
    steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1*num_train_steps)

    init_lr = 3e-5
    return tf.keras.optimizers.AdamW( # Use tf.keras.optimizers.AdamW directly
        learning_rate=init_lr,
        weight_decay=0.001, # Add weight decay if needed
        epsilon=1e-07
    )

def build_bert():
    text_input = Input(shape=(), dtype=string, name='Description')
    bert_preprocessing = KerasLayer(tfhub_handle_preprocess, name='preprocessing')(text_input)
    bert_encoder = KerasLayer(tfhub_handle_encoder, trainable=False, name='BERT_encoder')(bert_preprocessing)

    dropout_1 = Dropout(0.5)(bert_encoder['pooled_output'])
    classifier = Dense(5, activation='softmax', name='classifier')(dropout_1)

    model = Model(text_input, classifier)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=adamw(),
        metrics=['accuracy', Recall(), Precision()]
    )

    return model

In [27]:
bert = build_bert()
bert.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Description (InputLayer)    [(None,)]                    0         []                            
                                                                                                  
 preprocessing (KerasLayer)  {'input_word_ids': (None,    0         ['Description[0][0]']         
                             128),                                                                
                              'input_type_ids': (None,                                            
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             }                                                              

In [29]:
history = bert.fit(train_ds.batch(32), validation_data=test_ds.batch(32), epochs=5)

Epoch 1/5


ValueError: Creating variables on a non-first call to a function decorated with tf.function.

In [24]:
tf.data.experimental.cardinality(train_ds.batch(32)).numpy()

37