In [1]:
!export CUDA_VISIBLE_DEVICES=5

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [3]:
import tensorflow as tf
from transformers import BertTokenizerFast, TFAutoModelForTokenClassification
import pandas as pd 
import torch
from sklearn.model_selection import train_test_split
import numpy as np
import re

2024-07-12 20:30:27.656296: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 20:30:27.687223: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 20:30:27.687254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-12 20:30:27.707061: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(


In [4]:
if torch.cuda.is_available():
    print("CUDA is available. Number of GPUs:", torch.cuda.device_count())
    print("CUDA device name:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")
torch.cuda.set_device(torch.device("cuda:0"))

CUDA is available. Number of GPUs: 1
CUDA device name: NVIDIA GeForce GTX 1080 Ti


In [5]:
tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
model = TFAutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=5)

  _torch_pytree._register_pytree_node(
2024-07-12 20:30:32.180437: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10532 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080 Ti, pci bus id: 0000:85:00.0, compute capability: 6.1
All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
#optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
#loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]


In [7]:
strategy = tf.distribute.MirroredStrategy()
if torch.cuda.is_available():
        print(f"LOG: Torch allocated Memory: \
                {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB", flush=True)
        print(f"LOG: Torch cached Memory: \
                {torch.cuda.memory_reserved() / 1024 ** 2:.2f} MB", flush=True)

if tf.config.experimental.list_physical_devices('GPU'):
    zero_info = tf.config.experimental.get_memory_info('GPU:0')
    print(f"LOG: TensorFlow, 0: Current memory usage: \
            {zero_info['current'] / 1024 ** 2:.2f} MB", flush=True)
    print(f"LOG: TensorFlow, 0: Peak memory usage: \
            {zero_info['peak'] / 1024 ** 2:.2f} MB", flush=True)

# model.compile(optimizer=Adam(3e-5))  # No loss argument!

model.compile(optimizer='Ftrl', loss='sparse_categorical_crossentropy', metrics = ['Accuracy', 'Precision', 'Recall'])

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
LOG: Torch allocated Memory:                 0.00 MB
LOG: Torch cached Memory:                 0.00 MB
LOG: TensorFlow, 0: Current memory usage:             416.42 MB
LOG: TensorFlow, 0: Peak memory usage:             505.58 MB


In [8]:
#Now working with the text

In [9]:
df = pd.read_csv(f'/home/rpierson/Topic_Files/topic_0.csv')
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
max_length = 512
def tokenize_function(text):
    return tokenizer(text, padding= "max_length", truncation=True)


In [11]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [12]:
# Apply the tokenize function to the 'Combined_Text' column
#for x in range(len(train_df)):
#    train_df.loc[x, 'Combined_Text'] = tokenize_function(train_df.loc[x, 'Combined_Text'])


#train_df['Combined_Text'] = train_df['Combined_Text'].apply(tokenize_function)
#val_df['Combined_Text'] = val_df['Combined_Text'].apply(tokenize_function)

In [13]:
train_encodings = tokenizer(
    train_df['Combined_Text'].tolist(), 
    truncation=True, 
    padding=True, 
    max_length=512
)

val_encodings = tokenizer(
    val_df['Combined_Text'].tolist(), 
    truncation=True, 
    padding=True, 
    max_length=512
)


In [14]:
train_inputs = {
    'input_ids': tf.constant(train_encodings['input_ids']),
    'attention_mask': tf.constant(train_encodings['attention_mask'])
}

val_inputs = {
    'input_ids': tf.constant(val_encodings['input_ids']),
    'attention_mask': tf.constant(val_encodings['attention_mask'])
}


In [15]:
train_labels = tf.constant(train_df['Priority'].values)  # Shape (num_samples,)
val_labels = tf.constant(val_df['Priority'].values) 

In [16]:
print("Train input_ids shape:", tf.constant(train_encodings['input_ids']).shape)
print("Train attention_mask shape:", tf.constant(train_encodings['attention_mask']).shape)
print("Val input_ids shape:", tf.constant(val_encodings['input_ids']).shape)
print("Val attention_mask shape:", tf.constant(val_encodings['attention_mask']).shape)

Train input_ids shape: (824, 512)
Train attention_mask shape: (824, 512)
Val input_ids shape: (206, 512)
Val attention_mask shape: (206, 512)


In [17]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_encodings, train_labels)).shuffle(len(train_df)).batch(15)
val_dataset = tf.data.Dataset.from_tensor_slices((val_encodings, val_labels)).batch(15)


In [18]:
# Create TensorFlow datasets
#train_dataset = tf.data.Dataset.from_tensor_slices((
#    dict(train_encodings),
#    train_df['Priority']
#)).shuffle(len(train_df)).batch(15)

#val_dataset = tf.data.Dataset.from_tensor_slices((
#    dict(val_encodings),
#    val_df['Priority']
#)).batch(15)

In [19]:
for inputs, labels in train_dataset.take(1):
    print("Train batch input_ids shape:", inputs['input_ids'].shape)
    print("Train batch attention_mask shape:", inputs['attention_mask'].shape)
    print("Train batch labels shape:", labels.shape)

for inputs, labels in val_dataset.take(1):
    print("Val batch input_ids shape:", inputs['input_ids'].shape)
    print("Val batch attention_mask shape:", inputs['attention_mask'].shape)
    print("Val batch labels shape:", labels.shape)

Train batch input_ids shape: (15, 512)
Train batch attention_mask shape: (15, 512)
Train batch labels shape: (15,)
Val batch input_ids shape: (15, 512)
Val batch attention_mask shape: (15, 512)
Val batch labels shape: (15,)


2024-07-12 20:30:36.532106: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-12 20:30:36.542468: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [20]:
for inputs, labels in train_dataset.take(1):
    outputs = model(inputs)
    print("Model output shape:", outputs.logits.shape)
    print("Expected labels shape:", labels.shape)

Model output shape: (15, 512, 5)
Expected labels shape: (15,)


2024-07-12 20:30:37.528879: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [21]:
!export TF_GPU_ALLOCATOR=cuda_malloc_async

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
model = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=15  # Adjust the number of epochs based on your needs
)

Epoch 1/15
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


ValueError: in user code:

    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1370, in run_step  *
        outputs = model.train_step(data)
    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 1643, in train_step  *
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/tf_keras/src/engine/compile_utils.py", line 620, in update_state  *
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/tf_keras/src/metrics/base_metric.py", line 153, in decorated  *
        result = update_state_fn(*args, **kwargs)
    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/tf_keras/src/metrics/base_metric.py", line 140, in update_state_fn  *
        return ag_update_state(*args, **kwargs)
    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/tf_keras/src/metrics/confusion_metrics.py", line 481, in update_state  *
        sample_weight=sample_weight,
    File "/home/rpierson/anaconda3/envs/secondenvi/lib/python3.11/site-packages/tf_keras/src/utils/metrics_utils.py", line 672, in update_confusion_matrix_variables  *
        y_pred.shape.assert_is_compatible_with(y_true.shape)

    ValueError: Shapes (None, 512, 5) and (None,) are incompatible
