In [None]:
import tensorflow as tf
import torch
from transformers import (
    BertTokenizer,
    TFAutoModelForSequenceClassification,
    TFAutoModelForTokenClassification,
)
#import nltk
#from nltk.tokenize import word_tokenize
#nltk.download('punkt

import pandas as pd
import keras
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# One checkpoint id shared by the tokenizer and the model, so the vocabulary
# always matches the weights being fine-tuned.
MODEL_CHECKPOINT = "google-bert/bert-base-uncased"
# Size of the classification head.
# NOTE(review): assumed to equal the number of distinct 'Priority' values — confirm.
NUM_LABELS = 5

tokenizer = BertTokenizer.from_pretrained(MODEL_CHECKPOINT)
# Every document carries a single label, so a sequence-classification head
# (one logits vector per text) is required. A token-classification head emits
# one logits vector per token, which cannot be matched against per-document labels.
model = TFAutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=NUM_LABELS
)

In [None]:
# Encoder used by a later cell to turn the raw 'Priority' values into integer ids.
label_encoder = LabelEncoder()

In [None]:
# Load the topic export and replace the raw 'Priority' values with integer ids.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
df = pd.read_csv('/home/rpierson/Topic_Files/topic_0.csv')
df['Priority'] = label_encoder.fit_transform(df['Priority'])

# Replace missing text with a single space so later string processing never
# receives NaN (vectorized equivalent of the former row-by-row loop).
df['Combined_Text'] = df['Combined_Text'].fillna(" ")

# Sanity check: rows still missing text after the fill (expected: 0).
# The previous `count += count` could never leave 0, so it verified nothing.
count = int(df['Combined_Text'].isna().sum())

count

In [None]:
# Show the last rows of the loaded frame.
df.tail()

In [None]:
# 80/20 split with a fixed seed. Later cells add and overwrite columns on both
# splits, so take explicit copies: that keeps those assignments unambiguous
# and avoids pandas' SettingWithCopyWarning on frames derived from `df`.
train_df, val_df = (
    part.copy()
    for part in train_test_split(df, test_size=0.2, random_state=42)
)

In [None]:
import re
def tokenize_function(example):
    """Tokenize the ``"Combined_Text"`` entry of ``example``.

    The notebook-level ``tokenizer`` is called with ``padding="max_length"``
    and ``truncation=True``; its result is returned unchanged.
    """
    text = example["Combined_Text"]
    encoding = tokenizer(text, padding="max_length", truncation=True)
    return encoding
def remove_links(text):
    """Return ``text`` with every ``http`` run (up to the next whitespace) removed."""
    link_pattern = re.compile(r'http\S+')
    return link_pattern.sub('', text)

In [None]:
# Strip URLs from the text column of both splits.
for split_frame in (train_df, val_df):
    split_frame['Combined_Text'] = split_frame['Combined_Text'].apply(remove_links)
                                                        
                                                        

In [None]:

# A DataFrame column cannot hold a TensorFlow tensor: pandas coerces whatever is
# assigned back into a NumPy-backed column (string tensors typically come back
# as `bytes`, which breaks the tokenizer later). Enforce the intended dtypes in
# pandas instead; conversion to tensors belongs where the tf.data pipeline is built.
for split_frame in (train_df, val_df):
    split_frame['Combined_Text'] = split_frame['Combined_Text'].astype(str)
    split_frame['Priority'] = split_frame['Priority'].astype('int32')

In [None]:
# Display the training split.
train_df

In [None]:
# Display the validation split.
val_df

In [None]:
#to tensorflow dataset


def df_to_dataset(df, label_column, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        df: Source frame. It is copied, so the caller's frame is not modified.
        label_column: Name of the column used as the label.
        shuffle: When true, shuffle with a buffer covering the whole frame.
        batch_size: Number of rows per batch.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features``
        is a dict keyed by column name. ``'Combined_Text'`` becomes a string
        tensor; every other column is converted to ``int32``.

    Raises:
        TypeError / ValueError: from TensorFlow when a non-text column cannot
        be represented as ``int32``.
    """
    df = df.copy()
    labels = df.pop(label_column)

    def _as_text(value):
        # Join only real token sequences. Calling ' '.join on a plain string
        # would insert a space between every character and corrupt the text.
        if isinstance(value, (list, tuple)):
            return ' '.join(str(token) for token in value)
        if isinstance(value, bytes):
            return value.decode('utf-8', errors='replace')
        return str(value)

    # Convert column by column so each feature is ONE tensor with a leading row
    # axis that from_tensor_slices can slice. Tensors stored cell-by-cell inside
    # a DataFrame cannot be sliced that way.
    features = {}
    for col in df.columns:
        if col == 'Combined_Text':
            features[col] = tf.constant(df[col].map(_as_text).tolist(), dtype=tf.string)
        else:
            features[col] = tf.constant(df[col].tolist(), dtype=tf.int32)

    ds = tf.data.Dataset.from_tensor_slices((features, labels.to_numpy()))

    if shuffle:
        # The shuffle buffer must be positive, even for an empty frame.
        ds = ds.shuffle(buffer_size=max(len(df), 1))
    ds = ds.batch(batch_size)

    return ds


In [None]:
# Build the training pipeline and print one batch as a smoke test, then build
# the validation pipeline without shuffling.
train_dataset = df_to_dataset(train_df, label_column='Priority')
first_batches = train_dataset.take(1)
for batch in first_batches:
    print(batch)
val_dataset = df_to_dataset(val_df, label_column='Priority', shuffle=False)

In [None]:
def _encode_text(text):
    """Run ``tokenize_function`` on a single raw text value."""
    return tokenize_function({"Combined_Text": text})


# One tokenizer result per row, kept aligned with each split's index.
train_tokenized = train_df["Combined_Text"].apply(_encode_text)
val_tokenized = val_df["Combined_Text"].apply(_encode_text)

# Copy each tokenizer output field into a column of the same name.
for split_frame, split_tokens in ((train_df, train_tokenized), (val_df, val_tokenized)):
    for field in ("input_ids", "attention_mask", "token_type_ids"):
        # `field=field` binds the current name so each lambda reads its own key.
        split_frame[field] = split_tokens.apply(lambda encoding, field=field: encoding[field])

In [None]:
def one_hot_encode_label(features, label, num_classes=5):
    """Replace an integer class label with its one-hot vector.

    Written for ``tf.data.Dataset.map``, which passes ``(features, label)``.

    Args:
        features: Feature structure, returned unchanged.
        label: Integer class id(s) to encode.
        num_classes: Length of the one-hot vector. Defaults to 5, the value
            previously hard-coded here; pass the real class count if it differs.

    Returns:
        ``(features, one_hot_label)``.
    """
    one_hot_label = tf.one_hot(label, depth=num_classes)
    return features, one_hot_label

In [None]:
# The (features, label) mapping applies to the tf.data pipelines, not to the
# DataFrames: DataFrame.map (where it exists) calls the function once per cell
# with a single argument.
# NOTE(review): one-hot labels need a non-sparse categorical loss, while this
# notebook configures a sparse one — confirm which label format is intended.
# Re-running this cell encodes the labels a second time; rebuild the datasets first.
train_dataset = train_dataset.map(one_hot_encode_label)
val_dataset = val_dataset.map(one_hot_encode_label)

In [None]:
# DataFrames have no shuffle()/batch(); these are tf.data.Dataset operations.
# df_to_dataset already returns batched datasets, so flatten them back to
# single examples before re-shuffling and batching at 16.
train_dataset = train_dataset.unbatch().shuffle(len(train_df)).batch(16)
val_dataset = val_dataset.unbatch().batch(16)

In [None]:
# Training configuration objects.
# FTRL optimizer with a learning rate of 0.01.
optimizer = tf.keras.optimizers.Ftrl(learning_rate=0.01)
# Integer (non one-hot) labels; from_logits=True means the loss expects raw,
# un-normalized model outputs.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Accuracy metric matching the integer-label format used by the loss.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [None]:
# Report CUDA visibility from PyTorch and select GPU 0 when one is present.
if torch.cuda.is_available():
    print("CUDA is available. Number of GPUs:", torch.cuda.device_count())
    print("CUDA device name:", torch.cuda.get_device_name(0))
    # Select the device only when CUDA exists; calling set_device
    # unconditionally raises on CPU-only machines.
    torch.cuda.set_device(torch.device("cuda:0"))
else:
    print("CUDA is not available.")

In [None]:
# Log GPU memory as seen by PyTorch and by TensorFlow.
# The messages are built on one logical line: a backslash continuation inside
# an f-string also embeds the next line's indentation in the printed text.
if torch.cuda.is_available():
    torch_allocated_mb = torch.cuda.memory_allocated() / 1024 ** 2
    torch_reserved_mb = torch.cuda.memory_reserved() / 1024 ** 2
    print(f"LOG: Torch allocated Memory: {torch_allocated_mb:.2f} MB", flush=True)
    print(f"LOG: Torch cached Memory: {torch_reserved_mb:.2f} MB", flush=True)

if tf.config.experimental.list_physical_devices('GPU'):
    zero_info = tf.config.experimental.get_memory_info('GPU:0')
    tf_current_mb = zero_info['current'] / 1024 ** 2
    tf_peak_mb = zero_info['peak'] / 1024 ** 2
    print(f"LOG: TensorFlow, 0: Current memory usage: {tf_current_mb:.2f} MB", flush=True)
    print(f"LOG: TensorFlow, 0: Peak memory usage: {tf_peak_mb:.2f} MB", flush=True)

In [None]:
# TensorFlow GPU configuration
#physical_devices = tf.config.experimental.list_physical_devices('GPU')
#if physical_devices:
#    try:
#        for device in physical_devices:
#            tf.config.experimental.set_memory_growth(device, True)
#        print(f"LOG: TensorFlow GPU devices: {physical_devices}.", flush=True)
#    except RuntimeError as exception:
#        print(f"LOG: TensorFlow GPU devices: {physical_devices}.", flush=True)
#        print(f"LOG: TensorFlow GPU configuration error: {exception}", flush=True)
#else:
#    print("ERROR: No TensorFlow GPU devices found.", flush=True)

In [None]:
# Create a MirroredStrategy for synchronous training across the visible GPUs.
# NOTE(review): no `strategy.scope()` block is visible in this notebook, so the
# model built earlier is presumably not placed under this strategy — confirm.
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Log GPU memory again, after the distribution strategy has been created.
# The messages are built on one logical line: a backslash continuation inside
# an f-string also embeds the next line's indentation in the printed text.
if torch.cuda.is_available():
    torch_allocated_mb = torch.cuda.memory_allocated() / 1024 ** 2
    torch_reserved_mb = torch.cuda.memory_reserved() / 1024 ** 2
    print(f"LOG: Torch allocated Memory: {torch_allocated_mb:.2f} MB", flush=True)
    print(f"LOG: Torch cached Memory: {torch_reserved_mb:.2f} MB", flush=True)

if tf.config.experimental.list_physical_devices('GPU'):
    zero_info = tf.config.experimental.get_memory_info('GPU:0')
    tf_current_mb = zero_info['current'] / 1024 ** 2
    tf_peak_mb = zero_info['peak'] / 1024 ** 2
    print(f"LOG: TensorFlow, 0: Current memory usage: {tf_current_mb:.2f} MB", flush=True)
    print(f"LOG: TensorFlow, 0: Peak memory usage: {tf_peak_mb:.2f} MB", flush=True)

from tensorflow.keras.optimizers import Adam
# model.compile(optimizer=Adam(3e-5))  # No loss argument!

# Compile with the objects configured earlier in the notebook instead of string
# shortcuts:
#   * the model outputs raw logits, so the loss must use from_logits=True
#     (the 'sparse_categorical_crossentropy' string defaults to probabilities);
#   * 'Accuracy' / 'Precision' / 'Recall' compare predictions to labels
#     directly and do not fit integer labels with multi-class logits, whereas
#     SparseCategoricalAccuracy does;
#   * `optimizer` carries the explicitly chosen learning rate.
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)


In [None]:
#model.add(layers.Dense(5, activation='softmax'))

In [None]:
# `train_encodings` / `val_encodings` were never defined. Build them from the
# tokenizer output columns that the tokenization cell stored on each split.
ENCODING_COLUMNS = ("input_ids", "attention_mask", "token_type_ids")
train_encodings = {name: train_df[name].tolist() for name in ENCODING_COLUMNS}
val_encodings = {name: val_df[name].tolist() for name in ENCODING_COLUMNS}

# Features are a dict keyed by model input name; labels are the integer
# 'Priority' ids. Only the training data is shuffled.
train_dataset = tf.data.Dataset.from_tensor_slices((
    train_encodings,
    train_df['Priority'].to_numpy()
)).shuffle(len(train_df)).batch(16)

val_dataset = tf.data.Dataset.from_tensor_slices((
    val_encodings,
    val_df['Priority'].to_numpy()
)).batch(16)

In [None]:
# Train on the batched tf.data pipelines. The DataFrames still contain raw
# text and list-valued columns, so they cannot be fed to the model directly.
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=15
)