In [None]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel, DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.layers import Input, Dense, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

def get_available_memory():
    """Return a memory budget in MiB used to pick the batch size.

    If a GPU is visible and its device details expose a ``memory_limit``
    entry (interpreted as bytes), that value is converted to MiB. Otherwise a
    conservative default of 8000 is returned.

    Returns:
        The memory budget in MiB (float when read from the device, otherwise
        the integer default).
    """
    default_mb = 8000

    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        return default_mb

    details = tf.config.experimental.get_device_details(gpus[0])
    memory_limit = details.get('memory_limit')
    # The documented keys of get_device_details() are 'device_name' and
    # 'compute_capability', so 'memory_limit' is usually absent. Treating a
    # missing value as 0 would force the smallest batch size on every GPU;
    # fall back to the same default used when no GPU is present instead.
    if not memory_limit:
        return default_mb
    return memory_limit / (1024 ** 2)

def get_dynamic_batch_size():
    """Map the memory budget from get_available_memory() to a batch size.

    Tiers (strictly greater-than comparisons): >16000 -> 16, >8000 -> 8,
    >4000 -> 4, anything else -> 2.
    """
    memory_mb = get_available_memory()
    # Tiers are ordered from largest to smallest; the first match wins.
    for threshold, batch_size in ((16000, 16), (8000, 8), (4000, 4)):
        if memory_mb > threshold:
            return batch_size
    return 2

# Enable on-demand GPU memory allocation for every visible GPU. A RuntimeError
# raised while configuring the devices is printed rather than aborting the run.
gpus = tf.config.list_physical_devices('GPU')
try:
    # Iterating an empty list is a no-op, so no explicit "if gpus" is needed.
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    print(err)

# XLA JIT compilation is explicitly disabled.
tf.config.optimizer.set_jit(False)

# Global training constants: batch size derived from the memory budget, and
# the fixed token length every input is padded/truncated to.
BATCH_SIZE = get_dynamic_batch_size()
MAX_LENGTH = 96

def _normalize_text(raw):
    """Lower-case and strip *raw*, then drop everything but ASCII letters and whitespace."""
    return re.sub(r"[^a-zA-Z\s]", "", str(raw).lower().strip())


# Load the dataset; the "Unnamed: 0" column is dropped when present.
data = pd.read_csv('datasetofsenti.csv')
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data['text'] = data['text'].apply(_normalize_text)

# Hold out 37.5% of the rows; the fixed seed keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["text"],
    data["label"],
    test_size=0.375,
    random_state=42,
)

# Integer-encode the labels; the encoder is fitted on the training labels only.
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
y_train = label_encoder.transform(train_labels)
y_test = label_encoder.transform(test_labels)

# Pretrained tokenizers and encoders for both branches of the hybrid model.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

def tokenize(texts):
    """Tokenize *texts* with both tokenizers and return their input ids.

    Args:
        texts: An object with a ``tolist()`` method yielding the strings to
            encode (the callers in this file pass a pandas Series).

    Returns:
        A dict with keys ``'roberta_input_ids'`` and ``'distilbert_input_ids'``,
        each holding the TensorFlow ``input_ids`` tensor, padded/truncated to
        ``MAX_LENGTH`` tokens.
    """
    sentences = texts.tolist()
    # Identical encoding options for both tokenizers.
    encode_options = {
        'max_length': MAX_LENGTH,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    # NOTE(review): only 'input_ids' is kept; any attention mask the
    # tokenizers produce is discarded here.
    roberta_ids = roberta_tokenizer(sentences, **encode_options)['input_ids']
    distilbert_ids = distilbert_tokenizer(sentences, **encode_options)['input_ids']
    return {
        'roberta_input_ids': roberta_ids,
        'distilbert_input_ids': distilbert_ids,
    }

def create_dataset(encodings, labels):
    """Build a batched tf.data pipeline from token ids and labels.

    Args:
        encodings: Dict with ``'roberta_input_ids'`` and
            ``'distilbert_input_ids'`` entries (as returned by ``tokenize``).
        labels: Labels aligned with the encodings along the first axis.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches of size
        ``BATCH_SIZE``, where ``features`` is a dict keyed by the model's
        input names ``'roberta_input'`` and ``'distilbert_input'``.
    """
    features = {
        'roberta_input': encodings['roberta_input_ids'],
        'distilbert_input': encodings['distilbert_input_ids'],
    }
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    # Cache first and prefetch last: prefetch only overlaps producer and
    # consumer when it is the final stage, and nothing upstream of the cache
    # needs a prefetch buffer. The yielded batches are unchanged.
    return dataset.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Tokenize each split, then wrap it together with its encoded labels.
train_encodings = tokenize(train_texts)
train_dataset = create_dataset(train_encodings, y_train)

test_encodings = tokenize(test_texts)
test_dataset = create_dataset(test_encodings, y_test)

def build_hybrid_model(num_classes):
    """Build a classifier that fuses RoBERTa and DistilBERT representations.

    Each encoder receives its own ``MAX_LENGTH`` token-id input. The hidden
    state at the first token position of each encoder is taken as that
    encoder's summary vector; the two vectors are fused with dot-product
    attention and fed to a softmax classification layer.

    Args:
        num_classes: Number of output classes.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``'roberta_input'`` and
        ``'distilbert_input'`` and a ``(batch, num_classes)`` softmax output.
    """
    roberta_input = Input(shape=(MAX_LENGTH,), dtype=tf.int32, name='roberta_input')
    distilbert_input = Input(shape=(MAX_LENGTH,), dtype=tf.int32, name='distilbert_input')

    # Do not hard-code training=True: a fixed flag is baked into the graph and
    # keeps dropout active during evaluate()/predict(). Leaving it unset lets
    # Keras pass the correct mode for fit() versus evaluate().
    roberta_out = roberta_model(roberta_input).last_hidden_state[:, 0, :]
    distilbert_out = distilbert_model(distilbert_input).last_hidden_state[:, 0, :]

    # Stack the two summary vectors as a length-2 sequence and self-attend.
    # Attending from one single vector to another single vector (as before)
    # takes a softmax over exactly one key, so the weight is always 1 and the
    # result is just the value vector: RoBERTa never influenced the output.
    # Both encoders must share the same hidden size for this to work (the
    # original dot-product attention had the same requirement).
    encoder_vectors = tf.concat(
        [tf.expand_dims(roberta_out, 1), tf.expand_dims(distilbert_out, 1)],
        axis=1,
    )
    attended = Attention()([encoder_vectors, encoder_vectors])
    # Average the two attended vectors into one fused representation.
    combined = tf.reduce_mean(attended, axis=1)
    outputs = Dense(num_classes, activation='softmax')(combined)

    return Model(inputs=[roberta_input, distilbert_input], outputs=outputs)

# Build the hybrid classifier with one output unit per encoded label class.
model = build_hybrid_model(len(label_encoder.classes_))
model.compile(optimizer=Adam(2e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Each "epoch" covers roughly a quarter of the training batches. Because
# steps_per_epoch is set, Keras keeps drawing from the same iterator across
# epochs; a finite dataset would be exhausted after about four epochs and
# training would be interrupted. Repeating the dataset supplies enough batches
# for all epochs, and max(1, ...) guards against a zero step count on very
# small datasets.
# NOTE(review): validation_data is the test split, so checkpoint selection and
# the final evaluation below use the same data.
history = model.fit(
    train_dataset.repeat(),
    validation_data=test_dataset,
    epochs=10,
    callbacks=[
        EarlyStopping(patience=3),
        ModelCheckpoint('best_model.keras', save_best_only=True)
    ],
    steps_per_epoch=max(1, len(train_texts) // (BATCH_SIZE * 4))
)

# Restore the best checkpoint written during training, then report accuracy.
model.load_weights('best_model.keras')
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 

Epoch 1/10