In [1]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from transformers import RobertaTokenizer, RobertaConfig, TFRobertaModel, CLIPTextModelWithProjection, AutoTokenizer
from sklearn.model_selection import train_test_split
import random
import os
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Limit GPU memory growth
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

base_folder = "../dataset/"
input_file = "hate_int_prof_SVO.tsv"

output_folder = "output_weights/"
output_file = "output_weights"

roberta_model = "roberta-base"
clip_model = "openai/clip-vit-large-patch14"
max_length = 256
CLIP_MAX_LENGTH = 77
TEST_SIZE = 0.2
seed = 42

use_attention = True

roberta_dropout = 0.2
lstm_units = 512
dense_units = 50
lstm_dropout = 0.1
dense_dropout = 0.2
epochs = 30
batch_size = 32

def random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

random_seed(seed)

dataframe = pd.read_table(base_folder + input_file)

# Load CLIP model and tokenizer
clip_model = CLIPTextModelWithProjection.from_pretrained(clip_model).cuda()
clip_tokenizer = AutoTokenizer.from_pretrained(clip_model)

# Define base Roberta configs
config = RobertaConfig.from_pretrained(roberta_model, output_hidden_states=False, attention_probs_dropout_prob=roberta_dropout)
transformer_model = TFRobertaModel.from_pretrained(roberta_model, config=config, ignore_mismatched_sizes=True)

for layer in transformer_model.layers[:3]:  # Freeze first 3 layers
    layer.trainable = False

# Define tokenizer
tokenizer = RobertaTokenizer.from_pretrained(roberta_model, do_lower_case=True, add_special_tokens=True, max_length=max_length, pad_to_max_length=True)

input_ids_in = tf.keras.layers.Input(shape=(max_length,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(max_length,), name='masked_token', dtype='int32')
clip_input_in = tf.keras.layers.Input(shape=(CLIP_MAX_LENGTH,), name='clip_embed', dtype='float32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]

X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(lstm_units, return_sequences=True, dropout=lstm_dropout, recurrent_dropout=lstm_dropout)
)(embedding_layer)

if use_attention:
    X = tf.keras.layers.Attention(use_scale=True)([X, X])

X = tf.keras.layers.GlobalMaxPool1D()(X)

# Process CLIP embeddings
clip_processed = tf.keras.layers.Dense(dense_units, activation='relu')(clip_input_in)

# Concatenate CLIP embeddings
X_combined = tf.keras.layers.Concatenate(axis=-1)([X, clip_processed])

X = tf.keras.layers.Dense(dense_units, activation='relu')(X_combined)
X = tf.keras.layers.Dropout(dense_dropout)(X)
X = tf.keras.layers.Dense(1, activation='linear')(X)

model = tf.keras.Model(inputs=[input_ids_in, input_masks_in, clip_input_in], outputs=X)
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])
model.summary()

def tokenize(sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings, tokenizer, sentence_length):
    input_ids, input_masks, input_segments = [], [], []
    
    for sentence, subj_emb, verb_emb, obj_emb, svo_emb in tqdm(zip(sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings)):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=sentence_length,
                                       pad_to_max_length=True, return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

        input_ids[-1].extend(svo_emb.tolist())
        input_masks[-1].extend([1] * len(svo_emb))
        input_segments[-1].extend([1] * len(svo_emb))
    
    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')

input_data = pd.read_table(base_folder + input_file)

sentences = input_data['Sentence'].tolist()
intensity_value = input_data['Intensity'].astype(int).tolist()



SVO_length = 128
subject_embeddings = (pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Subject"].tolist()], maxlen=SVO_length, padding='post'))
verb_embeddings = (pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Verb"].tolist()], maxlen=SVO_length, padding='post'))
object_embeddings = (pad_sequences([np.fromstring(embedding[1:-1], dtype=int, sep=',') for embedding in input_data["Object"].tolist()], maxlen=SVO_length, padding='post'))

svo_embeddings = subject_embeddings + verb_embeddings + object_embeddings

c = list(zip(intensity_value, sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings))
random.shuffle(c)
intensity_value, sentences, subject_embeddings, verb_embeddings, object_embeddings, svo_embeddings = zip(*c)

X_tr, X_te, y_tr, y_te, svo_tr, svo_te = train_test_split(sentences, intensity_value, svo_embeddings, test_size=TEST_SIZE, random_state=42)

sentence_length = 128
train_input_ids, train_input_masks, train_input_segment = tokenize(X_tr, subject_embeddings, verb_embeddings, object_embeddings, svo_tr, tokenizer, sentence_length)
test_input_ids, test_input_masks, test_input_segment = tokenize(X_te, subject_embeddings, verb_embeddings, object_embeddings, svo_te, tokenizer, sentence_length)

# Get CLIP embeddings for train and test sets
def get_clip_embeddings(sentences):
    clip_embeds = []
    for sentence in tqdm(sentences):
        clipinputs = clip_tokenizer(sentence, return_tensors="pt", padding=True, max_length=CLIP_MAX_LENGTH, truncation=True)
        outputs = clip_model(**clipinputs)
        clip_embeds.append(outputs.text_embeds.cpu().detach().numpy())
    return np.vstack(clip_embeds)

train_clip_embeds = get_clip_embeddings(X_tr)
test_clip_embeds = get_clip_embeddings(X_te)

y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)

# Train the model
model.fit(x=[train_input_ids, train_input_masks, train_clip_embeds], y=y_tr, epochs=epochs, validation_split=0.1, batch_size=batch_size)


  from .autonotebook import tqdm as notebook_tqdm


Num GPUs Available:  1


OSError: Incorrect path_or_model_id: 'CLIPTextModelWithProjection(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (text_projection): Linear(in_features=768, out_features=768, bias=False)
)'. Please provide either the path to a local folder or the repo_id of a model on the Hub.