**ViLBERT Classifier for the Complete Balanced Dataset**

In [None]:
import gc
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the balanced train / validation / test data frames.
df_train = pd.read_csv('/kaggle/input/dataframes-bal/train_bal.csv')
df_val = pd.read_csv('/kaggle/input/dataframes-bal/val_bal.csv')
# NOTE(review): this previously re-read val_bal.csv, so the "test" labels were
# a copy of the validation labels and did not belong to the test embeddings.
# The file name follows the train_/val_ pattern -- confirm it exists in the
# Kaggle dataset.
df_test = pd.read_csv('/kaggle/input/dataframes-bal/test_bal.csv')

# Release embeddings left over from an earlier step of the same kernel session.
# (An earlier attempt to unpickle embeddings at this point was dropped: it was
# not feasible due to data size, > 18 GB.)
#
# `joint_embeds_test` is not created anywhere in this notebook, so the cleanup
# is skipped when the name does not exist instead of failing with NameError.
if 'joint_embeds_test' in globals():
    if joint_embeds_test:
        print(joint_embeds_test[0].device)
    # Tensor.to('cpu') returns a new tensor and leaves the original where it
    # is, so looping over the list and discarding the result freed nothing.
    # Dropping the references and emptying the allocator cache is what gives
    # the GPU memory back.
    del joint_embeds_test
    torch.cuda.empty_cache()

# Read the pickled joint embeddings for each split from the working directory.
def _read_pickle(path):
    """Return the object unpickled from the file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


j_embeds_train = _read_pickle('/kaggle/working/j_embeds_train.pkl')
j_embeds_val = _read_pickle('/kaggle/working/j_embeds_val.pkl')
j_embeds_test = _read_pickle('/kaggle/working/j_embeds_test.pkl')

In [None]:
def _as_numpy_batch(embeds):
    """Stack a list of tensors along a new first axis and return a NumPy array."""
    stacked = torch.stack(embeds, dim=0)
    return stacked.cpu().numpy()


# Probably too large for the GPU...
X_train = _as_numpy_batch(j_embeds_train)
X_val = _as_numpy_batch(j_embeds_val)
X_test = _as_numpy_batch(j_embeds_test)
print(X_train.shape)
print(X_val.shape, X_test.shape)

In [None]:
# TODO: `del` large intermediates that are no longer needed before collecting.
# Collect first so that tensors which just became unreachable are destroyed;
# only then can empty_cache() return the blocks they occupied to the GPU.
gc.collect()
torch.cuda.empty_cache()

In [None]:
def stack_tensors_on_cpu(tensor_list, chunk_size=100):
    """Stack a list of same-shaped tensors into one CPU tensor, chunk by chunk.

    Every tensor is copied to host memory *before* it is stacked, so no
    intermediate result is allocated on the GPU regardless of where the
    inputs live.

    Args:
        tensor_list: Sequence of tensors that all share one shape.
        chunk_size: Number of tensors stacked per step; lower it when host
            memory is tight. Must be positive.

    Returns:
        A CPU tensor whose first axis has length ``len(tensor_list)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        RuntimeError: Propagated from ``torch.stack`` / ``torch.cat``, e.g.
            for mismatched shapes or an empty ``tensor_list``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    stacked_chunks = []
    for start in range(0, len(tensor_list), chunk_size):
        chunk = tensor_list[start:start + chunk_size]
        # .cpu() returns the tensor itself when it is already on the host, so
        # this costs nothing for CPU inputs. Because stacking happens on the
        # host there is no GPU scratch memory to release per chunk.
        stacked_chunks.append(torch.stack([t.cpu() for t in chunk], dim=0))

    return torch.cat(stacked_chunks, dim=0)

try:
    # Stack each split chunk-wise and expose it as a NumPy array, the array
    # type Keras accepts directly. detach()/cpu() make the conversion valid
    # even if a tensor still tracks gradients or sits on the GPU.
    X_train = stack_tensors_on_cpu(j_embeds_train).detach().cpu().numpy()
    X_val = stack_tensors_on_cpu(j_embeds_val).detach().cpu().numpy()
    X_test = stack_tensors_on_cpu(j_embeds_test).detach().cpu().numpy()
except RuntimeError as e:
    # Best effort: report the failure and keep the notebook running.
    # NOTE: X_train / X_val / X_test may then hold values from an earlier cell.
    # TODO: fall back to converting the tensors in smaller chunks if necessary.
    print(f"RuntimeError: {e}")

In [None]:
# Map the author names of every split to integer class ids. The encoder is
# fitted on the training labels only and then reused for validation and test.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['AUTHOR'].tolist())
y_val = label_encoder.transform(df_val['AUTHOR'].tolist())
y_test = label_encoder.transform(df_test['AUTHOR'].tolist())

# Show the first ten encoded labels of each split.
for encoded_labels in (y_train, y_val, y_test):
    print(encoded_labels[:10])

In [None]:
# Quick smoke test on a small subset: carve train / val / test slices out of
# the test split only (first 90 samples, the next 10, and the remainder).
print(type(X_train))
print(type(X_test))

X_train_test, X_val_test, X_test_test = X_test[:90], X_test[90:100], X_test[100:]
y_train_test, y_val_test, y_test_test = y_test[:90], y_test[90:100], y_test[100:]

# Size the network from the data instead of hard-coding it: the per-sample
# input shape comes from the training slice and the number of output units
# from the classes the label encoder was fitted on.
input_shape = tuple(X_train_test.shape[1:])
num_classes = len(label_encoder.classes_)

# Define the model: flatten each sample, then two small dense layers and a
# softmax over the author classes.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=input_shape),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),  # 132, 64
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# Compile the model; the sparse loss matches the integer-encoded labels.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Fit on the small training slice, validating on the validation slice.
history = model.fit(
    X_train_test,
    y_train_test,
    validation_data=(X_val_test, y_val_test),
    batch_size=16,
    epochs=10,
)

# Score the fitted model on the validation slice.
val_loss, val_accuracy = model.evaluate(X_val_test, y_val_test)
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

# Score the fitted model on the remaining test slice.
test_loss, test_accuracy = model.evaluate(X_test_test, y_test_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')


In [None]:
import matplotlib.pyplot as plt

# One entry per panel: the two history keys to draw, their legend labels,
# the panel title and the y-axis label.
panels = (
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
)

# Draw loss (left) and accuracy (right) side by side, training vs. validation.
plt.figure(figsize=(12, 5))
for column, (train_key, val_key, train_label, val_label, title, y_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, column)
    plt.plot(history.history[train_key], 'cyan', label=train_label)
    plt.plot(history.history[val_key], 'magenta', label=val_label)
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()

plt.show()