### Baseline using a simple CountVectorizer

In [1]:
import os, json
import pandas as pd
import numpy as np
#!pip install --upgrade tensorflow
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

2024-05-23 15:49:18.474967: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-23 15:49:18.475035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-23 15:49:18.477107: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
df_train = pd.read_csv('/kaggle/input/lp-data/df_train.csv')
df_val = pd.read_csv('/kaggle/input/lp-data/df_validation.csv')
df_test = pd.read_csv('/kaggle/input/lp-data/df_test.csv')

# shuffle dataset
df_train = df_train.sample(frac=1, random_state=42)
df_val = df_val.sample(frac=1, random_state=42)
df_test = df_test.sample(frac=1, random_state=42)

In [None]:
df_train_bal = pd.read_csv('/kaggle/input/lp-data/balanced_train.csv')
df_val_bal = pd.read_csv('/kaggle/input/lp-data/balanced_val.csv')
df_test_bal = pd.read_csv('/kaggle/input/lp-data/balanced_test.csv')

# shuffle dataset
df_train_bal = df_train_bal.sample(frac=1, random_state=42)
df_val_bal = df_val_bal.sample(frac=1, random_state=42)
df_test_bal = df_test_bal.sample(frac=1, random_state=42)

In [None]:
print(len(df_train))
print(len(df_train_bal))
df_train.head()

### Configurations

- emb_org_a: original data, author_label only
- emb_org_b: orignal data, author_label and dataset_label
- emb_aug_a: augmented_data, author_label only
- emb_aug_b: augmented_data, author_label and dataset_label 

## 1) Training and Saving models for emb_org_a and emb_aug_a (only author_label)

In [None]:
# 1) labels for emb_org_a

# convert labels to numerical values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['label_author'])
y_val = label_encoder.transform(df_val['label_author'])
y_test = label_encoder.transform(df_test['label_author'])

# get bow representation of paragraph1
vectorizer_par1 = CountVectorizer(max_features=5000)
X_train_par1 = vectorizer_par1.fit_transform(df_train['paragraph1']).toarray()
X_val_par1 = vectorizer_par1.transform(df_val['paragraph1']).toarray()
X_test_par1 = vectorizer_par1.transform(df_test['paragraph1']).toarray()

# get bow representation of paragraph2
vectorizer_par2 = CountVectorizer(max_features=5000)
X_train_par2 = vectorizer_par2.fit_transform(df_train['paragraph2']).toarray()
X_val_par2 = vectorizer_par2.transform(df_val['paragraph2']).toarray()
X_test_par2 = vectorizer_par2.transform(df_test['paragraph2']).toarray()

In [None]:
# 2) labels for emb_aug_a

# convert labels to numerical values
label_encoder_bal = LabelEncoder()
y_train_bal = label_encoder_bal.fit_transform(df_train_bal['label_author'])
y_val_bal = label_encoder_bal.transform(df_val_bal['label_author'])
y_test_bal = label_encoder_bal.transform(df_test_bal['label_author'])

# get bow representation of paragraph1
vectorizer_par1_bal = CountVectorizer(max_features=5000)
X_train_par1_bal = vectorizer_par1_bal.fit_transform(df_train_bal['paragraph1']).toarray()
X_val_par1_bal = vectorizer_par1_bal.transform(df_val_bal['paragraph1']).toarray()
X_test_par1_bal = vectorizer_par1_bal.transform(df_test_bal['paragraph1']).toarray()

# get bow representation of paragraph2
vectorizer_par2_bal = CountVectorizer(max_features=5000)
X_train_par2_bal = vectorizer_par2_bal.fit_transform(df_train_bal['paragraph2']).toarray()
X_val_par2_bal = vectorizer_par2_bal.transform(df_val_bal['paragraph2']).toarray()
X_test_par2_bal = vectorizer_par2_bal.transform(df_test_bal['paragraph2']).toarray()

In [None]:
# Define the features for training, validation, and test  sets
# 1) emb_org_a
X_train = np.concatenate((X_train_par1, X_train_par2), axis=1)
X_val = np.concatenate((X_val_par1, X_val_par2), axis=1)
X_test = np.concatenate((X_test_par1, X_test_par2), axis=1)


In [None]:
#2) emb_aug_a
#X_train_bal = torch.from_numpy(np.concatenate((X_train_par1_bal, X_train_par2_bal), axis=1)).to('cuda')
#X_val_bal = torch.from_numpy(np.concatenate((X_val_par1_bal, X_val_par2_bal), axis=1)).to('cuda')
#X_test_bal = torch.from_numpy(np.concatenate((X_test_par1_bal, X_test_par2_bal), axis=1)).to('cuda')
X_train_bal = np.concatenate((X_train_par1_bal, X_train_par2_bal), axis=1)
X_val_bal = np.concatenate((X_val_par1_bal, X_val_par2_bal), axis=1)
X_test_bal = np.concatenate((X_test_par1_bal, X_test_par2_bal), axis=1)

In [None]:
# emb_org_a
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_val, y_val))

# evaluate on validation set
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

# evaluate on test set 
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# --------- F1 Score ---------------- 

# make predictions needed for F1 score
y_val_pred = (model.predict(X_val) > 0.5).astype("int32")
y_test_pred = (model.predict(X_test) > 0.5).astype("int32")

# calculate F1 macro
val_f1_macro = f1_score(y_val, y_val_pred, average='macro')
test_f1_macro = f1_score(y_test, y_test_pred, average='macro')

print(f'Validation F1-Macro: {val_f1_macro}')
print(f'Test F1-Macro: {test_f1_macro}')

# -------------------------

#test_predictions = model.predict(X_test) # should be list of style changes
model.save("baseline_model_emb_org_a.h5") # author label only 

In [None]:
# emb_aug_a, data set is so big that it allocates too much memory. TO DO: train with smaller data set of augmented data or push to cuda
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_bal.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train_bal, y_train_bal, epochs=10, batch_size=16, validation_data=(X_val_bal, y_val_bal))

# evaluate on validation set
val_loss, val_accuracy = model.evaluate(X_val_bal, y_val_bal)
print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

# evaluate on test set 
test_loss, test_accuracy = model.evaluate(X_test_bal, y_test_bal)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

# --------- F1 Score ---------------- actually not needed because dataset is balanced

# make predictions needed for F1 score
y_val_pred = (model.predict(X_val_bal) > 0.5).astype("int32")
y_test_pred = (model.predict(X_test_bal) > 0.5).astype("int32")

# calculate F1 macro
val_f1_macro = f1_score(y_val_bal, y_val_pred_bal, average='macro')
test_f1_macro = f1_score(y_test_bal, y_test_pred_bal, average='macro')

print(f'Validation F1-Macro: {val_f1_macro}')
print(f'Test F1-Macro: {test_f1_macro}')

# -------------------------


#test_predictions = model.predict(X_test) # should be list of style changes
model.save("baseline_model_emb_aug_a.h5") # author label only

## 2) Training and Saving models for emb_org_b and emb_aug_b (Multi-Label Classification for label_author and label_dataset)

In [3]:
# 1) embed_org_b

# Encode the labels
label_encoder_author = LabelEncoder()
label_encoder_dataset = LabelEncoder()

y_train_author = label_encoder_author.fit_transform(df_train['label_author'])
y_val_author = label_encoder_author.transform(df_val['label_author'])
y_test_author = label_encoder_author.transform(df_test['label_author'])

y_train_dataset = label_encoder_dataset.fit_transform(df_train['label_dataset'])
y_val_dataset = label_encoder_dataset.transform(df_val['label_dataset'])
y_test_dataset = label_encoder_dataset.transform(df_test['label_dataset'])


# stack labels to create multilabel
y_train_multilabel = np.column_stack((y_train_author, y_train_dataset))
y_val_multilabel = np.column_stack((y_val_author, y_val_dataset))
y_test_multilabel = np.column_stack((y_test_author, y_test_dataset))


# Get BOW representation of paragraph1
vectorizer_par1 = CountVectorizer(max_features=5000)
X_train_par1 = vectorizer_par1.fit_transform(df_train['paragraph1']).toarray()
X_val_par1 = vectorizer_par1.transform(df_val['paragraph1']).toarray()
X_test_par1 = vectorizer_par1.transform(df_test['paragraph1']).toarray()

# Get BOW representation of paragraph2
vectorizer_par2 = CountVectorizer(max_features=5000)
X_train_par2 = vectorizer_par2.fit_transform(df_train['paragraph2']).toarray()
X_val_par2 = vectorizer_par2.transform(df_val['paragraph2']).toarray()
X_test_par2 = vectorizer_par2.transform(df_test['paragraph2']).toarray()

# Concatenate paragraph1 and paragraph2 BOW representations
X_train = np.concatenate((X_train_par1, X_train_par2), axis=1)
X_val = np.concatenate((X_val_par1, X_val_par2), axis=1)
X_test = np.concatenate((X_test_par1, X_test_par2), axis=1)

In [4]:
print(X_train.shape)
print(y_train_multilabel.shape)

print(X_val.shape)
print(y_val_multilabel.shape)
print(y_train_multilabel[0:5])

(41594, 10000)
(41594, 2)
(11198, 10000)
(11198, 2)
[[0 1]
 [1 1]
 [0 2]
 [0 2]
 [1 0]]


In [6]:
input_shape = X_train.shape[1]

input_layer = tf.keras.layers.Input(shape=(input_shape,))
dense1 = tf.keras.layers.Dense(128, activation='relu')(input_layer)
dense2 = tf.keras.layers.Dense(64, activation='relu')(dense1)

# Binary label output layer
author_output = tf.keras.layers.Dense(1, activation='sigmoid', name='author_output')(dense2)

# Multiclass label output layer
dataset_output = tf.keras.layers.Dense(3, activation='softmax', name='dataset_output')(dense2)


# output = tf.keras.layers.Dense(2, activation='softmax', name='output')(dense2) # for 1 stacked multilabel
#model = tf.keras.models.Model(inputs=input_layer, outputs=output) # for 1 stacked multilabel

model = tf.keras.models.Model(inputs=input_layer, outputs=[author_output, dataset_output])

model.compile(optimizer='adam',
              loss={'author_output': 'binary_crossentropy', 'dataset_output': 'categorical_crossentropy'},
              metrics=['accuracy', 'accuracy'])

'''#Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])''' # for 1 stacked multilabel

# train model
history = model.fit(X_train, {'author_output': y_train_author, 'dataset_output': tf.keras.utils.to_categorical(y_train_dataset)},
                    epochs=10, batch_size=16,
                    validation_data=(X_val, {'author_output': y_val_author, 'dataset_output': tf.keras.utils.to_categorical(y_val_dataset)}))

#history = model.fit(X_train, y_train_multilabel, epochs=10, batch_size=16,
                  #  validation_data=(X_val, y_val_multilabel)) # for 1 stacked multilabel


# Save the model
model.save("baseline_model_emb_org_b.h5")

Epoch 1/10
[1m  68/2600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 2ms/step - author_output_accuracy: 0.6586 - dataset_output_accuracy: 0.4884 - loss: 1.6529

I0000 00:00:1716479405.195668    2075 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1716479405.210981    2075 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - author_output_accuracy: 0.6549 - dataset_output_accuracy: 0.5683 - loss: 1.4333

W0000 00:00:1716479414.726413    2075 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - author_output_accuracy: 0.6549 - dataset_output_accuracy: 0.5683 - loss: 1.4333 - val_author_output_accuracy: 0.6667 - val_dataset_output_accuracy: 0.6021 - val_loss: 1.3336
Epoch 2/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - author_output_accuracy: 0.7458 - dataset_output_accuracy: 0.7233 - loss: 1.0842 - val_author_output_accuracy: 0.6638 - val_dataset_output_accuracy: 0.5923 - val_loss: 1.4709
Epoch 3/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - author_output_accuracy: 0.8776 - dataset_output_accuracy: 0.8720 - loss: 0.6058 - val_author_output_accuracy: 0.6450 - val_dataset_output_accuracy: 0.5733 - val_loss: 1.9760
Epoch 4/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - author_output_accuracy: 0.9642 - dataset_output_accuracy: 0.9549 - loss: 0.2417 - val_author_output_accuracy: 0.6453 - val_datas

In [14]:
# TO DO: Evaluate on validation set

''''val_loss, val_author_loss, val_dataset_loss, val_author_accuracy, val_dataset_accuracy = model.evaluate(X_val, [y_val_author, y_val_dataset], verbose=2)
print(f'Validation Loss: {val_loss}, Validation Author Loss: {val_author_loss}, Validation Dataset Loss: {val_dataset_loss}')
print(f'Validation Author Accuracy: {val_author_accuracy}, Validation Dataset Accuracy: {val_dataset_accuracy}')

# Evaluate on test set
test_loss, test_author_loss, test_dataset_loss, test_author_accuracy, test_dataset_accuracy = model.evaluate(X_test, [y_test_author, y_test_dataset], verbose=2)
print(f'Test Loss: {test_loss}, Test Author Loss: {test_author_loss}, Test Dataset Loss: {test_dataset_loss}')
print(f'Test Author Accuracy: {test_author_accuracy}, Test Dataset Accuracy: {test_dataset_accuracy}')

# Make predictions needed for F1 score
y_val_pred_author = (model.predict(X_val)[0] > 0.5).astype("int32")
y_val_pred_dataset = (model.predict(X_val)[1] > 0.5).astype("int32")

y_test_pred_author = (model.predict(X_test)[0] > 0.5).astype("int32")
y_test_pred_dataset = (model.predict(X_test)[1] > 0.5).astype("int32")

# Calculate F1 macro
val_f1_macro_author = f1_score(y_val_author, y_val_pred_author, average='macro')
val_f1_macro_dataset = f1_score(y_val_dataset, y_val_pred_dataset, average='macro')

test_f1_macro_author = f1_score(y_test_author, y_test_pred_author, average='macro')
test_f1_macro_dataset = f1_score(y_test_dataset, y_test_pred_dataset, average='macro')

print(f'Validation Author F1-Macro: {val_f1_macro_author}')
print(f'Validation Dataset F1-Macro: {val_f1_macro_dataset}')
print(f'Test Author F1-Macro: {test_f1_macro_author}')
print(f'Test Dataset F1-Macro: {test_f1_macro_dataset}')'''

'\'val_loss, val_author_loss, val_dataset_loss, val_author_accuracy, val_dataset_accuracy = model.evaluate(X_val, [y_val_author, y_val_dataset], verbose=2)\nprint(f\'Validation Loss: {val_loss}, Validation Author Loss: {val_author_loss}, Validation Dataset Loss: {val_dataset_loss}\')\nprint(f\'Validation Author Accuracy: {val_author_accuracy}, Validation Dataset Accuracy: {val_dataset_accuracy}\')\n\n# Evaluate on test set\ntest_loss, test_author_loss, test_dataset_loss, test_author_accuracy, test_dataset_accuracy = model.evaluate(X_test, [y_test_author, y_test_dataset], verbose=2)\nprint(f\'Test Loss: {test_loss}, Test Author Loss: {test_author_loss}, Test Dataset Loss: {test_dataset_loss}\')\nprint(f\'Test Author Accuracy: {test_author_accuracy}, Test Dataset Accuracy: {test_dataset_accuracy}\')\n\n# Make predictions needed for F1 score\ny_val_pred_author = (model.predict(X_val)[0] > 0.5).astype("int32")\ny_val_pred_dataset = (model.predict(X_val)[1] > 0.5).astype("int32")\n\ny_test_