### Baseline using a simple CountVectorizer for emb_org_a

In [1]:
import os, json
import pandas as pd
import numpy as np
#!pip install --upgrade tensorflow
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

2024-05-29 11:55:55.855189: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 11:55:55.855259: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 11:55:55.856798: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
df_train = pd.read_csv('/kaggle/input/df_train.csv')
df_val = pd.read_csv('/kaggle/input/df_validation.csv')
df_test1 = pd.read_csv('/kaggle/input/test_df_0.csv')
df_test2 = pd.read_csv('/kaggle/input/test_df_1.csv')
df_test3 = pd.read_csv('/kaggle/input/test_df_2.csv')

# shuffle dataset
df_train = df_train.sample(frac=1, random_state=42)
df_val = df_val.sample(frac=1, random_state=42)
df_test1 = df_test1.sample(frac=1, random_state=42)
df_test2 = df_test2.sample(frac=1, random_state=42)
df_test3 = df_test3.sample(frac=1, random_state=42)

print(len(df_train))
print(len(df_val))
print(len(df_test1))
print(len(df_test2))
print(len(df_test3))

41594
11198
2236
2236
2236


### Configurations

- emb_org_a: original data, author_label only
- emb_org_b: orignal data, author_label and dataset_label
- emb_aug_a: augmented_data, author_label only
- emb_aug_b: augmented_data, author_label and dataset_label 

## Training and Saving models for emb_org_a (only author_label, original data set without augmentation)

In [3]:

# convert labels to numerical values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['label_author'])
y_val = label_encoder.transform(df_val['label_author'])
y_test1 = label_encoder.transform(df_test1['label_author'])
y_test2 = label_encoder.transform(df_test2['label_author'])
y_test3 = label_encoder.transform(df_test3['label_author'])

# get bow representation of paragraph1
vectorizer_par1 = CountVectorizer(max_features=5000)
X_train_par1 = vectorizer_par1.fit_transform(df_train['paragraph1']).toarray()
X_val_par1 = vectorizer_par1.transform(df_val['paragraph1']).toarray()
X_test1_par1 = vectorizer_par1.transform(df_test1['paragraph1']).toarray()
X_test2_par1 = vectorizer_par1.transform(df_test2['paragraph1']).toarray()
X_test3_par1 = vectorizer_par1.transform(df_test3['paragraph1']).toarray()

# get bow representation of paragraph2
vectorizer_par2 = CountVectorizer(max_features=5000)
X_train_par2 = vectorizer_par2.fit_transform(df_train['paragraph2']).toarray()
X_val_par2 = vectorizer_par2.transform(df_val['paragraph2']).toarray()
X_test1_par2 = vectorizer_par2.transform(df_test1['paragraph2']).toarray()
X_test2_par2 = vectorizer_par2.transform(df_test2['paragraph2']).toarray()
X_test3_par2 = vectorizer_par2.transform(df_test3['paragraph2']).toarray()

In [4]:
# Define the features for training, validation, and test  sets
# 1) emb_org_a
X_train = np.concatenate((X_train_par1, X_train_par2), axis=1)
X_val = np.concatenate((X_val_par1, X_val_par2), axis=1)
X_test1 = np.concatenate((X_test1_par1, X_test1_par2), axis=1)
X_test2 = np.concatenate((X_test2_par1, X_test2_par2), axis=1)
X_test3 = np.concatenate((X_test3_par1, X_test3_par2), axis=1)

In [9]:
# emb_org_a
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])


model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_val, y_val))


# --------- F1 Score ---------------- 

y_val_pred_prob = model.predict(X_val)
y_test1_pred_prob = model.predict(X_test1)
y_test2_pred_prob = model.predict(X_test2)
y_test3_pred_prob = model.predict(X_test3)


# convert probabilities to binary predictions (0 or 1)
threshold = 0.5
y_val_pred = (y_val_pred_prob >= threshold).astype(int)
y_test1_pred = (y_test1_pred_prob >= threshold).astype(int)
y_test2_pred = (y_test2_pred_prob >= threshold).astype(int)
y_test3_pred = (y_test3_pred_prob >= threshold).astype(int)


val_f1_macro = f1_score(y_val, y_val_pred, average='macro')
test1_f1_macro = f1_score(y_test1, y_test1_pred, average='macro')
test2_f1_macro = f1_score(y_test2, y_test2_pred, average='macro')
test3_f1_macro = f1_score(y_test3, y_test3_pred, average='macro')

print(f'Validation F1-Macro: {val_f1_macro}')
print(f'Test Set 1 F1-Macro: {test1_f1_macro}')
print(f'Test Set 2 F1-Macro: {test2_f1_macro}')
print(f'Test Set 3 F1-Macro: {test3_f1_macro}')

# -------------------------

#test_predictions = model.predict(X_test) # should be list of style changes
model.save("baseline_model_emb_org_a.h5") # author label only 

Epoch 1/10
[1m  75/2600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 2ms/step - accuracy: 0.6431 - loss: 0.6372

W0000 00:00:1716984401.744508     490 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6505 - loss: 0.5638

W0000 00:00:1716984409.827431     492 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.6505 - loss: 0.5638 - val_accuracy: 0.6624 - val_loss: 0.5287
Epoch 2/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.7549 - loss: 0.4563 - val_accuracy: 0.6629 - val_loss: 0.5561
Epoch 3/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9044 - loss: 0.2291 - val_accuracy: 0.6595 - val_loss: 0.8443
Epoch 4/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9867 - loss: 0.0509 - val_accuracy: 0.6523 - val_loss: 1.4911
Epoch 5/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9971 - loss: 0.0138 - val_accuracy: 0.6440 - val_loss: 1.9499
Epoch 6/10
[1m2600/2600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9975 - loss: 0.0107 - val_accuracy: 0.6542 - val_loss: 2.0211
Epoch 7/10
[1m2600/2600[