# Experiments

In [None]:
# Install when running on Google Colab (uncomment and run once, BEFORE the
# project imports below — presumably those modules need these packages at
# import time; confirm against the project code).
#!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.6.0+cu124.html
#!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.6.0+cu124.html
#!pip install torch-geometric
#!pip install gensim

import importlib
import sys

import pandas as pd  # `pd` is used by the preparation cells; import it explicitly
from google.colab import drive

drive.mount('/content/drive')

# Make the project code on Drive importable.
code_path = "/content/drive/My Drive/Master Thesis/Code"
if code_path not in sys.path:
    sys.path.append(code_path)


def _import_all(module_name):
    """Emulate ``from <module_name> import *`` for names that are not valid identifiers.

    ``from R-GAT import *`` is a SyntaxError because ``-`` is parsed as a minus
    sign, so modules with a hyphen in their file name have to be loaded via
    ``importlib.import_module`` instead.

    Parameters
    ----------
    module_name : str
        Name of the module to load (file name without ``.py``), looked up on
        ``sys.path``.

    Returns
    -------
    module
        The imported module object. Its public names (``__all__`` if defined,
        otherwise every attribute not starting with ``_``) are also copied into
        this notebook's global namespace.
    """
    module = importlib.import_module(module_name)
    public_names = getattr(module, "__all__", None)
    if public_names is None:
        public_names = [name for name in vars(module) if not name.startswith("_")]
    globals().update({name: getattr(module, name) for name in public_names})
    return module


from data_prep import *

# NOTE(review): assumes the files on Drive are literally named R-GAT.py,
# BERT-SPC.py and ATAE-LSTM.py — confirm; renaming them to valid identifiers
# (e.g. r_gat.py) would allow plain import statements.
for _module_name in ("R-GAT", "BERT-SPC", "ATAE-LSTM"):
    _import_all(_module_name)

## Data Preparation

In [34]:
# Experiment selection. Both values are interpolated into the data file paths
# built by the preparation cells, so changing them switches the input files.
model_choice = "mistral"     # used in the Data/<model> folder and CSV file names
dataset_choice = "Lapt14"    # used in the Train/Test XML and CSV file names

In [None]:
####### Preparation for Baseline #######

# Locations of the train and test XML files for the selected dataset.
train_xml_path = f"/content/drive/My Drive/Master Thesis/Data/Train_Data/{dataset_choice}_Train.xml"
test_xml_path = f"/content/drive/My Drive/Master Thesis/Data/Test_Data/{dataset_choice}_Test.xml"

# Parse both XML files into DataFrames; df_train also fixes the sample sizes
# used by the later preparation cells (they call len(df_train)).
df_train = load_xml_to_df(train_xml_path)
df_test = load_xml_to_df(test_xml_path)

In [None]:
####### Preparation for Table 4 #######

# Column names of the generated CSVs mapped onto the names used by df_train.
rename_cols = {
    "sentiment": "polarity",
    "used_extended_aspect": "aspect",
    "generated_sentence": "sentence",
}

# Sample Synthetic and Filtered
# Each set is sampled down to len(df_train) rows and then cleaned.
base_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}"

df_sampled_synthetic = clean_sentences(
    load_and_sample(f"{base_path}/Synthetic_{dataset_choice}_{model_choice}.csv", rename_cols, len(df_train))
)

df_sampled_filtered = clean_sentences(
    load_and_sample(f"{base_path}/Filtered_{dataset_choice}_{model_choice}.csv", rename_cols, len(df_train))
)

# Sample Mixed (50/50 Annotated and Filtered)
n_half = len(df_train) // 2
df_annotated_sample = df_train.sample(n=n_half, random_state=42).reset_index(drop=True)
df_synthetic_sample = df_sampled_filtered.sample(n=n_half, random_state=42).reset_index(drop=True)

# Stack both halves, shuffle with a fixed seed, then clean the combined frame.
stacked_halves = pd.concat([df_annotated_sample, df_synthetic_sample], ignore_index=True)
shuffled_halves = stacked_halves.sample(frac=1, random_state=42).reset_index(drop=True)
df_mixed = clean_sentences(shuffled_halves)

In [None]:
####### Preparation for Table 5 #######

# Load LLM-Rewriting, LLM-Annotating, and IDG datasets
llm_r_path = f"/content/drive/My Drive/Master Thesis/Data/DA/LLM_R_{dataset_choice}_{model_choice}.csv"
llm_a_path = f"/content/drive/My Drive/Master Thesis/Data/DA/LLM_A_{dataset_choice}_{model_choice}.csv"
idg_df_path = f"/content/drive/My Drive/Master Thesis/Data/DA/LLM_IDG_{dataset_choice}_{model_choice}.csv"

llm_r_df = load_and_encode(llm_r_path)
llm_a_df = load_and_encode(llm_a_path)
idg_df = load_and_encode(idg_df_path)

# Load and Sample IDG4ABSC
# `df_filtered` was previously read here without ever being defined, so this
# cell raised NameError on a fresh kernel. Load the full (unsampled) filtered
# synthetic set explicitly; `base_path` and `rename_cols` come from the
# Table 4 preparation cell.
# NOTE(review): assumes the intended source is the same Filtered_* CSV used for
# Table 4, and that no clean_sentences() pass is wanted here — confirm.
df_filtered = pd.read_csv(
    f"{base_path}/Filtered_{dataset_choice}_{model_choice}.csv"
).rename(columns=rename_cols)
assert "polarity" in df_filtered.columns, "filtered synthetic data has no 'polarity' column"

# Draw the same number of rows per polarity class: at most an equal share of
# len(df_train), capped by the size of the rarest class.
target_size = len(df_train)
sentiments = df_filtered['polarity'].unique()
per_class_target = target_size // len(sentiments)
counts = df_filtered['polarity'].value_counts()
n_each = min(per_class_target, counts.min())

df_synthetic_balanced = pd.concat([
    df_filtered[df_filtered['polarity'] == s].sample(n=n_each, random_state=42)
    for s in sentiments
], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

# Augmented set: all of df_train plus the balanced synthetic rows, shuffled.
df_augmented = pd.concat([df_train, df_synthetic_balanced], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

## Get Results

In [None]:
####### Get Baseline Results #######

# Baseline run: R-GAT is given the frames loaded from the dataset's Train and
# Test XML files (no synthetic data involved).
# NOTE(review): run_rgat presumably trains on the first frame and reports
# metrics on the second — confirm in the R-GAT module. `model` is overwritten
# by every later results cell.
model = run_rgat(df_train, df_test)

In [None]:
####### Get Results for Table 4 #######

# The three Table 4 training sets; each is passed to every model together with
# df_test. `model` (and tokenizer/label_encoder) only keep the most recent run.
table4_train_sets = (df_sampled_synthetic, df_sampled_filtered, df_mixed)

# R-GAT Results
for train_df in table4_train_sets:
    model = run_rgat(train_df, df_test)

# BERT-SPC Results
for train_df in table4_train_sets:
    model, tokenizer, label_encoder = bert_spc(train_df, df_test)

# ATAE-LSTM Results
for train_df in table4_train_sets:
    model = run_atae_lstm(train_df, df_test)

In [None]:
####### Get Results for Table 5 #######

# R-GAT on each Table 5 training set in turn (LLM-Rewriting, LLM-Annotating,
# IDG, and the augmented set), always paired with df_test. `model` keeps only
# the result of the final call.
for train_df in (llm_r_df, llm_a_df, idg_df, df_augmented):
    model = run_rgat(train_df, df_test)