<a href="https://colab.research.google.com/github/sara-kaczmarek/LLMB4ABSC/blob/main/TRY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ========== MAKE CHOICES ==========
# Central configuration for the notebook: pick the benchmark dataset and the LLM.
# Both choices are validated immediately so that a typo fails here with a clear
# message instead of silently selecting the "restaurant" domain or breaking in a
# later cell.

VALID_DATASETS = ("Lapt14", "Rest14", "Rest15", "Rest16")
VALID_MODELS = ("mistral", "llama", "gemma")

# Choose dataset from options: "Lapt14", "Rest14", "Rest15" or "Rest16"
dataset_choice = "Rest16"
if dataset_choice not in VALID_DATASETS:
    raise ValueError(
        f"Unknown dataset_choice {dataset_choice!r}; expected one of {VALID_DATASETS}"
    )

# "Lapt14" maps to the laptop domain; every other dataset maps to the restaurant domain.
domain = "laptop" if dataset_choice == "Lapt14" else "restaurant"

# Choose LLM from options: "mistral", "llama", "gemma"
model_choice = "gemma"
if model_choice not in VALID_MODELS:
    raise ValueError(
        f"Unknown model_choice {model_choice!r}; expected one of {VALID_MODELS}"
    )

In [None]:
# Dependency installation: uncomment and run only the first time (e.g. on a fresh runtime)
#!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
#!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
#!pip install transformers==4.51.3
#!pip install --no-deps unsloth
#!pip install sentence-transformers

In [None]:
# Mount Google Drive so that the project folders under "/content/drive/My Drive"
# (code and data) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Make the project-local modules importable by adding their Drive folder to
# sys.path. The membership check avoids duplicate entries when the cell is re-run.
import sys
code_path = "/content/drive/My Drive/Master Thesis/Code"
if code_path not in sys.path:
    sys.path.append(code_path)

# Project-local helper modules.
# NOTE(review): the star imports hide where each name comes from. The helpers used
# later in this notebook (load_xml_to_df, compute_simcse_embeddings, load_and_sample,
# run_inference, sc_fewshot_prompt, evaluate_predictions, print_confusion_matrix)
# are presumably defined in these two modules — confirm, and consider explicit imports.
from data_prep import *
from evaluation_measures import *

# Third-party and standard-library imports. They come after the star imports, so
# these bindings win over any same-named objects the star imports may have exported.
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import re

In [None]:
# Load the LLM selected by `model_choice` through Unsloth.
# NOTE(review): unsloth is imported before sentence_transformers (further down).
# Keep this order unless confirmed safe — presumably unsloth needs to be imported
# ahead of other transformers-based libraries; verify against the Unsloth docs.
from unsloth import FastLanguageModel
import torch

# Maps the short names accepted by `model_choice` to the checkpoint identifiers
# handed to the loader (all are "bnb-4bit" variants, per their names).
model_mapping = {
    "mistral": "unsloth/mistral-7b-v0.3-bnb-4bit",
    "llama": "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "gemma": "unsloth/gemma-2-9b-bnb-4bit"
}

# Loader settings, passed unchanged to FastLanguageModel.from_pretrained below.
# NOTE(review): dtype=None presumably lets the loader pick the dtype itself — TODO confirm.
max_seq_length = 1024
dtype = None
load_in_4bit = True

# A `model_choice` that is not a key of `model_mapping` raises KeyError on this lookup.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_mapping[model_choice],
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map = "auto"
)

# Load the SimCSE sentence encoder (checkpoint "princeton-nlp/sup-simcse-roberta-base").
# NOTE(review): `model_sbert` and `util` are not referenced in the cells visible here;
# presumably the imported helpers or later cells rely on them — confirm.
from sentence_transformers import util, SentenceTransformer
model_sbert = SentenceTransformer("princeton-nlp/sup-simcse-roberta-base")

In [None]:
####### Preparation for Table 3 #######
# Loads the train/test XML files for the chosen dataset and the synthetic and
# filtered CSV files generated with the chosen LLM, then passes each demonstration
# pool through compute_simcse_embeddings.
# NOTE(review): load_xml_to_df, load_and_sample and compute_simcse_embeddings are
# project helpers not visible here; the comments below describe only how they are called.
df_train = load_xml_to_df(f"/content/drive/My Drive/Master Thesis/Data/Train_Data/{dataset_choice}_Train.xml")
df_test = load_xml_to_df(f"/content/drive/My Drive/Master Thesis/Data/Test_Data/{dataset_choice}_Test.xml")

df_train = compute_simcse_embeddings(df_train)

# Column mapping handed to load_and_sample (CSV column name -> name used in this notebook).
rename_cols = {"sentiment": "polarity", "original_aspect": "aspect", "generated_sentence": "sentence"}

base_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}"

# The third argument is len(df_train), presumably the sample size, so that the pool
# matches the size of the real training set — confirm in load_and_sample.
df_synthetic = load_and_sample(f"{base_path}/Synthetic_{dataset_choice}_{model_choice}.csv", rename_cols, len(df_train))

# Drop rows without a generated sentence. `rename_cols` maps "generated_sentence" to
# "sentence", so if load_and_sample applied the mapping the original column name no
# longer exists and dropna(subset=["generated_sentence"]) would raise KeyError.
# Use whichever of the two names is present; when "generated_sentence" is still
# there, the behaviour is exactly what it was before.
sentence_col = "generated_sentence" if "generated_sentence" in df_synthetic.columns else "sentence"
df_synthetic = df_synthetic.dropna(subset=[sentence_col]).reset_index(drop=True)
df_synthetic = compute_simcse_embeddings(df_synthetic)

df_filtered = load_and_sample(f"{base_path}/Filtered_{dataset_choice}_{model_choice}.csv", rename_cols, len(df_train))
df_filtered = compute_simcse_embeddings(df_filtered)

In [None]:
####### Get Results for Table 3 #######
# For every (k, strategy) combination: run few-shot inference on the test set, time
# it, evaluate the predictions, and collect the metrics as one column of a summary
# table. The table is displayed and written to a CSV file on Drive.
import os  # local import: only needed to create the output folder before saving

# Column-oriented results: "Metric" holds the row labels, and every run adds one
# column keyed by its label.
results_dict = {
    "Metric": []
}

shots = [3, 6]
strategies = ["random_equal", "simcse_equal"]  # Rb and SSb
few_shot_runs = {}  # per-run prediction frames, keyed "<k>_<scenario>", kept for later inspection

for k in shots:
    for scenario in strategies:
        print(f"\n\n========== Running {k}-Shot Scenario: {scenario} ==========\n")

        # perf_counter is a monotonic, high-resolution clock, so the measured duration
        # is not affected by system clock adjustments during a long inference run.
        start = time.perf_counter()
        df_results_few = run_inference(
            df_test,
            sc_fewshot_prompt,
            # Demonstration pool: choose df_train, df_synthetic or df_filtered here.
            # NOTE(review): the output filename below does not record which pool was
            # used, so runs with different pools overwrite the same CSV.
            df=df_train,
            model=model,
            tokenizer=tokenizer,
            k=k,
            scenario=scenario
        )
        end = time.perf_counter()

        metrics_few = evaluate_predictions(df_results_few)
        metrics_few["Time (seconds)"] = round(end - start, 4)

        # The first run fixes the row order of the table; later runs are aligned to it.
        if not results_dict["Metric"]:
            results_dict["Metric"] = list(metrics_few.keys())

        label = f"{k}-Shot ({scenario})"
        # .get(..., None) keeps the column length consistent even if a later run
        # lacks one of the metrics reported by the first run.
        results_dict[label] = [metrics_few.get(metric, None) for metric in results_dict["Metric"]]

        print_confusion_matrix(df_results_few)
        few_shot_runs[f"{k}_{scenario}"] = df_results_few

df_combined = pd.DataFrame(results_dict)
display(df_combined)

output_path = f"/content/drive/My Drive/Master Thesis/Data/Results/FewShotAnnotatedResults_{dataset_choice}_{model_choice}.csv"
# Create the results folder if it is missing, so the finished (expensive) runs are
# not lost to a failing to_csv call.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_combined.to_csv(output_path, index=False)
print(f"File saved to {output_path}")