<a href="https://colab.research.google.com/github/sara-kaczmarek/LLMB4ABSC/blob/main/LLMB4ABSC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LLMB4ABSC

In [None]:
# ========== MAKE CHOICES ==========

# Choose dataset from options: "Lapt14", "Rest14", "Rest15" or "Rest16"
dataset_choice = "Rest16"

# Choose LLM from options: "mistral", "llama" or "gemma"
model_choice = "gemma"

# Fail fast on a typo. Without these checks a misspelled dataset name silently
# falls through to the "restaurant" domain below, and an unknown model name only
# surfaces later as a KeyError when the model mapping is indexed.
VALID_DATASETS = ("Lapt14", "Rest14", "Rest15", "Rest16")
VALID_MODELS = ("mistral", "llama", "gemma")

if dataset_choice not in VALID_DATASETS:
    raise ValueError(
        f"Unknown dataset_choice {dataset_choice!r}; expected one of {VALID_DATASETS}"
    )
if model_choice not in VALID_MODELS:
    raise ValueError(
        f"Unknown model_choice {model_choice!r}; expected one of {VALID_MODELS}"
    )

# "Lapt14" is the only laptop dataset; every other option maps to the restaurant domain.
domain = "laptop" if dataset_choice == "Lapt14" else "restaurant"

In [None]:
# Run once per fresh runtime: uncomment the lines below to install the dependencies
#!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
#!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
#!pip install transformers==4.51.3
#!pip install --no-deps unsloth
#!pip install sentence-transformers

In [None]:
# Mount Google Drive so the project code and data stored under "My Drive"
# are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Put the project's code folder on the import path (presumably where the
# project modules imported below live — verify). The membership check keeps
# re-running this cell from appending duplicate entries to sys.path.
import sys
code_path = "/content/drive/My Drive/Master Thesis/Code"
if code_path not in sys.path:
    sys.path.append(code_path)

from data_prep import *
from evaluation.evaluation_measures import *
from IDG4ABSC import *
from LLM4ABSC import *

import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import re

Mounted at /content/drive


In [None]:
# Load LLM
# NOTE(review): unsloth is imported before sentence_transformers (further down);
# unsloth is said to need importing ahead of transformers-based libraries so its
# patches apply — confirm before reordering these imports.
from unsloth import FastLanguageModel
import torch

# Checkpoint identifier for each supported value of `model_choice`
# (all three names carry the "bnb-4bit" suffix).
model_mapping = {
    "mistral": "unsloth/mistral-7b-v0.3-bnb-4bit",
    "llama": "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "gemma": "unsloth/gemma-2-9b-bnb-4bit"
}

max_seq_length = 1024
dtype = None  # presumably lets unsloth choose the dtype automatically — TODO confirm
load_in_4bit = True

# Indexing the mapping raises KeyError if `model_choice` is not one of its keys.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_mapping[model_choice],
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map = "auto"
)

# Load SimCSE
# NOTE(review): `model_sbert` is not passed explicitly to any call visible in
# this notebook — confirm how the simcse few-shot scenarios obtain the encoder.
from sentence_transformers import util, SentenceTransformer
model_sbert = SentenceTransformer("princeton-nlp/sup-simcse-roberta-base")

# Load Spacy
# `nlp` is handed to filter_expanded_aspects_to_nouns in the AX cell.
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
####### Load Test Data #######
# Parse the test-set XML of the chosen dataset into `df_test`.
test_xml_path = f"/content/drive/My Drive/Master Thesis/Data/Test_Data/{dataset_choice}_Test.xml"
df_test = load_xml_to_df(test_xml_path)

## IDG4ABSC

In [None]:
# AX — aspect expansion
# NOTE(review): the previous inline comment said "Running on SUBSET", but
# `df_test` is passed here without any visible subsetting — confirm.
start = time.time()
df_expansions = generate_aspect_expansions(
    df_test,
    model,
    tokenizer,
    domain=domain,
    device="cuda",
)
end = time.time()

display(df_expansions)
print(f"\nTotal time AX: {end - start:.2f} seconds")


# Filter the expansions using the spaCy pipeline loaded earlier.
df_expansions_filtered = filter_expanded_aspects_to_nouns(df_expansions, nlp)
display(df_expansions_filtered)

# Collect every distinct term from the comma-separated expansion strings.
# NOTE(review): the count is taken over the unfiltered `df_expansions`,
# not `df_expansions_filtered` — confirm this is intended.
all_terms = {
    term.strip()
    for aspect_string in df_expansions["expanded_aspects"]
    for term in aspect_string.split(",")
}

print(f"Total unique expanded aspects: {len(all_terms)}")

#output_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Aspects_{dataset_choice}_{model_choice}.csv"
#df_expansions_filtered.to_csv(output_path, index=False)

Generating expansions: 100%|██████████| 288/288 [08:15<00:00,  1.72s/it]


Unnamed: 0,original_aspect,expanded_aspects
0,sushi,"sushi, sashimi, nigiri, maki, california roll"
1,portions,"portion, portions, portion size, portion sizes..."
2,green tea creme brulee,green tea creme brulee
3,place,"location, restaurant, venue, spot, area, place"
4,service,"service, staff, waiter, waitress, server"
...,...,...
283,shilshole sampler,"shilshole sampler, shilshole, sampler"
284,seared alaskan sea scallops,"seared scallops, sea scallops, scallops, seare..."
285,grilled alaskan king salmon,"grilled salmon, salmon, alaskan king salmon, a..."
286,creamed washington russet potatoes,creamed washington russet potatoes



⏱️ Total time taken: 495.35 seconds


In [None]:
# DG — synthetic sentence generation from the filtered aspect expansions

# Optional: reload previously saved expansions instead of re-running the AX cell.
#input_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Aspects_{dataset_choice}_{model_choice}.csv"
#df_expansions_filtered = pd.read_csv(input_path)

start = time.time()

df_synthetic = generate_synthetic_sentences_from_expansions(
    df_expansions_filtered,
    dataset_choice=dataset_choice,
    model=model,
    tokenizer=tokenizer,
    n_per_sentiment=10
)

end = time.time()
# Leading "\n" matches the other timing messages in this notebook. The previous
# "\T" was an invalid escape sequence: it printed a literal backslash and
# triggers a SyntaxWarning on recent Python versions.
print(f"\nTotal time for DG: {end - start:.2f} seconds")

display(df_synthetic)

# Save the unfiltered synthetic data; the LLM4ABSC section reloads this file.
output_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Synthetic_{dataset_choice}_{model_choice}.csv"
df_synthetic.to_csv(output_path, index=False)

Generating Synthetic Sentences: 100%|██████████| 288/288 [4:54:31<00:00, 61.36s/it]


⏱️ Total time for synthetic generation: 17671.15 seconds





Unnamed: 0,original_aspect,used_extended_aspect,sentiment,generated_sentence
0,sushi,sashimi,positive,The best sushi I have ever eaten is right here!
1,sushi,california roll,positive,The California rolls were fresh and delicious!
2,sushi,nigiri,positive,The salmon sashimi at this sushi bar is some g...
3,sushi,sushi,positive,I absolutely love their sushi!
4,sushi,california roll,positive,The California rolls were absolutely delicious!
...,...,...,...,...
8635,green beans,green bean,negative,The food here is not good at all!
8636,green beans,green bean,negative,This dish was not as fresh or well prepared as...
8637,green beans,green bean,negative,The food was average but expensive for what we...
8638,green beans,green bean,negative,I was really disappointed with their vegetaria...


File saved to /content/drive/My Drive/Master Thesis/Data/gemma/Synthetic_Rest16_gemma.csv


In [None]:
# QFP — quality filtering of the synthetic sentences

# Optional: reload the saved synthetic data instead of re-running the DG cell.
#input_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Synthetic_{dataset_choice}_{model_choice}.csv"
#df_synthetic = pd.read_csv(input_path)

start = time.time()

# Stage 1 — rule-based filter; its second return value is discarded.
df_synthetic_filtered, _ = manual_filter(df_synthetic)

# Stage 2 — LLM filter applied to the survivors of stage 1; also returns
# the per-stage drop summary shown below.
df_synthetic_filtered, drop_summary_df = llm_filter(
    df_synthetic_filtered,
    model,
    tokenizer,
    domain,
)

end = time.time()
print(f"\nTotal time for filtering: {end - start:.2f} seconds")

display(df_synthetic_filtered, drop_summary_df)

# Save the initial set of filtered triplets
output_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/HighQuality_Synthetic_{dataset_choice}_{model_choice}.csv"
df_synthetic_filtered.to_csv(path_or_buf=output_path, index=False)

Applying filters: 100%|██████████| 4228/4228 [3:12:45<00:00,  2.74s/it]


⏱️ Total time for filtering: 11565.90 seconds





Unnamed: 0,generated_sentence,original_aspect,used_extended_aspect,sentiment
0,The best sushi I have ever eaten is right here!,sushi,sashimi,positive
1,The California rolls were fresh and delicious!,sushi,california roll,positive
2,I absolutely love their sushi!,sushi,sushi,positive
3,The California rolls were absolutely delicious!,sushi,california roll,positive
4,The California Roll was absolutely delicious!,sushi,california roll,positive
...,...,...,...,...
2820,Our vegetarian pasta with fresh spinach was ve...,green beans,green beans,neutral
2821,The quality of food was fine but I didn't like...,green beans,green beans,neutral
2822,The beans were undercooked!,green beans,green bean,negative
2823,This dish was not as fresh or well prepared as...,green beans,green bean,negative


Unnamed: 0,Stage,Dropped
0,domain_answered_no,805
1,neutral_emotion_expressed,103
2,aspect_sentiment_answered_no,356
3,fluency_answered_no,139


File saved to /content/drive/My Drive/Master Thesis/Data/gemma/HighQuality_Synthetic_Rest16_gemma.csv


In [None]:
# IDG — iterative data generation

# Optional: reload the saved intermediate files instead of re-running the cells above.
#input_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/HighQuality_Synthetic_{dataset_choice}_{model_choice}.csv"
#df_synthetic_filtered = pd.read_csv(input_path)

#input_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Aspects_{dataset_choice}_{model_choice}.csv"
#df_expansions_filtered = pd.read_csv(input_path)

# All arguments for iterative_DG gathered in one place; the two filter
# functions are passed in as callables.
idg_settings = dict(
    df_synthetic_filtered=df_synthetic_filtered,
    df_expansions_filtered=df_expansions_filtered,
    model=model,
    tokenizer=tokenizer,
    dataset_choice=dataset_choice,
    manual_filter=manual_filter,
    llm_filter=llm_filter,
    n_per_sentiment=10,
    max_attempts=10,
    device="cuda",
)

start = time.time()
df_filtered = iterative_DG(**idg_settings)
end = time.time()

print(f"\nTotal time for iterative data generation: {end - start:.2f} seconds")

# Save IDG4ABSC-generated data; the LLM4ABSC section reloads this file.
output_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Filtered_{dataset_choice}_{model_choice}.csv"
df_filtered.to_csv(path_or_buf=output_path, index=False)

## LLM4ABSC

In [None]:
# Reload the unfiltered synthetic sentences written by the DG cell.
input_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Synthetic_{dataset_choice}_{model_choice}.csv"
df_synthetic = pd.read_csv(filepath_or_buffer=input_path)
display(df_synthetic)

# Reload the IDG4ABSC output written by the IDG cell.
input_path = f"/content/drive/My Drive/Master Thesis/Data/{model_choice}/Filtered_{dataset_choice}_{model_choice}.csv"
df_filtered = pd.read_csv(filepath_or_buffer=input_path)
display(df_filtered)

Unnamed: 0,original_aspect,used_extended_aspect,sentiment,generated_sentence
0,sushi,sashimi,positive,The best sushi I have ever eaten is right here!
1,sushi,california roll,positive,The California rolls were fresh and delicious!
2,sushi,nigiri,positive,The salmon sashimi at this sushi bar is some g...
3,sushi,sushi,positive,I absolutely love their sushi!
4,sushi,california roll,positive,The California rolls were absolutely delicious!
...,...,...,...,...
8635,green beans,green bean,negative,The food here is not good at all!
8636,green beans,green bean,negative,This dish was not as fresh or well prepared as...
8637,green beans,green bean,negative,The food was average but expensive for what we...
8638,green beans,green bean,negative,I was really disappointed with their vegetaria...


Unnamed: 0,generated_sentence,original_aspect,used_extended_aspect,sentiment
0,The best sushi I have ever eaten is right here!,sushi,sashimi,positive
1,The California rolls were fresh and delicious!,sushi,california roll,positive
2,I absolutely love their sushi!,sushi,sushi,positive
3,The California rolls were absolutely delicious!,sushi,california roll,positive
4,The California Roll was absolutely delicious!,sushi,california roll,positive
...,...,...,...,...
4298,This is not what i would consider an authentic...,creamed washington russet potatoes,creamed washington russet potatoes,negative
4299,I love eating vegetables so much when they hav...,green beans,green bean,positive
4300,We tried their Green Bean casserole which was ...,green beans,green bean,positive
4301,These Green Beans were just what i needed toni...,green beans,green bean,positive


In [None]:
# SC — sentiment classification: zero-shot baseline, then few-shot runs
# for every demonstration-selection scenario.

# ---- Few-shot settings: change them here, in one place ----
# The number of shots and the pool label both feed the results filename, so a
# run with different settings can no longer overwrite the file of another
# configuration under a misleading name.
k_shots = 3                     # few-shot number
fewshot_pool = df_filtered      # DataFrame the demonstrations are drawn from
fewshot_pool_name = "Filtered"  # label for `fewshot_pool`; keep in sync with the line above
scenarios = ["random", "random_equal", "simcse", "simcse_equal"]

# ---- Zero-shot baseline ----
start_time = time.time()
df_results_zero = run_inference(df_test, sc_prompt, model=model, tokenizer=tokenizer)
end_time = time.time()

metrics_zero = evaluate_predictions(df_results_zero)
metrics_zero["Time (seconds)"] = round(end_time - start_time, 4)

# The zero-shot metric names define the row order of the results table.
results_dict = {
    "Metric": list(metrics_zero.keys()),
    "Zero-Shot": list(metrics_zero.values()),
}

print_confusion_matrix(df_results_zero)

# ---- Few-shot scenarios ----
few_shot_runs = {}  # scenario name -> per-instance results, kept for later inspection

for scenario in scenarios:
    print(f"\n\n========== Running Few-Shot Scenario: {scenario} ==========\n")

    start = time.time()
    df_results_few = run_inference(
        df_test,
        sc_fewshot_prompt,
        df=fewshot_pool,
        model=model,
        tokenizer=tokenizer,
        k=k_shots,
        scenario=scenario
    )
    end = time.time()

    metrics_few = evaluate_predictions(df_results_few)
    metrics_few["Time (seconds)"] = round(end - start, 4)

    # Align with the zero-shot metric order; a metric missing from this run becomes None.
    results_dict[f"Few-Shot ({scenario})"] = [metrics_few.get(metric, None) for metric in results_dict["Metric"]]

    print_confusion_matrix(df_results_few)
    few_shot_runs[scenario] = df_results_few

df_combined = pd.DataFrame(results_dict)
display(df_combined)

# With the default settings this resolves to the same
# "3shot-FilteredResults_..." file name as before.
output_path = f"/content/drive/My Drive/Master Thesis/Data/Results/{k_shots}shot-{fewshot_pool_name}Results_{dataset_choice}_{model_choice}.csv"
df_combined.to_csv(output_path, index=False)
print(f"File saved to {output_path}")