In [None]:
# ========== MAKE CHOICES ==========

# Choose dataset from options: "Lapt14", "Rest14", "Rest15" or "Rest16"
dataset_choice = "Rest14"

# Fail fast on a typo: any value other than "Lapt14" would otherwise be
# silently treated as a restaurant dataset by the `domain` expression below.
valid_datasets = ("Lapt14", "Rest14", "Rest15", "Rest16")
if dataset_choice not in valid_datasets:
    raise ValueError(
        f"Unknown dataset_choice {dataset_choice!r}; expected one of {valid_datasets}"
    )

domain = "laptop" if dataset_choice == "Lapt14" else "restaurant"

# Choose LLM from options: "mistral", "llama", "gemma"
model_choice = "gemma"

# Same idea for the model: report a clear error here instead of a bare
# KeyError when the choice is later used as a lookup key.
valid_models = ("mistral", "llama", "gemma")
if model_choice not in valid_models:
    raise ValueError(
        f"Unknown model_choice {model_choice!r}; expected one of {valid_models}"
    )

In [1]:
# Only run the first time: uncomment the install commands below to set up the environment
#pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
#!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
#!pip install transformers==4.51.3
#!pip install --no-deps unsloth
#!pip install sentence-transformers

from unsloth import FastLanguageModel
import torch

# Checkpoint identifier for each short model name accepted by `model_choice`.
model_mapping = dict(
    mistral="unsloth/mistral-7b-v0.3-bnb-4bit",
    llama="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    gemma="unsloth/gemma-2-9b-bnb-4bit",
)

# Loader settings, kept as module-level names so later cells can reuse them.
# NOTE(review): dtype=None presumably lets the loader choose the dtype - confirm.
max_seq_length, dtype, load_in_4bit = 1024, None, True

# Load the selected checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_mapping[model_choice],
    device_map="auto",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

In [None]:
import sys
from google.colab import drive

# Mount Google Drive so that the project's code and data folders are reachable.
drive.mount('/content/drive')

# Put the project code folder on the import path exactly once, so that
# re-running this cell does not add duplicate entries.
code_path = "/content/drive/My Drive/Master Thesis/Code"
if sys.path.count(code_path) == 0:
    sys.path.append(code_path)

from data_prep import load_xml_to_df

import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import re

Mounted at /content/drive


In [None]:
def _first_sentence(text):
    """Return the first sentence found in ``text``, or ``""`` if there is none."""
    # The model may keep generating on following lines (e.g. starting another
    # "Sentence:" example), so only the first non-empty line is considered.
    line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
    if not line:
        return ""

    # A sentence ends at ., ! or ? followed by whitespace or end of line.
    # Requiring that lookahead keeps decimals such as "5.99" intact.
    end = re.search(r"[.!?](?=\s|$)", line)
    if end:
        return line[:end.start()].rstrip() + end.group()

    # No terminator at all (e.g. generation cut off by max_new_tokens).
    return line + "."


def rewrite_prompt(sentence, aspect, sentiment, model, tokenizer, max_tokens=60, device="cuda"):
    """Ask the LLM for a paraphrase of ``sentence`` that keeps aspect and sentiment.

    Args:
        sentence: Original sentence to rewrite.
        aspect: Aspect term inserted into the prompt.
        sentiment: Sentiment label inserted into the prompt.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        max_tokens: Maximum number of newly generated tokens.
        device: Device the tokenized prompt is moved to.

    Returns:
        The first sentence of the generated continuation, or ``""`` when the
        model produced no usable text (so callers can skip it with a plain
        truthiness check).
    """
    prompt = (
        "You are an expert at rewriting sentences while keeping the same meaning, aspect, and sentiment.\n"
        "Given a sentence, aspect, and sentiment label, rewrite it in a natural and fluent way.\n"
        "Do not change the sentiment or aspect.\n\n"
        f"Sentence: {sentence}\n"
        f"Aspect: {aspect}\n"
        f"Sentiment: {sentiment}\n"
        "Rewritten sentence:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.9,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3
    )

    # Decode only the newly generated tokens. Removing the prompt with
    # str.replace is fragile: decoding does not always reproduce the prompt
    # text exactly, in which case the whole prompt would leak into the result.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    return _first_sentence(completion)

df_train = load_xml_to_df(f"/content/drive/My Drive/Master Thesis/Data/Train_Data/{dataset_choice}_Train.xml")

rewritten_records = []

# Iterate over just the three needed columns; this avoids building a Series
# object per row, which is what iterrows() does.
rewrite_rows = zip(df_train["sentence"], df_train["aspect"], df_train["polarity"])

for orig_sentence, aspect, sentiment in tqdm(rewrite_rows, total=len(df_train), desc="Rewriting sentences"):
    try:
        rewritten_sentence = rewrite_prompt(orig_sentence, aspect, sentiment, model, tokenizer, device="cuda")
    except Exception as e:
        # Best effort: a failure on one row must not abort the whole run.
        print(f"Error rewriting row: {e}")
        continue

    # Skip degenerate generations: empty output, or output made up only of
    # dots/whitespace (e.g. a lone "."), which carries no sentence at all.
    has_content = bool(rewritten_sentence) and bool(rewritten_sentence.strip(" ."))
    if has_content and rewritten_sentence != orig_sentence:
        rewritten_records.append({
            "sentence": rewritten_sentence,
            "aspect": aspect,
            "polarity": sentiment
        })

# Explicit columns keep the frame's schema stable even when nothing was kept.
df_rewritten = pd.DataFrame(rewritten_records, columns=["sentence", "aspect", "polarity"])
df_augmented = pd.concat([df_train, df_rewritten], ignore_index=True)

In [None]:
# Save the augmented frame as the "LLM_R" CSV for the chosen dataset and model.
# NOTE: `df_augmented` is also assigned by the annotation cell further down,
# so run this cell right after the rewriting cell to save the intended data.
output_path = f"/content/drive/My Drive/Master Thesis/Data/DA/LLM_R_{dataset_choice}_{model_choice}.csv"
df_augmented.to_csv(path_or_buf=output_path, index=False)
print(f"File saved to {output_path}")

# Render the frame that was just written, for a quick visual check.
display(df_augmented)

In [None]:
def annotate_prompt(sentence, aspect, model, tokenizer, max_tokens=20, device="cuda"):
    """Ask the LLM for the sentiment expressed towards ``aspect`` in ``sentence``.

    Args:
        sentence: Sentence to classify.
        aspect: Aspect term inserted into the prompt.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        max_tokens: Maximum number of newly generated tokens.
        device: Device the tokenized prompt is moved to.

    Returns:
        The first alphabetic word of the generated continuation, lower-cased
        and free of surrounding punctuation (e.g. ``"Positive."`` becomes
        ``"positive"``). The word is not checked against the three labels
        named in the prompt; callers should validate it.

    Raises:
        ValueError: If the continuation contains no alphabetic word.
    """
    prompt = (
        "You are an expert in aspect-based sentiment analysis.\n"
        "Given a sentence and an aspect, determine whether the sentiment expressed towards that aspect is positive, neutral, or negative.\n"
        f"Sentence: {sentence}\n"
        f"Aspect: {aspect}\n"
        "Sentiment:"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Greedy decoding. `temperature` is intentionally not passed: it is only
    # used when sampling, and supplying it with do_sample=False merely
    # triggers a configuration warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. Removing the prompt with
    # str.replace is fragile: decoding does not always reproduce the prompt
    # text exactly, in which case the first "word" would come from the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    first_word = re.search(r"[A-Za-z]+", completion)
    if first_word is None:
        # Previously surfaced as an opaque IndexError from `.split()[0]`.
        raise ValueError(f"Model returned no sentiment label (completion: {completion!r})")

    return first_word.group().lower()

df_train = load_xml_to_df(f"/content/drive/My Drive/Master Thesis/Data/Train_Data/{dataset_choice}_Train.xml")

annotated_records = []

# The only labels the annotation prompt asks the model to choose from.
valid_polarities = {"positive", "neutral", "negative"}
skipped_labels = 0

# Iterate over just the two needed columns; this avoids building a Series
# object per row, which is what iterrows() does.
annotate_rows = zip(df_train["sentence"], df_train["aspect"])

for orig_sentence, aspect in tqdm(annotate_rows, total=len(df_train), desc="Annotating sentences"):
    try:
        predicted_sentiment = annotate_prompt(orig_sentence, aspect, model, tokenizer, device="cuda")
    except Exception as e:
        # Best effort: a failure on one row must not abort the whole run.
        print(f"Error annotating row: {e}")
        continue

    # Normalise the raw answer (e.g. "Positive." -> "positive") and drop
    # anything that is not one of the requested labels, so that free-form
    # model output never ends up in the polarity column.
    label = predicted_sentiment.strip(" \t\n.,;:!?\"'*").lower()
    if label not in valid_polarities:
        skipped_labels += 1
        continue

    annotated_records.append({
        "sentence": orig_sentence,
        "aspect": aspect,
        "polarity": label
    })

if skipped_labels:
    print(f"Skipped {skipped_labels} rows with an unrecognised sentiment label")

# Explicit columns keep the frame's schema stable even when nothing was kept.
df_annotated = pd.DataFrame(annotated_records, columns=["sentence", "aspect", "polarity"])
df_augmented = pd.concat([df_train, df_annotated], ignore_index=True)

In [None]:
# Save the augmented frame as the "LLM_A" CSV for the chosen dataset and model.
# NOTE: `df_augmented` is also assigned by the rewriting cell further up,
# so run this cell right after the annotation cell to save the intended data.
output_path = f"/content/drive/My Drive/Master Thesis/Data/DA/LLM_A_{dataset_choice}_{model_choice}.csv"
df_augmented.to_csv(path_or_buf=output_path, index=False)
print(f"File saved to {output_path}")

# Render the frame that was just written, for a quick visual check.
display(df_augmented)