In [None]:
# Mount Google Drive so the project folders under /content/drive are readable and writable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Project root on the mounted Drive; data and model artifacts live in sub-folders of it.
BASE_DIR = "/content/drive/MyDrive/absa_project"
DATA_DIR, MODEL_DIR = (os.path.join(BASE_DIR, _sub) for _sub in ("data", "models"))

# Create both folders up front (no-op when they already exist).
for _folder in (DATA_DIR, MODEL_DIR):
    os.makedirs(_folder, exist_ok=True)

# Echo the resolved locations for a quick visual check.
for _tag, _location in (
    ("BASE_DIR:", BASE_DIR),
    ("DATA_DIR:", DATA_DIR),
    ("MODEL_DIR:", MODEL_DIR),
):
    print(_tag, _location)


BASE_DIR: /content/drive/MyDrive/absa_project
DATA_DIR: /content/drive/MyDrive/absa_project/data
MODEL_DIR: /content/drive/MyDrive/absa_project/models


In [None]:
# Install the Kaggle CLI, then upload the API token through Colab's file picker.
!pip install -q kaggle
from google.colab import files
files.upload()   # choose kaggle.json

# Copy the uploaded token to ~/.kaggle/ and restrict it to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


In [None]:
# Download the dataset archive into DATA_DIR ($DATA_DIR is filled in from the Python variable).
!kaggle datasets download -d ali1510/absa-dataset -p "$DATA_DIR"


Dataset URL: https://www.kaggle.com/datasets/ali1510/absa-dataset
License(s): unknown
absa-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Print the data directory path as a sanity check.
print(DATA_DIR)


/content/drive/MyDrive/absa_project/data


In [None]:
# List the contents of DATA_DIR via the shell.
!ls $DATA_DIR

absa-dataset.zip  Processed_Annotated_Data_Final.csv


In [None]:
# Install the Python dependencies and the small English spaCy pipeline.
# NOTE: the spaCy installer output asks for a runtime restart before the new package is loadable.
!pip install -q numpy pandas scikit-learn spacy transformers datasets torch
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the small English spaCy pipeline into `nlp`.
import spacy
nlp = spacy.load("en_core_web_sm")


In [None]:
import os, glob

# Show where the data lives and everything currently in that folder.
print("DATA_DIR =", DATA_DIR)
print("\nFiles in DATA_DIR:")
_dir_entries = os.listdir(DATA_DIR)
print(_dir_entries)

# Narrow the listing to CSV files; `csv_files` keeps the matching paths.
print("\nCSV files in DATA_DIR:")
_csv_pattern = os.path.join(DATA_DIR, "*.csv")
csv_files = glob.glob(_csv_pattern)
print(csv_files)


DATA_DIR = /content/drive/MyDrive/absa_project/data

Files in DATA_DIR:
['absa-dataset.zip', 'Processed_Annotated_Data_Final.csv']

CSV files in DATA_DIR:
['/content/drive/MyDrive/absa_project/data/Processed_Annotated_Data_Final.csv']


In [None]:
!unzip "/content/drive/MyDrive/absa_project/data/absa-dataset.zip" -d "/content/drive/MyDrive/absa_project/data"


Archive:  /content/drive/MyDrive/absa_project/data/absa-dataset.zip
replace /content/drive/MyDrive/absa_project/data/Processed_Annotated_Data_Final.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/drive/MyDrive/absa_project/data/Processed_Annotated_Data_Final.csv  


In [None]:
import os, glob

# Re-check the folder: first every entry, then only the CSV files.
_entries_now = os.listdir(DATA_DIR)
print("Files in DATA_DIR:", _entries_now)
_csv_now = glob.glob(DATA_DIR + "/*.csv")
print("\nCSV files:", _csv_now)


Files in DATA_DIR: ['absa-dataset.zip', 'Processed_Annotated_Data_Final.csv']

CSV files: ['/content/drive/MyDrive/absa_project/data/Processed_Annotated_Data_Final.csv']


In [None]:
# Show the entries of DATA_DIR as the cell output.
os.listdir(DATA_DIR)


['absa-dataset.zip', 'Processed_Annotated_Data_Final.csv']

In [None]:
import pandas as pd
import os

# Path of the extracted annotations file inside DATA_DIR.
CSV_PATH = os.path.join(DATA_DIR, "Processed_Annotated_Data_Final.csv")
print("Using CSV_PATH =", CSV_PATH)

# Load the raw annotations; the trailing tuple displays a preview, the column index and the shape.
df_raw = pd.read_csv(CSV_PATH)
df_raw.head(), df_raw.columns, df_raw.shape


Using CSV_PATH = /content/drive/MyDrive/absa_project/data/Processed_Annotated_Data_Final.csv


(        ID                                           SENTENCE  \
 0  GPT-001  the food was delicious and beautifully present...   
 1  GPT-002  the ambience was cozy and inviting and the sta...   
 2  GPT-003  i highly recommend trying the signature cockta...   
 3  GPT-004  the staff was attentive and constantly refilli...   
 4  GPT-005  the restaurant offered a wide selection of jui...   
 
           CATEGORY                             ASPECTS  \
 0    FOOD, SERVICE                ['TASTE', 'GENERAL']   
 1         AMBIENCE         ['ATMOSPHERE', 'BEHAVIOUR']   
 2      DRINK, FOOD  ['PRESENTATION', 'RECOMMENDATION']   
 3  AMBIENCE, STAFF         ['ATMOSPHERE', 'BEHAVIOUR']   
 4             FOOD   ['MENU', 'PRESENTATION', 'TASTE']   
 
              ASPECT CATEGORY                             SENTIMENT  \
 0        ['FOOD', 'SERVICE']              ['POSITIVE', 'POSITIVE']   
 1  ['RESTAURANT', 'SERVICE']              ['POSITIVE', 'POSITIVE']   
 2           ['FOOD', 'FOOD']    

In [None]:
import numpy as np

# Work on a copy so df_raw itself is not modified by this cell.
df = df_raw.copy()

cols = list(df.columns)
print("All columns:", cols)

# Guess each role from the column name (case-insensitive):
#   text     -> exact match against a few common names
#   aspect   -> name contains "aspect"
#   polarity -> name contains one of the sentiment-like keywords
text_candidates = [
    name for name in cols
    if name.lower() in ("sentence", "review", "text", "comment")
]
aspect_candidates = [name for name in cols if name.lower().find("aspect") != -1]
polarity_candidates = [
    name for name in cols
    if any(keyword in name.lower() for keyword in ("polarity", "sentiment", "label"))
]

print("Text candidates    :", text_candidates)
print("Aspect candidates  :", aspect_candidates)
print("Polarity candidates:", polarity_candidates)

# Take the first hit per role. Deliberately unguarded: an empty candidate list raises
# IndexError here, which signals that the CSV schema is not what was expected.
text_col, aspect_col, polarity_col = (
    found[0] for found in (text_candidates, aspect_candidates, polarity_candidates)
)

print("\nChosen mapping:")
print(" sentence ->", text_col)
print(" aspect   ->", aspect_col)
print(" polarity ->", polarity_col)

# Standardize the names to sentence / aspect / polarity, keep only those columns,
# and drop rows with a missing value in any of them.
df = (
    df.rename(columns={text_col: "sentence", aspect_col: "aspect", polarity_col: "polarity"})
    .loc[:, ["sentence", "aspect", "polarity"]]
    .dropna()
)
df.head(), df["polarity"].value_counts()


All columns: ['ID', 'SENTENCE', 'CATEGORY', 'ASPECTS', 'ASPECT CATEGORY', 'SENTIMENT', 'ASPECTS_WITH_ASPECT_CATEGORY', 'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT']
Text candidates    : ['SENTENCE']
Aspect candidates  : ['ASPECTS', 'ASPECT CATEGORY', 'ASPECTS_WITH_ASPECT_CATEGORY', 'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT']
Polarity candidates: ['SENTIMENT', 'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT']

Chosen mapping:
 sentence -> SENTENCE
 aspect   -> ASPECTS
 polarity -> SENTIMENT


(                                            sentence  \
 0  the food was delicious and beautifully present...   
 1  the ambience was cozy and inviting and the sta...   
 2  i highly recommend trying the signature cockta...   
 3  the staff was attentive and constantly refilli...   
 4  the restaurant offered a wide selection of jui...   
 
                                aspect                              polarity  
 0                ['TASTE', 'GENERAL']              ['POSITIVE', 'POSITIVE']  
 1         ['ATMOSPHERE', 'BEHAVIOUR']              ['POSITIVE', 'POSITIVE']  
 2  ['PRESENTATION', 'RECOMMENDATION']              ['POSITIVE', 'POSITIVE']  
 3         ['ATMOSPHERE', 'BEHAVIOUR']              ['POSITIVE', 'POSITIVE']  
 4   ['MENU', 'PRESENTATION', 'TASTE']  ['POSITIVE', 'POSITIVE', 'POSITIVE']  ,
 polarity
 ['POSITIVE']                                                                5992
 ['POSITIVE', 'POSITIVE']                                                    1534
 ['NEGA

In [None]:
# Plain-text preview of the first rows of the standardized frame.
print(df.head())

                                            sentence  \
0  the food was delicious and beautifully present...   
1  the ambience was cozy and inviting and the sta...   
2  i highly recommend trying the signature cockta...   
3  the staff was attentive and constantly refilli...   
4  the restaurant offered a wide selection of jui...   

                               aspect                              polarity  
0                ['TASTE', 'GENERAL']              ['POSITIVE', 'POSITIVE']  
1         ['ATMOSPHERE', 'BEHAVIOUR']              ['POSITIVE', 'POSITIVE']  
2  ['PRESENTATION', 'RECOMMENDATION']              ['POSITIVE', 'POSITIVE']  
3         ['ATMOSPHERE', 'BEHAVIOUR']              ['POSITIVE', 'POSITIVE']  
4   ['MENU', 'PRESENTATION', 'TASTE']  ['POSITIVE', 'POSITIVE', 'POSITIVE']  


In [None]:
# Quick orientation on the raw frame: dimensions, column names, first rows.
print("Shape:", df_raw.shape)
for _title, _value in (
    ("\nColumns:", df_raw.columns.tolist()),
    ("\nFirst 5 rows:", df_raw.head()),
):
    print(_title)
    print(_value)


Shape: (9699, 8)

Columns:
['ID', 'SENTENCE', 'CATEGORY', 'ASPECTS', 'ASPECT CATEGORY', 'SENTIMENT', 'ASPECTS_WITH_ASPECT_CATEGORY', 'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT']

First 5 rows:
        ID                                           SENTENCE  \
0  GPT-001  the food was delicious and beautifully present...   
1  GPT-002  the ambience was cozy and inviting and the sta...   
2  GPT-003  i highly recommend trying the signature cockta...   
3  GPT-004  the staff was attentive and constantly refilli...   
4  GPT-005  the restaurant offered a wide selection of jui...   

          CATEGORY                             ASPECTS  \
0    FOOD, SERVICE                ['TASTE', 'GENERAL']   
1         AMBIENCE         ['ATMOSPHERE', 'BEHAVIOUR']   
2      DRINK, FOOD  ['PRESENTATION', 'RECOMMENDATION']   
3  AMBIENCE, STAFF         ['ATMOSPHERE', 'BEHAVIOUR']   
4             FOOD   ['MENU', 'PRESENTATION', 'TASTE']   

             ASPECT CATEGORY                             SENTIMENT

In [None]:
import numpy as np

# Fresh copy of the raw frame for another round of column guessing.
df = df_raw.copy()

cols = list(df.columns)
print("All columns:", cols)

# Same name-based heuristics as before, with wider keyword lists:
# aspect columns may now also be matched through "category".
text_candidates = [
    name for name in cols
    if name.lower() in ("sentence", "review", "text", "comment", "review_text")
]
aspect_candidates = [
    name for name in cols
    if any(keyword in name.lower() for keyword in ("aspect", "category"))
]
polarity_candidates = [
    name for name in cols
    if any(keyword in name.lower() for keyword in ("polarity", "sentiment", "label", "opinion"))
]

print("\nText candidates    :", text_candidates)
print("Aspect candidates  :", aspect_candidates)
print("Polarity candidates:", polarity_candidates)


All columns: ['ID', 'SENTENCE', 'CATEGORY', 'ASPECTS', 'ASPECT CATEGORY', 'SENTIMENT', 'ASPECTS_WITH_ASPECT_CATEGORY', 'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT']

Text candidates    : ['SENTENCE']
Aspect candidates  : ['CATEGORY', 'ASPECTS', 'ASPECT CATEGORY', 'ASPECTS_WITH_ASPECT_CATEGORY', 'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT']
Polarity candidates: ['SENTIMENT', 'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT']


In [None]:
import pandas as pd

# Column roles are fixed by hand here instead of being guessed.
text_col, aspect_col, polarity_col = "SENTENCE", "CATEGORY", "SENTIMENT"

print("Using:")
print(" sentence ->", text_col)
print(" aspect   ->", aspect_col)
print(" polarity ->", polarity_col)

# Start again from df_raw: rename to the standard names, keep the three needed
# columns, and drop rows with a missing value in any of them.
df = (
    df_raw.copy()
    .rename(columns={text_col: "sentence", aspect_col: "aspect", polarity_col: "polarity"})
    [["sentence", "aspect", "polarity"]]
    .dropna()
)

# Basic cleaning: everything as stripped strings, polarity additionally lower-cased.
df = df.assign(
    sentence=df["sentence"].astype(str).str.strip(),
    aspect=df["aspect"].astype(str).str.strip(),
    polarity=df["polarity"].astype(str).str.lower().str.strip(),
)

print("After cleaning, shape:", df.shape)
print("\nSample rows:")
print(df.head())
print("\nUnique polarity values:")
print(df["polarity"].value_counts(dropna=False))


Using:
 sentence -> SENTENCE
 aspect   -> CATEGORY
 polarity -> SENTIMENT
After cleaning, shape: (9699, 3)

Sample rows:
                                            sentence           aspect  \
0  the food was delicious and beautifully present...    FOOD, SERVICE   
1  the ambience was cozy and inviting and the sta...         AMBIENCE   
2  i highly recommend trying the signature cockta...      DRINK, FOOD   
3  the staff was attentive and constantly refilli...  AMBIENCE, STAFF   
4  the restaurant offered a wide selection of jui...             FOOD   

                               polarity  
0              ['positive', 'positive']  
1              ['positive', 'positive']  
2              ['positive', 'positive']  
3              ['positive', 'positive']  
4  ['positive', 'positive', 'positive']  

Unique polarity values:
polarity
['positive']                                                                5992
['positive', 'positive']                                                 

In [None]:
# Every distinct polarity string becomes its own class, in sorted order.
unique_labels = sorted(df["polarity"].unique())
print("Detected sentiment labels:", unique_labels)

# Class id = position of the polarity string in the sorted list.
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print("Label map:", label_map)

# Integer class id per row.
df["label"] = df["polarity"].map(label_map)

print("\nFinal df sample:")
_preview_columns = ["sentence", "aspect", "polarity", "label"]
print(df[_preview_columns].head())
print("Final shape:", df.shape)


Detected sentiment labels: ["['negative', 'negative', 'negative', 'negative']", "['negative', 'negative', 'negative']", "['negative', 'negative', 'neutral']", "['negative', 'negative', 'positive']", "['negative', 'negative']", "['negative', 'neutral', 'negative']", "['negative', 'neutral', 'positive', 'negative']", "['negative', 'neutral']", "['negative', 'positive', 'negative']", "['negative', 'positive', 'positive', 'positive']", "['negative', 'positive', 'positive']", "['negative', 'positive']", "['negative']", "['neutral', 'negative']", "['neutral', 'neutral']", "['neutral', 'positive']", "['neutral']", '[\'positive"\']', '[\'positive"345315\']', "['positive', 'negative', 'negative']", "['positive', 'negative', 'positive']", "['positive', 'negative']", "['positive', 'neutral', 'positive']", "['positive', 'neutral']", "['positive', 'positive', 'negative', 'positive']", "['positive', 'positive', 'negative']", "['positive', 'positive', 'neutral', 'positive']", "['positive', 'positive'

In [None]:
# Class distribution, once by polarity string and once by integer label.
for _header, _column in (("Polarity counts:", "polarity"), ("\nLabel counts:", "label")):
    print(_header)
    print(df[_column].value_counts())


Polarity counts:
polarity
['positive']                                                                5992
['positive', 'positive']                                                    1534
['negative']                                                                 772
['neutral']                                                                  649
['positive', 'negative']                                                     246
['positive', 'positive', 'positive']                                         215
['negative', 'negative']                                                      98
['negative', 'positive']                                                      37
['positive', 'positive', 'negative']                                          36
['positive', 'positive', 'positive', 'positive']                              24
['positive', 'neutral']                                                       23
['positive', 'negative', 'positive']                                           8
['

In [None]:
# Labels that occur fewer than 2 times are treated as rare.
label_counts = df["label"].value_counts()
rare_labels = label_counts.index[(label_counts < 2).to_numpy()]
print("Rare labels (will be dropped):", list(rare_labels))

# Keep only rows whose label is not rare, and renumber the index from 0.
df = df.loc[~df["label"].isin(rare_labels)].reset_index(drop=True)

print("After dropping rare labels, shape:", df.shape)
print(df["polarity"].value_counts())


Rare labels (will be dropped): [10, 24, 17, 27, 5, 2, 26, 18, 9, 6, 0]
After dropping rare labels, shape: (9688, 4)
polarity
['positive']                                                                5992
['positive', 'positive']                                                    1534
['negative']                                                                 772
['neutral']                                                                  649
['positive', 'negative']                                                     246
['positive', 'positive', 'positive']                                         215
['negative', 'negative']                                                      98
['negative', 'positive']                                                      37
['positive', 'positive', 'negative']                                          36
['positive', 'positive', 'positive', 'positive']                              24
['positive', 'neutral']                                          

In [None]:
# Recompute the class ids from the polarities that are still present,
# so the ids are contiguous again.
unique_labels = sorted(df["polarity"].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print("New label_map:", label_map)

# Replace the old numeric labels with the renumbered ones.
df["label"] = df["polarity"].map(label_map)

print("New label counts:")
print(df["label"].value_counts())


New label_map: {"['negative', 'negative', 'negative']": 0, "['negative', 'negative', 'positive']": 1, "['negative', 'negative']": 2, "['negative', 'neutral']": 3, "['negative', 'positive', 'negative']": 4, "['negative', 'positive']": 5, "['negative']": 6, "['neutral', 'negative']": 7, "['neutral', 'neutral']": 8, "['neutral', 'positive']": 9, "['neutral']": 10, "['positive', 'negative', 'negative']": 11, "['positive', 'negative', 'positive']": 12, "['positive', 'negative']": 13, "['positive', 'neutral', 'positive']": 14, "['positive', 'neutral']": 15, "['positive', 'positive', 'negative']": 16, "['positive', 'positive', 'positive', 'negative']": 17, "['positive', 'positive', 'positive', 'positive', 'positive', 'positive']": 18, "['positive', 'positive', 'positive', 'positive', 'positive']": 19, "['positive', 'positive', 'positive', 'positive']": 20, "['positive', 'positive', 'positive']": 21, "['positive', 'positive']": 22, "['positive']": 23}
New label counts:
label
23    5992
22    1

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed, stratified on the class label.
_model_frame = df[["sentence", "aspect", "label"]]
train_df, val_df = train_test_split(
    _model_frame,
    stratify=_model_frame["label"],
    random_state=42,
    test_size=0.2,
)

train_df.shape, val_df.shape


((7750, 3), (1938, 3))

In [None]:
from datasets import Dataset
from transformers import BertTokenizerFast

model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# pandas -> Hugging Face datasets, each split with a fresh 0..n-1 index
train_dataset, val_dataset = (
    Dataset.from_pandas(_frame.reset_index(drop=True)) for _frame in (train_df, val_df)
)

def tokenize_fn(example):
    """Encode sentence/aspect pairs for BERT.

    The sentence is passed as the first segment and the aspect as the second;
    every encoding is truncated or padded to exactly 128 tokens.
    """
    encoding_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["sentence"], example["aspect"], **encoding_options)

# Tokenize in batches and rename 'label' -> 'labels' for the Trainer.
train_tokenized = train_dataset.map(tokenize_fn, batched=True).rename_column("label", "labels")
val_tokenized = val_dataset.map(tokenize_fn, batched=True).rename_column("label", "labels")

# Return the listed columns as PyTorch tensors.
for _split in (train_tokenized, val_tokenized):
    _split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
    )

train_tokenized[0]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/7750 [00:00<?, ? examples/s]

Map:   0%|          | 0/1938 [00:00<?, ? examples/s]

{'labels': tensor(23),
 'input_ids': tensor([  101,  1996,  2311,  1055, 23788,  2024,  2200,  9561,  1998,  4867,
           102,  4825,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0, 

In [None]:
# Derive the label mapping again from the cleaned dataframe df.
unique_labels = sorted(df["polarity"].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))

print("Detected labels:", unique_labels)
print("label_map:", label_map)

# Keep df['label'] in sync with this mapping.
df["label"] = df["polarity"].map(label_map)

_label_distribution = df["label"].value_counts()
print(_label_distribution)


Detected labels: ["['negative', 'negative', 'negative']", "['negative', 'negative', 'positive']", "['negative', 'negative']", "['negative', 'neutral']", "['negative', 'positive', 'negative']", "['negative', 'positive']", "['negative']", "['neutral', 'negative']", "['neutral', 'neutral']", "['neutral', 'positive']", "['neutral']", "['positive', 'negative', 'negative']", "['positive', 'negative', 'positive']", "['positive', 'negative']", "['positive', 'neutral', 'positive']", "['positive', 'neutral']", "['positive', 'positive', 'negative']", "['positive', 'positive', 'positive', 'negative']", "['positive', 'positive', 'positive', 'positive', 'positive', 'positive']", "['positive', 'positive', 'positive', 'positive', 'positive']", "['positive', 'positive', 'positive', 'positive']", "['positive', 'positive', 'positive']", "['positive', 'positive']", "['positive']"]
label_map: {"['negative', 'negative', 'negative']": 0, "['negative', 'negative', 'positive']": 1, "['negative', 'negative']": 

In [None]:
from sklearn.model_selection import train_test_split

# Repeat the 80/20 stratified split (seed 42) on the re-labelled frame.
_split_input = df[['sentence', 'aspect', 'label']]
train_df, val_df = train_test_split(
    _split_input,
    stratify=_split_input['label'],
    random_state=42,
    test_size=0.2,
)

train_df.shape, val_df.shape


((7750, 3), (1938, 3))

In [None]:
# Number of classes = number of entries in the label mapping.
num_labels = len(label_map)
print("num_labels:", num_labels, " | label_map:", label_map)


num_labels: 24  | label_map: {"['negative', 'negative', 'negative']": 0, "['negative', 'negative', 'positive']": 1, "['negative', 'negative']": 2, "['negative', 'neutral']": 3, "['negative', 'positive', 'negative']": 4, "['negative', 'positive']": 5, "['negative']": 6, "['neutral', 'negative']": 7, "['neutral', 'neutral']": 8, "['neutral', 'positive']": 9, "['neutral']": 10, "['positive', 'negative', 'negative']": 11, "['positive', 'negative', 'positive']": 12, "['positive', 'negative']": 13, "['positive', 'neutral', 'positive']": 14, "['positive', 'neutral']": 15, "['positive', 'positive', 'negative']": 16, "['positive', 'positive', 'positive', 'negative']": 17, "['positive', 'positive', 'positive', 'positive', 'positive', 'positive']": 18, "['positive', 'positive', 'positive', 'positive', 'positive']": 19, "['positive', 'positive', 'positive', 'positive']": 20, "['positive', 'positive', 'positive']": 21, "['positive', 'positive']": 22, "['positive']": 23}


In [None]:
import pandas as pd
import os, ast

# NOTE(review): DATA_DIR is hard-coded again here with the same value that
# os.path.join(BASE_DIR, "data") produced earlier — presumably so this cell can run
# on its own after a restart; confirm.
DATA_DIR = "/content/drive/MyDrive/absa_project/data"
CSV_PATH = os.path.join(DATA_DIR, "Processed_Annotated_Data_Final.csv")

# Reload the raw CSV and display its shape and column index.
df_raw = pd.read_csv(CSV_PATH)
df_raw.shape, df_raw.columns


((9699, 8),
 Index(['ID', 'SENTENCE', 'CATEGORY', 'ASPECTS', 'ASPECT CATEGORY', 'SENTIMENT',
        'ASPECTS_WITH_ASPECT_CATEGORY',
        'ASPECTS_WITH_ASPECT_CATEGORY_AND_SENTIMENT'],
       dtype='object'))

In [None]:
import ast

def parse_list(x):
    """Turn a cell value into a Python list.

    Parameters
    ----------
    x : Any
        Either a real list/tuple, or a string holding a Python literal such as
        "['TASTE', 'GENERAL']".

    Returns
    -------
    list
        The parsed items. An empty list is returned when ``x`` is not a
        string/list/tuple (e.g. NaN or None), when the string is not a valid
        literal, or when the literal is not a list or tuple (e.g. ``"'FOOD'"``
        or ``"5"``). Returning a scalar in that last case would make callers
        that ``zip()`` the result iterate over characters or fail outright.
    """
    if isinstance(x, (list, tuple)):
        return list(x)
    if not isinstance(x, str):
        return []

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval raises on malformed or oversized input;
        # parsing is best-effort, so such cells simply contribute no items.
        return []

    # Only ordered sequences are meaningful here; anything else counts as "no items".
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

rows = []

# Walk the three relevant columns in lockstep, one source row per iteration.
for _raw_sentence, _raw_aspects, _raw_sentiments in zip(
    df_raw["SENTENCE"], df_raw["ASPECTS"], df_raw["SENTIMENT"]
):
    sentence = str(_raw_sentence).strip()
    aspects = parse_list(_raw_aspects)
    sentiments = parse_list(_raw_sentiments)

    # One output record per (aspect, sentiment) pair. zip() pairs items by position
    # and stops at the shorter list; pairs containing None are skipped.
    rows.extend(
        {
            "sentence": sentence,
            "aspect": str(asp).strip(),
            "polarity": str(pol).strip().lower(),
        }
        for asp, pol in zip(aspects, sentiments)
        if asp is not None and pol is not None
    )

df = pd.DataFrame(rows)
df.shape, df.head()


((12351, 3),
                                             sentence        aspect  polarity
 0  the food was delicious and beautifully present...         TASTE  positive
 1  the food was delicious and beautifully present...       GENERAL  positive
 2  the ambience was cozy and inviting and the sta...    ATMOSPHERE  positive
 3  the ambience was cozy and inviting and the sta...     BEHAVIOUR  positive
 4  i highly recommend trying the signature cockta...  PRESENTATION  positive)

In [None]:
# Normalize the text columns: cast to str and trim surrounding whitespace.
for _text_column in ("sentence", "aspect"):
    df[_text_column] = df[_text_column].astype(str).str.strip()
# Polarity is additionally lower-cased before trimming.
df["polarity"] = df["polarity"].astype(str).str.lower().str.strip()

print("Shape:", df.shape)
_polarity_distribution = df["polarity"].value_counts()
print(_polarity_distribution)


Shape: (12351, 3)
polarity
positive           10276
negative            1362
neutral              711
positive"              1
positive"345315        1
Name: count, dtype: int64


In [None]:
# Map each distinct polarity string to an integer id (ids follow sorted order).
unique_labels = sorted(df["polarity"].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))

print("unique_labels:", unique_labels)
print("label_map:", label_map)

df["label"] = df["polarity"].map(label_map)
_preview_columns = ["sentence", "aspect", "polarity", "label"]
df[_preview_columns].head()


unique_labels: ['negative', 'neutral', 'positive', 'positive"', 'positive"345315']
label_map: {'negative': 0, 'neutral': 1, 'positive': 2, 'positive"': 3, 'positive"345315': 4}


Unnamed: 0,sentence,aspect,polarity,label
0,the food was delicious and beautifully present...,TASTE,positive,2
1,the food was delicious and beautifully present...,GENERAL,positive,2
2,the ambience was cozy and inviting and the sta...,ATMOSPHERE,positive,2
3,the ambience was cozy and inviting and the sta...,BEHAVIOUR,positive,2
4,i highly recommend trying the signature cockta...,PRESENTATION,positive,2


In [None]:
# Labels seen fewer than 2 times are removed before the stratified split.
label_counts = df["label"].value_counts()
rare_labels = label_counts.index[(label_counts < 2).to_numpy()]
print("Dropped rare labels (if any):", list(rare_labels))

df = df.loc[~df["label"].isin(rare_labels)].reset_index(drop=True)

# Renumber the surviving polarities so the class ids are contiguous from 0.
unique_labels = sorted(df["polarity"].unique())
label_map = dict(zip(unique_labels, range(len(unique_labels))))
df["label"] = df["polarity"].map(label_map)

print("Final label_map:", label_map)
print(df["label"].value_counts())


Dropped rare labels (if any): [3, 4]
Final label_map: {'negative': 0, 'neutral': 1, 'positive': 2}
label
2    10276
0     1362
1      711
Name: count, dtype: int64


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split of the per-aspect rows (seed 42), stratified on the label.
_pair_frame = df[["sentence", "aspect", "label"]]
train_df, val_df = train_test_split(
    _pair_frame,
    stratify=_pair_frame["label"],
    random_state=42,
    test_size=0.2,
)

train_df.shape, val_df.shape


((9879, 3), (2470, 3))

In [None]:
# Upgrade transformers to the latest release.
# NOTE(review): transformers was already imported earlier in this session, so the runtime
# may need a restart before the upgraded version is the one in use — confirm.
!pip install -U "transformers"


Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.2
    Uninstalling transformers-4.57.2:
      Successfully uninstalled transformers-4.57.2
Successfully installed transformers-4.57.3


In [None]:
from datasets import Dataset
from transformers import BertTokenizerFast

model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# pandas -> Hugging Face datasets, each split re-indexed from 0
train_dataset, val_dataset = (
    Dataset.from_pandas(_part.reset_index(drop=True)) for _part in (train_df, val_df)
)

def tokenize_fn(example):
    """Encode sentence/aspect pairs for BERT.

    The sentence goes in as the first segment and the aspect as the second;
    each encoding is truncated or padded to exactly 128 tokens.
    """
    pair_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["sentence"], example["aspect"], **pair_options)

# Batched tokenization, then 'label' -> 'labels' as expected by the Trainer.
train_tokenized = train_dataset.map(tokenize_fn, batched=True).rename_column("label", "labels")
val_tokenized = val_dataset.map(tokenize_fn, batched=True).rename_column("label", "labels")

# Return the listed columns as PyTorch tensors.
for _tokenized in (train_tokenized, val_tokenized):
    _tokenized.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
    )

train_tokenized[0]


Map:   0%|          | 0/9879 [00:00<?, ? examples/s]

Map:   0%|          | 0/2470 [00:00<?, ? examples/s]

{'labels': tensor(2),
 'input_ids': tensor([  101,  2256, 15610, 11333,  2006,  2391,  2007,  2673,   102,  3325,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,  

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# The classification head gets one output per entry in label_map.
num_labels = len(label_map)
print("num_labels:", num_labels, " | label_map:", label_map)

# Load the pretrained checkpoint named by model_name with a sequence-classification head.
# The head weights are newly initialized (see the loader's message), so the model must be trained.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    The predicted class of each example is the argmax over the last axis of
    the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["macro_f1"] = f1_score(gold, predicted, average="macro")
    return results

from transformers import TrainingArguments

# Training configuration: 3 epochs, lr 2e-5, batch size 16 (train) / 32 (eval),
# weight decay 0.01, loss logged every 50 steps.
training_args = TrainingArguments(
    output_dir="/content/absa_bert_output",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    # Switch off all experiment trackers (W&B, MLflow, ...) right here. The reporting
    # integrations are resolved when TrainingArguments/Trainer are constructed, so
    # environment variables set in a later cell come too late to disable them.
    report_to="none",
    # no evaluation_strategy, save_strategy, load_best_model_at_end etc.
)


# The Trainer ties together model, data and metrics. eval_dataset and compute_metrics
# are used when trainer.evaluate() is called.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
)


num_labels: 3  | label_map: {'negative': 0, 'neutral': 1, 'positive': 2}


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os
# Opt out of third-party experiment trackers for this session by setting
# their documented environment switches in one go.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "WANDB_MODE": "disabled",
        "DISABLE_MLFLOW_INTEGRATION": "true",
    }
)


In [None]:
# Fine-tune the model with the Trainer configured earlier; the training loss
# is reported every `logging_steps` (50) steps.
trainer.train()


  | |_| | '_ \/ _` / _` |  _/ -_)


Step,Training Loss
50,0.6221
100,0.3909
150,0.3303


In [None]:
# Evaluate on the Trainer's eval_dataset (val_tokenized); compute_metrics adds
# accuracy and macro-F1. The bare name on the last line displays the result.
metrics = trainer.evaluate()
metrics


In [None]:
import os, json

# make sure MODEL_DIR exists, if not:
# MODEL_DIR = "/content/drive/MyDrive/absa_project/models"

# Destination folder for the fine-tuned artifacts, located under MODEL_DIR.
SAVE_DIR = os.path.join(MODEL_DIR, "absa_bert_model")
os.makedirs(SAVE_DIR, exist_ok=True)

# save model + tokenizer from Trainer
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

# save label_map
# Stored next to the weights so the inference cell can rebuild the
# id -> label-name mapping from the same folder.
label_map_path = os.path.join(SAVE_DIR, "label_map.json")
with open(label_map_path, "w") as f:
    json.dump(label_map, f)

# Last expression of the cell: echo both paths as the cell output.
SAVE_DIR, label_map_path


In [None]:
from typing import List

def clean_chunk(text: str) -> str:
    """Normalise a noun chunk: lower-case, trim, and drop leading articles.

    The articles "the ", "a " and "an " are tried once each, in that order,
    against the progressively shortened string, so "the a box" becomes "box"
    while "a the box" becomes "the box".

    Args:
        text: Raw chunk text.

    Returns:
        The cleaned chunk text.
    """
    cleaned = text.lower().strip()
    for prefix in ("the ", "a ", "an "):
        cleaned = cleaned[len(prefix):] if cleaned.startswith(prefix) else cleaned
    return cleaned

# Aspects we never want (pure pronouns etc.)
# Compared against the lower-cased text of a whole candidate chunk, so an
# entry only blocks a chunk that consists of exactly that word.
ASPECT_STOP_WORDS = {
    "it", "this", "that", "these", "those",
    "they", "them", "he", "she", "we", "you", "i"
}

# Whole words that mark a noun chunk as a time expression rather than an aspect.
_TIME_LIKE_WORDS = frozenset(
    {"month", "months", "day", "days", "year", "years", "time", "times"}
)


def extract_candidate_aspects(sentence: str):
    """Return candidate aspect terms found in ``sentence``.

    Candidates are spaCy noun chunks with determiners and pronouns removed,
    lower-cased, and filtered so that only short, content-bearing phrases
    remain.

    Args:
        sentence: A single sentence of review text.

    Returns:
        list[str]: Unique aspect strings, in order of first appearance.
        Empty when no chunk survives the filters.
    """
    doc = nlp(sentence)
    aspects = []

    for chunk in doc.noun_chunks:
        # Remove determiners and pronouns from the chunk:
        # e.g. "the dress" -> ["dress"], "this phone" -> ["phone"]
        content_tokens = [t for t in chunk if t.pos_ not in {"DET", "PRON"}]

        # If nothing left after removing DET/PRON, skip
        if not content_tokens:
            continue

        # Build aspect text from remaining tokens
        text = " ".join(t.text for t in content_tokens).strip().lower()

        # Skip empty or pure stop/aspect-stop words
        if not text or text in ASPECT_STOP_WORDS:
            continue

        # Skip chunks that are obviously numeric
        if any(t.like_num for t in content_tokens):
            continue

        # Skip time-like chunks. Match whole words only: a substring test
        # would also reject valid aspects such as "battery lifetime",
        # "holiday package" or "timer".
        words = text.split()
        if any(word in _TIME_LIKE_WORDS for word in words):
            continue

        # Keep only short phrases (1-3 words), to allow "battery life" etc.
        if not (1 <= len(words) <= 3):
            continue

        aspects.append(text)

    # De-duplicate while preserving order (dicts keep insertion order).
    return list(dict.fromkeys(aspects))


# quick check
# Smoke test on a two-clause sentence; the returned list is shown as the cell output.
extract_candidate_aspects("The light is not working properly but the stand looks pretty.")


In [None]:
import re

# verbs that really carry sentiment
# NOTE(review): nothing in the cells visible here reads this set;
# get_opinion_phrase uses its own local `good_verbs` instead. Confirm whether
# the two lists are meant to be the same.
SENTIMENT_VERBS = {
    "love", "like", "dislike", "hate", "enjoy",
    "recommend", "prefer", "admire", "appreciate",
    "annoy", "disappoint", "suck", "rock"
}

def get_opinion_phrase(sentence: str, aspect: str) -> str:
    """Extract a short opinion phrase for ``aspect`` from ``sentence``.

    The sentence is split into clauses on commas, semicolons and the words
    " and " / " but ". The first clause that mentions the aspect
    (case-insensitive substring match) is analysed; if no clause mentions it,
    the whole sentence is analysed instead.

    Within that clause:
      * if a sentiment-bearing verb (see ``good_verbs``) is reached, the
        function returns immediately with that verb plus its prepositional,
        particle and direct-object children (and their children);
      * otherwise all alphabetic adjectives and adverbs are joined with
        spaces, in sentence order.

    Args:
        sentence: The sentence that is being analysed.
        aspect: The aspect term to look for.

    Returns:
        The opinion phrase, or "" when the clause contains no opinion words.
    """
    aspect_low = aspect.lower()

    # clause split
    clauses = re.split(r'\s*(?:,| and | but |;)\s*', sentence)
    # First clause mentioning the aspect; fall back to the full sentence.
    target_clause = next((c for c in clauses if aspect_low in c.lower()), sentence)

    doc = nlp(target_clause)

    opinion_pos = {"ADJ", "ADV"}
    # Compared against lemmas, so the base forms are the entries that matter.
    good_verbs = {"struggle", "struggles", "love", "hate", "enjoy", "recommend"}

    opinion_tokens = []

    for tok in doc:
        if tok.pos_ in opinion_pos and tok.is_alpha:
            opinion_tokens.append(tok)
        elif tok.pos_ == "VERB" and tok.lemma_ in good_verbs:
            # include verb + following modifiers; this takes precedence over
            # any adjectives/adverbs collected so far.
            span = [tok.text]
            for child in tok.children:
                if child.dep_ in {"prep", "prt", "dobj"}:
                    span.append(child.text)
                    span.extend(gchild.text for gchild in child.children)
            return " ".join(span)

    # multi-adjective phrase ("" when nothing was collected)
    return " ".join(tok.text for tok in opinion_tokens)


In [None]:
# Contrastive example: each aspect sits in its own clause around "but", so the
# two calls should pick up different opinion words.
s = "The battery life is absolutely terrible but the screen is really beautiful."
print("Battery:", get_opinion_phrase(s, "battery life"))
print("Screen :", get_opinion_phrase(s, "screen"))


In [None]:
import torch, json
from transformers import BertTokenizerFast, BertForSequenceClassification
import os

# Folder the fine-tuned model, tokenizer and label map are read from.
LOAD_DIR = os.path.join(MODEL_DIR, "absa_bert_model")

# Reload tokenizer and model from disk. This rebinds the notebook-level
# `tokenizer` and `model` names, so the inference cells below use the saved
# checkpoint rather than the in-memory training objects.
tokenizer = BertTokenizerFast.from_pretrained(LOAD_DIR)
model = BertForSequenceClassification.from_pretrained(LOAD_DIR)

with open(os.path.join(LOAD_DIR, "label_map.json"), "r") as f:
    label_map_loaded = json.load(f)

# Invert name -> id into id -> name for decoding predictions.
id2label = {v: k for k, v in label_map_loaded.items()}

# Use the GPU when one is available, and switch to inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Last expression of the cell: show the decoded mapping.
id2label


In [None]:
def predict_aspect_sentiment(sentence: str, aspect: str):
    """Classify the sentiment expressed towards ``aspect`` in ``sentence``.

    The sentence and the aspect are encoded together as a text pair, padded
    or truncated to 128 tokens, and run through the fine-tuned classifier
    without gradient tracking.

    Args:
        sentence: Sentence that provides the context.
        aspect: Aspect term to classify.

    Returns:
        tuple[str, float]: The predicted label name and its softmax
        probability.
    """
    batch = tokenizer(
        sentence,
        aspect,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # Move every input tensor onto the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        probabilities = torch.softmax(model(**batch).logits, dim=-1)

    best_id = torch.argmax(probabilities, dim=-1).item()
    best_prob = probabilities[0, best_id].item()
    return id2label[best_id], float(best_prob)

# sanity check
# The (label, confidence) tuple is shown as the cell output.
predict_aspect_sentiment(
    "The battery life is terrible but the screen is beautiful.",
    "battery life"
)


In [None]:
# Pronouns that let a sentence without its own aspect inherit the aspects of
# an earlier sentence in absa_pipeline. Matched against lower-cased token text.
PRONOUNS = {"it", "they", "them", "its", "their"}
# List of real sentiment adjectives/verbs commonly found in review data
# Used by contains_sentiment_word to decide whether a sentence carries any
# sentiment at all.
SENTIMENT_LEXICON = {
    "good","great","bad","terrible","amazing","awesome","poor","excellent",
    "slow","weak","beautiful","tasty","horrible","dirty","clean","bright",
    "sharp","fit","fits","fitting","happy","sad","disappointing","recommend",
    "love","hate","enjoy","awful"
}

def contains_sentiment_word(text):
    """Return True if any token of ``text`` is in SENTIMENT_LEXICON.

    Both the lower-cased lemma and the lower-cased surface form are checked.
    The lexicon contains inflected entries ("fits", "fitting",
    "disappointing"); when the lemmatizer reduces such a token to its base
    form, a lemma-only comparison would never match those entries.

    Args:
        text: Text to scan.

    Returns:
        bool: Whether at least one lexicon word occurs in ``text``.
    """
    return any(
        tok.lemma_.lower() in SENTIMENT_LEXICON or tok.lower_ in SENTIMENT_LEXICON
        for tok in nlp(text)
    )


def extract_opinion_words(sent_text):
    """Collect the adjectives and adverbs of a sentence.

    Args:
        sent_text: Sentence text to parse.

    Returns:
        list[str]: Surface text of every token tagged ADJ or ADV, in order.
    """
    found = []
    for token in nlp(sent_text):
        if token.pos_ == "ADJ" or token.pos_ == "ADV":
            found.append(token.text)
    return found


def absa_pipeline(review: str):
    """Run aspect extraction, sentiment classification and opinion mining.

    The review is split into sentences. For each sentence:
      * if it contains aspect candidates, each one is classified and the list
        is remembered for later sentences;
      * otherwise, if it contains a pronoun from PRONOUNS, at least one
        adjective/adverb, and aspects were remembered, those aspects are
        classified against this sentence instead;
      * otherwise the sentence contributes nothing.

    A prediction is overridden to ("neutral", 0.5) when no opinion phrase is
    found and either the aspects were inherited or the sentence contains no
    word from SENTIMENT_LEXICON.

    Args:
        review: Free-text review, possibly several sentences long.

    Returns:
        list[dict]: One record per (sentence, aspect) pair with the keys
        "sentence", "aspect", "sentiment", "confidence", "opinion_phrase".
    """
    records = []
    remembered = []  # aspects carried over from the latest sentence that had any

    for span in nlp(review).sents:
        sentence = span.text.strip()
        if not sentence:
            continue

        found = extract_candidate_aspects(sentence)
        opinions = extract_opinion_words(sentence)
        has_pronoun = any(tok.text.lower() in PRONOUNS for tok in nlp(sentence))

        if found:
            # The sentence names its own aspects: refresh the memory.
            remembered = list(found)
            targets, inherited = found, False
        elif has_pronoun and opinions and remembered:
            # Pronoun + opinion but no aspect: reuse the remembered aspects.
            targets, inherited = remembered, True
        else:
            continue

        for aspect in targets:
            label, score = predict_aspect_sentiment(sentence, aspect)
            phrase = get_opinion_phrase(sentence, aspect)

            # Post-process override. Inherited aspects fall back to neutral as
            # soon as no opinion phrase is found; explicit aspects only when the
            # sentence also lacks any lexicon sentiment word.
            if not phrase and (inherited or not contains_sentiment_word(sentence)):
                label, score = "neutral", 0.5

            records.append({
                "sentence": sentence,
                "aspect": aspect,
                "sentiment": label,
                "confidence": round(score, 3),
                "opinion_phrase": phrase
            })

    return records



# Two-sentence example with several aspects; the list of result records is
# shown as the cell output.
example = "The battery life is terrible but the screen and speakers are amazing for this price. The notebook looks very pretty."
absa_pipeline(example)


In [None]:
def print_absa(sentence: str):
    """Pretty-print the ABSA results for one review.

    Prints the review, then one block per detected aspect (aspect, sentiment
    with confidence, and the opinion span when there is one), each followed
    by a divider line. Prints "No aspects found." when the pipeline returns
    nothing.

    Args:
        sentence: Review text to analyse.
    """
    print(f"Review: {sentence}\n")
    rows = absa_pipeline(sentence)
    divider = "-" * 40

    if rows:
        for row in rows:
            print(f"Aspect      : {row['aspect']}")
            print(f"Sentiment   : {row['sentiment']} (conf={row['confidence']})")
            if row['opinion_phrase']:
                print(f"Opinion span: {row['opinion_phrase']}")
            print(divider)
    else:
        print("No aspects found.")

# Two demo reviews: one with three aspects joined by "but"/"and", one whose
# opinion is carried by a verb ("love").
print_absa("The food was amazing but the service was slow and the ambiance was average.")
print_absa("I love the camera but the battery drains too fast.")


In [None]:
!pip install gradio


In [None]:
import gradio as gr
import pandas as pd

def analyze_review(review: str):
    """Gradio callback: analyse a review and format the results.

    Args:
        review: Text typed into the review box. ``None`` is treated like an
            empty string.

    Returns:
        tuple[str, pandas.DataFrame]: A Markdown summary (or a short status
        message) and a table with the columns "sentence", "aspect",
        "sentiment", "confidence", "opinion_phrase". The table always has
        these five columns, also when it is empty, so it matches the headers
        of the results table in every case.
    """
    columns = ["sentence", "aspect", "sentiment", "confidence", "opinion_phrase"]

    review = (review or "").strip()
    if not review:
        return "Please enter a product review.", pd.DataFrame(columns=columns)

    results = absa_pipeline(review)  # uses your existing function

    if not results:
        return "No aspects found in the review.", pd.DataFrame(columns=columns)

    # Build a simple text summary: one Markdown bullet per (sentence, aspect) record
    lines = []
    for r in results:
        aspect = r.get("aspect", "")
        sent   = r.get("sentiment", "")
        conf   = r.get("confidence", 0)
        opin   = r.get("opinion_phrase", "")
        line = f"- **{aspect}** → **{sent}** (conf={conf})"
        if opin:
            line += f" &nbsp; · &nbsp; _{opin}_"
        lines.append(line)
    summary = "\n".join(lines)

    # Convert to DataFrame for nice table
    df_results = pd.DataFrame(results)
    # Ensure every expected column exists, then fix the column order
    for col in columns:
        if col not in df_results.columns:
            df_results[col] = ""
    df_results = df_results[columns]

    return summary, df_results


# Stylesheet passed to gr.Blocks(css=...): centres the #title header, styles
# the .absa-card info box and defines the .small-muted helper text class.
custom_css = """
#title {
    text-align: center;
    padding-top: 0.5rem;
    padding-bottom: 0.5rem;
}
.absa-card {
    background: radial-gradient(circle at top left, #fdf2ff, #eef9ff);
    border-radius: 14px;
    padding: 1rem 1.25rem;
    border: 1px solid rgba(0,0,0,0.06);
    box-shadow: 0 8px 18px rgba(15, 23, 42, 0.06);
}
.small-muted {
    font-size: 0.9rem;
    color: #6b7280;
}
"""

# Styled demo UI: a header, a left column with the input box, the analyze
# button, the outputs and clickable examples, and a right column with two
# explanatory tabs.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Header
    # NOTE(review): custom_css defines no ".absa-header" rule, so the
    # elem_classes value below has no styling attached to it in this cell.
    gr.Markdown(
        """
<div id="title">
  <h1>🧠 Aspect-Based Sentiment Analysis (ABSA)</h1>
  <p class="small-muted">
    Analyze reviews at a finer level: instead of one sentiment for the whole text,
    ABSA finds <b>aspects</b> (battery, screen, service, ambience...) and tells you
    how the user feels about <i>each</i> of them, with a short opinion phrase extracted from context.
  </p>
</div>
        """,
        elem_classes="absa-header"
    )

    with gr.Row():
        # Left: main interaction
        with gr.Column(scale=3):
            gr.Markdown(
                """
<div class="absa-card">
  <h3>🔍 Try it out</h3>
  <p class="small-muted">
    Paste a review below. The system will:
    <ul>
      <li>Detect important aspects in the text (e.g., <em>battery life</em>, <em>screen</em>, <em>service</em>)</li>
      <li>Predict sentiment for each aspect (positive / neutral / negative)</li>
      <li>Extract a short opinion phrase that explains <em>why</em> that sentiment was assigned</li>
    </ul>
  </p>
</div>
                """
            )

            review_input = gr.Textbox(
                lines=5,
                label="Enter your review",
                placeholder="Example: The battery life is terrible but the screen and speakers are amazing for this price.",
            )

            analyze_button = gr.Button("⚙️ Analyze review", variant="primary")

            summary_output = gr.Markdown(label="Per-aspect analysis")

            # Headers mirror the column order that analyze_review produces.
            table_output = gr.Dataframe(
                headers=["sentence", "aspect", "sentiment", "confidence", "opinion_phrase"],
                label="Detailed results table",
                wrap=True
            )

            # Wire the button: the review text goes in, (summary, table) come out.
            analyze_button.click(
                fn=analyze_review,
                inputs=review_input,
                outputs=[summary_output, table_output]
            )

            # Clicking an example fills review_input with that text.
            gr.Examples(
                examples=[
                    "The battery life is terrible but the screen and speakers are amazing for this price.",
                    "The food was tasty, the service was slow, and the ambience was horrible.",
                    "This laptop has a bright screen and a very comfortable keyboard, but the touchpad is awful.",
                    "I bought shoes. They fit well and look great.",
                ],
                inputs=review_input,
                label="Quick examples (click to load)"
            )

        # Right: explanation / about
        with gr.Column(scale=2):
            with gr.Tab("📘 About ABSA"):
                gr.Markdown(
                    """
### What is Aspect-Based Sentiment Analysis?

Traditional sentiment analysis gives you a **single sentiment** for an entire review:

> “The phone is great but the camera is bad.” → *overall: mixed / neutral?*

ABSA goes deeper. It breaks the text into **aspects** and analyzes each one:

- **phone** → positive
- **camera** → negative

In this project, we extend classic ABSA with **contextual opinion mining**:

- We use a fine-tuned **BERT** model to classify sentiment for each *(sentence, aspect)* pair.
- We use **spaCy** to:
  - extract aspect candidates (noun phrases),
  - find opinion words around the aspect,
  - handle contrast markers like <code>but</code>,
  - and pull out short phrases such as “tasty”, “slow”, or “struggles in low light”.

This makes the analysis more interpretable for humans:
you don’t just see “negative”, you see **why** it’s negative.
                    """
                )

            with gr.Tab("ℹ️ How to use"):
                gr.Markdown(
                    """
1. ✍️ **Paste any review** in the text box on the left.
2. ⚙️ Click **“Analyze review”**.
3. 👀 Check:
   - The **summary** to quickly see each aspect and its sentiment.
   - The **table** to inspect aspect, sentence, sentiment, confidence, and opinion phrase.
4. 🔁 Try multiple examples: products, restaurants, apps, services.

You can include multi-sentence reviews with mixed opinions, like:

> "The sweets looked good but they tasted terrible. The packaging was beautiful."

The model will:
- flag **sweets** as negative (tasted terrible),
- flag **packaging** as positive (beautiful).
                    """
                )

# share=True requests a public link for the app.
# NOTE(review): with debug=True this call is expected to keep the cell running
# until it is interrupted — confirm before relying on "Run all".
demo.launch(share=True, debug=True)


In [None]:
import gradio as gr
import pandas as pd

def analyze_review(review: str):
    """Gradio callback for the plain demo: analyse a review.

    Args:
        review: Text typed into the review box.

    Returns:
        tuple[str, pandas.DataFrame]: A Markdown summary and a table built
        from the pipeline records. For blank input or when no aspect is
        found, a status message and a DataFrame without any columns.
    """
    text = review.strip()
    if not text:
        return "Please enter a product review.", pd.DataFrame()

    records = absa_pipeline(text)  # uses your existing function
    if not records:
        return "No aspects found in the review.", pd.DataFrame()

    def describe(rec):
        # One Markdown bullet per record; the opinion phrase is appended only
        # when it is present and non-empty.
        bullet = f"- Aspect: **{rec['aspect']}** → Sentiment: **{rec['sentiment']}** (conf={rec['confidence']})"
        if rec.get("opinion_phrase"):
            bullet += f" | Opinion phrase: _{rec['opinion_phrase']}_"
        return bullet

    summary = "\n".join(describe(rec) for rec in records)

    # Convert to DataFrame for nice table
    return summary, pd.DataFrame(records)


In [None]:
# Plain version of the demo UI: one text box, one button, a Markdown summary
# and a results table.
with gr.Blocks() as demo:
    gr.Markdown("# Aspect-Based Sentiment Analysis (ABSA) Demo")
    gr.Markdown(
        "Paste a review below. The model will extract aspects, predict sentiment "
        "for each aspect, and highlight short opinion phrases from the context."
    )

    review_input = gr.Textbox(
        lines=5,
        label="Enter your review",
        placeholder="Example: The battery life is terrible but the screen and speakers are amazing for this price."
    )

    analyze_button = gr.Button("Analyze")

    summary_output = gr.Markdown(label="Analysis Summary")
    # Every record produced by absa_pipeline carries a "sentence" key as well,
    # so the declared headers list all five columns of the returned table.
    table_output = gr.Dataframe(
        headers=["sentence", "aspect", "sentiment", "confidence", "opinion_phrase"],
        label="Detailed Results"
    )

    # Wire the button: the review text goes in, (summary, table) come out.
    analyze_button.click(
        fn=analyze_review,
        inputs=review_input,
        outputs=[summary_output, table_output]
    )

# share=True gives you a public link for classmates/teacher
demo.launch(share=True, debug=True)


In [None]:
!zip -r /content/absa_bert_model.zip "/content/drive/MyDrive/absa_project/models/absa_bert_model"
