<a href="https://colab.research.google.com/github/seo-won-lee/amazon-absa/blob/main/BARTABSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the input CSV under the Drive mount point; loaded with pd.read_csv in a later cell.
file_path = '/content/drive/MyDrive/DACOS/absa_model/amazon_4_preprocess_lower.csv'

In [None]:
import pandas as pd

# Load the CSV at `file_path` into a DataFrame.
df = pd.read_csv(file_path)

In [None]:
# Lift the column-width display limit so long review text is not truncated.
pd.set_option('display.max_colwidth', None)   # or 0 (None or 0 depending on the pandas version)
# Display the `review` column as the cell output.
df['review']

Unnamed: 0,review
0,good product
1,one month is gone
2,how to buy lights extra
3,best product
4,waste product i received a broken product
...,...
10224,the sandwiches are made pretty quickly
10225,without burning
10226,"बोरोसिल ब्रांड का यह ""सेंडविच मेकर"" देखने में तो अच्छा लगता है मगर इसकी बिल्ड क्वालिटी अच्छी नहीं है। यह लगभग rs 3000 के आसपास आता है। इस प्रकार की बिल्ड क्वालिटी ₹2000 के करीब मिल जाती है तो कोई क्यों ₹1000 अधिक भुगतान करें। पहले मैंने इसे review देखने के बाद ऑर्डर किया था लेकिन जब घर पर डिलीवरी होने के पश्चात unboxing करके देखा तो इसकी बिल्ड क्वालिटी कुछ खास नहीं लगी इसलिए अपने पैसे बचाने के लिए मैंने इसे वापस भेज दिया। मैंने इसकी पैकेजिंग"
10227,recommend work as expected


In [None]:
# Library imports
import pandas as pd
import torch
from transformers import pipeline
# Pretrained-model inference pipelines
from datasets import Dataset
# Hugging Face dataset container

# Convert the pandas DataFrame into a Hugging Face Dataset so the reviews can be
# processed in batches with `Dataset.map`. The index is reset so rows are
# numbered 0..n-1.
dataset = Dataset.from_pandas(df.reset_index(drop=True))

# ABSA pipeline: aspect-term extraction + sentiment, run as a token-classification
# ("ner") task.
absa_pipe = pipeline(
    task='ner',
    # Hugging Face model for Aspect Term Extraction + Sentiment
    model='gauneg/roberta-base-absa-ate-sentiment',
    # Merge token-level predictions into grouped spans
    aggregation_strategy='simple',
    # Use the first GPU when CUDA is available; otherwise fall back to CPU (-1)
    # instead of failing on a runtime that has no GPU attached.
    device=0 if torch.cuda.is_available() else -1
)



def _is_out_of_memory(exc):
    """Return True if `exc` is a RuntimeError whose message mentions running out of memory."""
    return isinstance(exc, RuntimeError) and "out of memory" in str(exc).lower()


def _predict_individually(reviews):
    """Run the pipeline one review at a time; a review that fails gets an empty prediction list."""
    preds_batch = []
    for review in reviews:
        try:
            preds_batch.append(absa_pipe(review))
        except Exception as e:
            if _is_out_of_memory(e):
                raise
            print("❌ Prediction failed for one review:", e)
            preds_batch.append([])
    return preds_batch


def extract_absa(batch):
    """Extract aspect terms and their sentiment labels for one batch of reviews.

    Args:
        batch: Mapping with a "review" key holding a list of review texts
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        dict with two parallel lists, one entry per review:
            "aspect":    list of stripped aspect strings,
            "sentiment": list of the matching `entity_group` labels.

    Raises:
        RuntimeError: re-raised when the pipeline runs out of memory, so the
            caller can retry the whole map with a smaller batch size.
    """
    reviews = batch["review"]
    try:
        # Run inference on the whole batch at once.
        preds_batch = absa_pipe(reviews)
    except Exception as e:
        # Out-of-memory must reach the caller: swallowing it here would make a
        # batch-size fallback around `Dataset.map` impossible to trigger.
        if _is_out_of_memory(e):
            raise
        print("❌ Batch prediction failed:", e)
        # Any other failure (e.g. a single over-length review) should not blank
        # the whole batch: retry per review so only the bad rows come back empty.
        preds_batch = _predict_individually(reviews)

    aspects, sentiments = [], []
    for preds in preds_batch:
        # Pair each stripped aspect with its label so both lists stay aligned.
        pairs = [(r["word"].strip(), r["entity_group"]) for r in preds]
        # Keep only aspects of at least two characters. This drops a lone "."
        # as well as other single-character fragments (and empty strings).
        pairs = [(word, label) for word, label in pairs if len(word) >= 2]
        aspects.append([word for word, _ in pairs])
        sentiments.append([label for _, label in pairs])

    return {
        "aspect": aspects,
        "sentiment": sentiments
    }



# Candidate batch sizes, tried from largest to smallest: if one attempt raises a
# RuntimeError, the next (smaller) size is tried.
batch_sizes = [64, 48, 32, 16, 8]
# Holds the mapped dataset once an attempt succeeds.
result_dataset = None

for bs in batch_sizes:
    print(f"🔍 Trying batch_size={bs}")
    try:
        # Apply extract_absa to the dataset in batches of `bs` rows.
        result_dataset = dataset.map(
            extract_absa,
            batched=True,
            batch_size=bs,
            # Label shown on the progress bar
            desc=f"Running ABSA (batch_size={bs})",
        )
    except RuntimeError as err:
        # This size failed; fall through to the next, smaller one.
        print(f"❌ Failed at batch_size={bs}: {err}")
    else:
        # Success: stop trying further sizes.
        print(f"✅ Success with batch_size={bs}")
        break
else:
    # The loop finished without a successful attempt.
    raise RuntimeError("All batch sizes failed due to memory issues.")

# Hugging Face Dataset -> pandas DataFrame
result_df = result_dataset.to_pandas()
# Copy the new columns onto the original DataFrame.
df[["aspect", "sentiment"]] = result_df[["aspect", "sentiment"]]

# Show the result.
df


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0


🔍 Trying batch_size=64


Running ABSA (batch_size=64):   0%|          | 0/10229 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


✅ Success with batch_size=64


Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,review,aspect,sentiment
0,B00KRCBA6E,Maharaja Whiteline Lava Neo 1200-Watts Halogen Heater (White and Red),"Home&Kitchen|Heating,Cooling&AirQuality|RoomHeaters","₹2,499","₹5,000",50%,3.8,1889,good product,[product],[pos]
1,B00KRCBA6E,Maharaja Whiteline Lava Neo 1200-Watts Halogen Heater (White and Red),"Home&Kitchen|Heating,Cooling&AirQuality|RoomHeaters","₹2,499","₹5,000",50%,3.8,1889,one month is gone,[],[]
2,B00KRCBA6E,Maharaja Whiteline Lava Neo 1200-Watts Halogen Heater (White and Red),"Home&Kitchen|Heating,Cooling&AirQuality|RoomHeaters","₹2,499","₹5,000",50%,3.8,1889,how to buy lights extra,[lights],[neu]
3,B00KRCBA6E,Maharaja Whiteline Lava Neo 1200-Watts Halogen Heater (White and Red),"Home&Kitchen|Heating,Cooling&AirQuality|RoomHeaters","₹2,499","₹5,000",50%,3.8,1889,best product,[product],[pos]
4,B00KRCBA6E,Maharaja Whiteline Lava Neo 1200-Watts Halogen Heater (White and Red),"Home&Kitchen|Heating,Cooling&AirQuality|RoomHeaters","₹2,499","₹5,000",50%,3.8,1889,waste product i received a broken product,"[product, product]","[neg, neg]"
...,...,...,...,...,...,...,...,...,...,...,...
10224,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (Black),Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAppliances|SandwichMakers,"₹2,863","₹3,690",22%,4.3,6987,the sandwiches are made pretty quickly,[sandwiches],[pos]
10225,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (Black),Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAppliances|SandwichMakers,"₹2,863","₹3,690",22%,4.3,6987,without burning,[],[]
10226,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (Black),Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAppliances|SandwichMakers,"₹2,863","₹3,690",22%,4.3,6987,"बोरोसिल ब्रांड का यह ""सेंडविच मेकर"" देखने में तो अच्छा लगता है मगर इसकी बिल्ड क्वालिटी अच्छी नहीं है। यह लगभग rs 3000 के आसपास आता है। इस प्रकार की बिल्ड क्वालिटी ₹2000 के करीब मिल जाती है तो कोई क्यों ₹1000 अधिक भुगतान करें। पहले मैंने इसे review देखने के बाद ऑर्डर किया था लेकिन जब घर पर डिलीवरी होने के पश्चात unboxing करके देखा तो इसकी बिल्ड क्वालिटी कुछ खास नहीं लगी इसलिए अपने पैसे बचाने के लिए मैंने इसे वापस भेज दिया। मैंने इसकी पैकेजिंग","[�, �, �, �, �, �, �, �, �, �, �, �, �, �, �, �, �, �]","[neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu, neu]"
10227,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (Black),Home&Kitchen|Kitchen&HomeAppliances|SmallKitchenAppliances|SandwichMakers,"₹2,863","₹3,690",22%,4.3,6987,recommend work as expected,[],[]


In [None]:
# Write the DataFrame to a CSV on Drive, UTF-8 encoded and without the index column.
df.to_csv('/content/drive/MyDrive/DACOS/absa_model/amazon_4_preprocess_lower_absa.csv', index=False, encoding='utf-8')