In [17]:
import os
import pandas as pd
from datetime import datetime
from deep_translator import GoogleTranslator
from transformers import pipeline
from pyabsa import AspectTermExtraction as ATEPC


In [18]:
# ===== Configuration =====
# Row cap for quick trial runs; set to None to process every row.
limit_rows = 100

# Input CSV path; the file must contain a 'review' column.
# Change this to wherever your file lives.
input_csv = 'data_clean/all_reviews_merged.csv'

# Output folder and result file name. The file name embeds the time this cell
# ran (second resolution), so each run gets its own file.
output_dir = 'dataset'
timestamp = f'{datetime.now():%Y%m%d_%H%M%S}'
output_csv = os.path.join(output_dir, f'sentiment_results_{timestamp}.csv')

# Make sure the output folder exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)


In [19]:
# ===== Load data =====
# Read the input CSV; when limit_rows is set, keep only that many leading rows
# (as an independent copy).
df = pd.read_csv(input_csv)
df = df if limit_rows is None else df.head(limit_rows).copy()

# Later cells read the 'review' column, so fail early when it is absent.
if 'review' not in df.columns:
    raise ValueError("Kolom 'review' tidak ditemukan pada CSV input")

# Missing reviews become empty strings; every value is coerced to str.
df['review'] = df['review'].where(df['review'].notna(), '').astype(str)
df.head(3)


Unnamed: 0,username,review
0,Ela Nuraini,pertama kali nyobain bakso kondusif tekstur ba...
1,Verdinand Tampubolon,saat searching pilihan makanan bakso di maps w...
2,Celeste Evans,salah satu bakso yang enak di banyuwangi kota....


In [20]:
# ===== Translation: Indonesian (id) -> English (en) =====
# A single translator instance, shared by translate_safe for every review.
translator = GoogleTranslator(
    target='en',
    source='id',
)

def translate_safe(text):
    """Translate ``text`` with the module-level ``translator`` without raising.

    The input is stripped first; ``None`` and other falsy values are treated
    as empty. Blank input returns ``''`` and the translator is not called.
    Translation is attempted at most twice, the second attempt immediately
    after the first fails. If both attempts raise, the stripped original text
    is returned as a fallback.
    """
    cleaned = (text or '').strip()
    if cleaned == '':
        return ''
    for _attempt in range(2):
        try:
            return translator.translate(cleaned)
        except Exception:
            # Short retry: loop once more, then fall through to the fallback.
            continue
    return cleaned

# Translate each review and keep the English text next to the original, then
# preview the first few pairs.
df['review_en'] = df['review'].apply(translate_safe)
df.loc[:, ['review', 'review_en']].head(3)


Unnamed: 0,review,review_en
0,pertama kali nyobain bakso kondusif tekstur ba...,"The first time I tried conducive meatballs, th..."
1,saat searching pilihan makanan bakso di maps w...,When searching for meatball food options on th...
2,salah satu bakso yang enak di banyuwangi kota....,one of the delicious meatballs in Banyuwangi c...


In [21]:
# ===== Sentiment analysis on the English translations =====
# Hugging Face pipeline loaded once and reused for every review; the
# checkpoint is an SST-2 fine-tuned DistilBERT (per the model name).
clf = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

def score_and_label(text):
    """Classify English ``text`` and return ``(signed_score, label)``.

    Args:
        text: Review text in English. ``None`` and other falsy values are
            treated as empty; surrounding whitespace is stripped.

    Returns:
        A tuple ``(signed_score, label)``:
        * blank input -> ``(0.0, 'Neutral')`` without calling the classifier;
        * otherwise the classifier's probability, kept positive when its
          label starts with ``'POS'`` and negated otherwise, together with
          ``'Positive'`` or ``'Negative'`` chosen from the sign of that score.
    """
    text = (text or '').strip()
    if not text:
        return 0.0, 'Neutral'
    # truncation=True makes the tokenizer cut over-long reviews down to the
    # model's maximum sequence length. Without it a single long review makes
    # the pipeline raise and aborts the whole DataFrame.apply run.
    out = clf(text, truncation=True)[0]
    label = out['label']  # 'POSITIVE' or 'NEGATIVE'
    prob = float(out['score'])
    # Signed score: positive class = +prob, negative class = -prob.
    signed = prob if label.upper().startswith('POS') else -prob
    final_label = 'Positive' if signed > 0 else 'Negative'
    return signed, final_label

# Score every translated review once, then unpack the (score, label) tuples
# into two separate columns and preview the result.
scores_labels = df['review_en'].apply(score_and_label)
df['sentiment_score'] = scores_labels.apply(lambda pair: pair[0])
df['sentiment_label'] = scores_labels.apply(lambda pair: pair[1])
df.loc[:, ['review_en', 'sentiment_score', 'sentiment_label']].head(5)


Device set to use cpu


Unnamed: 0,review_en,sentiment_score,sentiment_label
0,"The first time I tried conducive meatballs, th...",-0.990115,Negative
1,When searching for meatball food options on th...,-0.999443,Negative
2,one of the delicious meatballs in Banyuwangi c...,0.999334,Positive
3,"Delicious meatballs, good service but not enou...",0.998838,Positive
4,The spicy sauce makes you miss it. Here it's h...,0.992673,Positive


In [22]:
# ===== Aspect-based sentiment analysis (PyABSA ATEPC) =====
# Build the aspect extractor once from the 'multilingual' checkpoint; it is
# reused by _absa_auto for every review.
aspect_extractor = ATEPC.AspectExtractor(
    'multilingual',
    cal_perplexity=True,
    auto_device=True,
)

def _absa_auto(text):
    text = (text or '').strip()
    if not text:
        return {'aspects': [], 'sentiments': []}
    res = aspect_extractor.predict([text], print_result=False, save_result=False, ignore_error=True, pred_sentiment=True)
    item = res[0] if isinstance(res, list) else res
    aspects = item.get('aspect', []) or item.get('aspects', [])
    sentiments = item.get('sentiment', []) or item.get('sentiments', [])
    return {'aspects': aspects, 'sentiments': sentiments}

# Numeric value of each aspect-level sentiment label. Labels are expected to
# be strings such as 'Positive' / 'Negative' / 'Neutral' (they are joined as
# text below); anything not listed here counts as 0.
_ABSA_POLARITY = {'positive': 1, 'negative': -1, 'neutral': 0}


def _absa_net_score(absa_result):
    """Return (#positive - #negative) over the 'sentiments' list of one row.

    ``absa_result`` is the dict stored in the 'absa_auto' column. A missing
    or empty 'sentiments' list gives 0.
    """
    return sum(
        _ABSA_POLARITY.get(str(label).strip().lower(), 0)
        for label in absa_result.get('sentiments', [])
    )


# Run aspect extraction once per review; each cell holds a dict with
# 'aspects' and 'sentiments' lists.
df['absa_auto'] = df['review_en'].apply(_absa_auto)

# Flatten both lists into comma-separated text columns (str() guards against
# non-string items).
df['absa_auto_aspects'] = df['absa_auto'].apply(lambda d: ', '.join(map(str, d.get('aspects', []))))
df['absa_auto_sentiments'] = df['absa_auto'].apply(lambda d: ', '.join(map(str, d.get('sentiments', []))))

# Net aspect polarity per review. This previously read the key 'sentiment',
# which the dicts above do not contain, so the column was always 0; summing
# the label strings themselves would have raised a TypeError.
df['absa_sentiment_score'] = df['absa_auto'].apply(_absa_net_score)


[2025-10-30 15:49:39] (2.4.2) ********** Available ATEPC model checkpoints for Version:2.4.2 (this version) **********
[2025-10-30 15:49:39] (2.4.2) ********** Available ATEPC model checkpoints for Version:2.4.2 (this version) **********
[2025-10-30 15:49:39] (2.4.2) Downloading checkpoint:multilingual 
[2025-10-30 15:49:39] (2.4.2) Notice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets
[2025-10-30 15:49:39] (2.4.2) Checkpoint already downloaded, skip
[2025-10-30 15:49:39] (2.4.2) Load aspect extractor from checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT
[2025-10-30 15:49:39] (2.4.2) config: checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT\fast_lcf_atepc.config
[2025-10-30 15:49:39] (2.4.2) state_dict: checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT\fast_lcf_atepc.state_dict
[2025-10-30 15:49:39] (2.4.2) model: None
[2025-10-30 15:49:39] (2.4.2) tokenizer: checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT\fast_lcf_atepc.tokenizer
[2025-10-30 15:49:40] 



In [23]:
# ===== Save results =====
# Write the full frame to the timestamped CSV without the row index, encoded
# as 'utf-8-sig' (UTF-8 with a leading BOM), then echo the path as the cell's
# output.
df.to_csv(
    path_or_buf=output_csv,
    encoding='utf-8-sig',
    index=False,
)
output_csv


'dataset\\sentiment_results_20251030_154827.csv'