In [None]:
import pandas as pd
import re
import time
import json
import google.generativeai as genai
from collections import Counter
from tqdm import tqdm
from dotenv import load_dotenv
import os

In [33]:
# ==========================================
# 1. KONFIGURASI
# ==========================================
load_dotenv()

# The Gemini client needs a key that is allowed to call the Generative Language API.
# The recorded run of this notebook used YOUTUBE_API_KEY and every request was rejected
# with "403 ... API_KEY_SERVICE_BLOCKED", so prefer a dedicated Gemini key and keep
# YOUTUBE_API_KEY only as a backward-compatible fallback.
API_KEY = (
    os.getenv("GEMINI_API_KEY")
    or os.getenv("GOOGLE_API_KEY")
    or os.getenv("YOUTUBE_API_KEY")
)
if not API_KEY:
    # Fail here, once, instead of once per batch inside the request loop.
    raise RuntimeError(
        "API key Gemini tidak ditemukan. Set GEMINI_API_KEY (atau GOOGLE_API_KEY) di file .env."
    )

BATCH_SIZE = 50       # Words sent per request (original note: 50-100 is safe)
TOP_N_WORDS = 1000    # Number of most frequent words submitted for normalization

# Gemini setup
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel('gemini-1.5-flash')  # Flash variant chosen for speed and cost

In [34]:
# ==========================================
# 2. PERSIAPAN DATA (CANDIDATE MINING)
# ==========================================
def get_slang_candidates(csv_file, top_n=None):
    """Return the most frequent lowercase words found in the comment column.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.
            The file must contain a ``Teks_Komentar`` column.
        top_n: How many of the most frequent words to return. Defaults to the
            module-level ``TOP_N_WORDS`` when omitted.

    Returns:
        A list of words (only ``a-z``, at least two characters), ordered from
        most to least frequent. It holds at most ``top_n`` entries.

    Raises:
        KeyError: If the CSV has no ``Teks_Komentar`` column.
    """
    if top_n is None:
        top_n = TOP_N_WORDS

    print("üìÇ Membaca data...")
    df = pd.read_csv(csv_file)

    # Drop missing comments before casting: astype(str) would turn NaN into the
    # literal text "nan", which would then be counted as a real word.
    comments = df['Teks_Komentar'].dropna().astype(str)

    # Words made of a-z only, minimum two characters. Counting per comment avoids
    # building one giant concatenated string and gives the same tallies.
    word_pattern = re.compile(r'\b[a-z]{2,}\b')
    counter = Counter()
    for comment in comments:
        counter.update(word_pattern.findall(comment.lower()))

    # Keep only the most common words; rare typos are assumed not to matter
    # for the downstream model.
    candidates = [word for word, _count in counter.most_common(top_n)]

    print(f"‚úÖ Menemukan {len(counter)} kata unik.")
    print(f"üéØ Fokus pada {top_n} kata paling dominan.")
    return candidates

In [35]:
# ==========================================
# 3. FUNGSI REQUEST KE GEMINI
# ==========================================
def process_batch_with_gemini(word_list):
    """Ask Gemini for a slang-to-standard mapping for one batch of words.

    Args:
        word_list: List of candidate words (strings) to check.

    Returns:
        A dict mapping each slang/typo/abbreviation to its standard form.
        Returns an empty dict when ``word_list`` is empty, when the request
        fails, or when the reply contains no parseable JSON object. The error
        is printed in the latter two cases.
    """
    # Nothing to ask: skip the API call entirely.
    if not word_list:
        return {}

    prompt = f"""
    Tugasmu adalah membuat dictionary normalisasi teks bahasa Indonesia (Slang ke Baku).
    
    Input:
    {', '.join(word_list)}
    
    Instruksi:
    1. Cek setiap kata di input.
    2. Jika kata adalah SLANG/TYPO/SINGKATAN (contoh: "yg"->"yang", "gak"->"tidak", "bgt"->"banget", "judol"->"judi online"), buat mappingnya.
    3. Jika kata sudah BAKU (contoh: "saya", "makan") atau NAMA ORANG/TEMPAT, JANGAN dimasukkan ke output.
    4. Fokus context: Komentar YouTube Indonesia (banyak bahasa gaul).
    
    Format Output WAJIB JSON murni tanpa markdown:
    {{
        "kata_slang": "kata_baku",
        "slang_lain": "baku_lain"
    }}
    """

    try:
        response = model.generate_content(prompt)
        return _parse_slang_mapping(response.text)
    except Exception as e:
        # Best-effort by design: one failed batch must not abort the whole run.
        print(f"‚ö†Ô∏è Error batch: {e}")
        return {}


def _parse_slang_mapping(raw_text):
    """Extract the JSON object from a raw model reply and keep only str -> str pairs."""
    # Take the outermost {...} span so markdown fences or explanatory text around
    # the JSON do not break parsing.
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model response")

    parsed = json.loads(raw_text[start:end + 1])

    # Drop entries whose value is not a non-empty string (e.g. null or nested data);
    # they cannot be used as a replacement word.
    return {
        slang: standard
        for slang, standard in parsed.items()
        if isinstance(standard, str) and standard.strip()
    }

In [37]:
# ==========================================
# 4. EKSEKUSI UTAMA (BATCHING LOOP)
# ==========================================
# Load the candidate words
candidates = get_slang_candidates('raw-scrape-yt.csv')

# Split into chunks of BATCH_SIZE, one chunk per Gemini request
batches = [candidates[i:i + BATCH_SIZE] for i in range(0, len(candidates), BATCH_SIZE)]

final_dictionary = {}

# Pause between consecutive requests to stay under the API rate limit
REQUEST_DELAY_SECONDS = 2

print("\nüöÄ Memulai proses normalisasi dengan Gemini...")
print(f"üì¶ Total Batch: {len(batches)}")

# Loop with a progress bar
last_batch_index = len(batches) - 1
for batch_index, batch in enumerate(tqdm(batches)):
    # Request to Gemini
    batch_result = process_batch_with_gemini(batch)

    # Merge the result; a later batch overwrites an earlier mapping for the same key
    if batch_result:
        final_dictionary.update(batch_result)

    # Only wait when another request follows; sleeping after the last batch is wasted time
    if batch_index < last_batch_index:
        time.sleep(REQUEST_DELAY_SECONDS)

üìÇ Membaca data...
‚úÖ Menemukan 8339 kata unik.
üéØ Fokus pada 1000 kata paling dominan.

üöÄ Memulai proses normalisasi dengan Gemini...
üì¶ Total Batch: 20


  0%|          | 0/20 [00:00<?, ?it/s]

‚ö†Ô∏è Error batch: 403 Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked. [reason: "API_KEY_SERVICE_BLOCKED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "methodName"
  value: "google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent"
}
metadata {
  key: "consumer"
  value: "projects/12114063549"
}
metadata {
  key: "apiName"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked."
]


  5%|‚ñå         | 1/20 [00:02<00:41,  2.17s/it]

‚ö†Ô∏è Error batch: 403 Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked. [reason: "API_KEY_SERVICE_BLOCKED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "methodName"
  value: "google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent"
}
metadata {
  key: "consumer"
  value: "projects/12114063549"
}
metadata {
  key: "apiName"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked."
]


 10%|‚ñà         | 2/20 [00:04<00:37,  2.10s/it]

‚ö†Ô∏è Error batch: 403 Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked. [reason: "API_KEY_SERVICE_BLOCKED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "methodName"
  value: "google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent"
}
metadata {
  key: "consumer"
  value: "projects/12114063549"
}
metadata {
  key: "apiName"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked."
]


 15%|‚ñà‚ñå        | 3/20 [00:06<00:35,  2.07s/it]

‚ö†Ô∏è Error batch: 403 Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked. [reason: "API_KEY_SERVICE_BLOCKED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "methodName"
  value: "google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent"
}
metadata {
  key: "consumer"
  value: "projects/12114063549"
}
metadata {
  key: "apiName"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "Requests to this API generativelanguage.googleapis.com method google.ai.generativelanguage.v1beta.GenerativeService.GenerateContent are blocked."
]


 15%|‚ñà‚ñå        | 3/20 [00:08<00:47,  2.77s/it]


KeyboardInterrupt: 

In [None]:
# ==========================================
# 5. SIMPAN HASIL
# ==========================================
if final_dictionary:
    print(f"\n‚úÖ Selesai! Berhasil menormalisasi {len(final_dictionary)} kata.")

    # Persist to a JSON file. The encoding is set explicitly so the result does not
    # depend on the platform's default text encoding.
    with open('kamus_normalisasi_gemini.json', 'w', encoding='utf-8') as f:
        json.dump(final_dictionary, f, indent=4)

    # Preview of the Python dictionary
    print("\n--- COPY DICTIONARY INI KE KODE PIPELINE KAMU ---")
    print(final_dictionary)
else:
    # Every batch failed or returned nothing: do not report success and do not
    # overwrite a dictionary file left by an earlier successful run.
    print("\nKamus kosong: tidak ada kata yang dinormalisasi, file JSON tidak ditulis ulang.")