In [3]:
import time
import requests
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from colorthief import ColorThief
import math
import re

# PERBAIKAN: Dictionary warna CSS3 yang lebih lengkap dan reliable
CSS3_COLORS = {
    'aliceblue': (240, 248, 255),
    'antiquewhite': (250, 235, 215),
    'aqua': (0, 255, 255),
    'aquamarine': (127, 255, 212),
    'azure': (240, 255, 255),
    'beige': (245, 245, 220),
    'bisque': (255, 228, 196),
    'black': (0, 0, 0),
    'blanchedalmond': (255, 235, 205),
    'blue': (0, 0, 255),
    'blueviolet': (138, 43, 226),
    'brown': (165, 42, 42),
    'burlywood': (222, 184, 135),
    'cadetblue': (95, 158, 160),
    'chartreuse': (127, 255, 0),
    'chocolate': (210, 105, 30),
    'coral': (255, 127, 80),
    'cornflowerblue': (100, 149, 237),
    'cornsilk': (255, 248, 220),
    'crimson': (220, 20, 60),
    'cyan': (0, 255, 255),
    'darkblue': (0, 0, 139),
    'darkcyan': (0, 139, 139),
    'darkgoldenrod': (184, 134, 11),
    'darkgray': (169, 169, 169),
    'darkgreen': (0, 100, 0),
    'darkgrey': (169, 169, 169),
    'darkkhaki': (189, 183, 107),
    'darkmagenta': (139, 0, 139),
    'darkolivegreen': (85, 107, 47),
    'darkorange': (255, 140, 0),
    'darkorchid': (153, 50, 204),
    'darkred': (139, 0, 0),
    'darksalmon': (233, 150, 122),
    'darkseagreen': (143, 188, 143),
    'darkslateblue': (72, 61, 139),
    'darkslategray': (47, 79, 79),
    'darkslategrey': (47, 79, 79),
    'darkturquoise': (0, 206, 209),
    'darkviolet': (148, 0, 211),
    'deeppink': (255, 20, 147),
    'deepskyblue': (0, 191, 255),
    'dimgray': (105, 105, 105),
    'dimgrey': (105, 105, 105),
    'dodgerblue': (30, 144, 255),
    'firebrick': (178, 34, 34),
    'floralwhite': (255, 250, 240),
    'forestgreen': (34, 139, 34),
    'fuchsia': (255, 0, 255),
    'gainsboro': (220, 220, 220),
    'ghostwhite': (248, 248, 255),
    'gold': (255, 215, 0),
    'goldenrod': (218, 165, 32),
    'gray': (128, 128, 128),
    'green': (0, 128, 0),
    'greenyellow': (173, 255, 47),
    'grey': (128, 128, 128),
    'honeydew': (240, 255, 240),
    'hotpink': (255, 105, 180),
    'indianred': (205, 92, 92),
    'indigo': (75, 0, 130),
    'ivory': (255, 255, 240),
    'khaki': (240, 230, 140),
    'lavender': (230, 230, 250),
    'lavenderblush': (255, 240, 245),
    'lawngreen': (124, 252, 0),
    'lemonchiffon': (255, 250, 205),
    'lightblue': (173, 216, 230),
    'lightcoral': (240, 128, 128),
    'lightcyan': (224, 255, 255),
    'lightgoldenrodyellow': (250, 250, 210),
    'lightgray': (211, 211, 211),
    'lightgreen': (144, 238, 144),
    'lightgrey': (211, 211, 211),
    'lightpink': (255, 182, 193),
    'lightsalmon': (255, 160, 122),
    'lightseagreen': (32, 178, 170),
    'lightskyblue': (135, 206, 250),
    'lightslategray': (119, 136, 153),
    'lightslategrey': (119, 136, 153),
    'lightsteelblue': (176, 196, 222),
    'lightyellow': (255, 255, 224),
    'lime': (0, 255, 0),
    'limegreen': (50, 205, 50),
    'linen': (250, 240, 230),
    'magenta': (255, 0, 255),
    'maroon': (128, 0, 0),
    'mediumaquamarine': (102, 205, 170),
    'mediumblue': (0, 0, 205),
    'mediumorchid': (186, 85, 211),
    'mediumpurple': (147, 112, 219),
    'mediumseagreen': (60, 179, 113),
    'mediumslateblue': (123, 104, 238),
    'mediumspringgreen': (0, 250, 154),
    'mediumturquoise': (72, 209, 204),
    'mediumvioletred': (199, 21, 133),
    'midnightblue': (25, 25, 112),
    'mintcream': (245, 255, 250),
    'mistyrose': (255, 228, 225),
    'moccasin': (255, 228, 181),
    'navajowhite': (255, 222, 173),
    'navy': (0, 0, 128),
    'oldlace': (253, 245, 230),
    'olive': (128, 128, 0),
    'olivedrab': (107, 142, 35),
    'orange': (255, 165, 0),
    'orangered': (255, 69, 0),
    'orchid': (218, 112, 214),
    'palegoldenrod': (238, 232, 170),
    'palegreen': (152, 251, 152),
    'paleturquoise': (175, 238, 238),
    'palevioletred': (219, 112, 147),
    'papayawhip': (255, 239, 213),
    'peachpuff': (255, 218, 185),
    'peru': (205, 133, 63),
    'pink': (255, 192, 203),
    'plum': (221, 160, 221),
    'powderblue': (176, 224, 230),
    'purple': (128, 0, 128),
    'red': (255, 0, 0),
    'rosybrown': (188, 143, 143),
    'royalblue': (65, 105, 225),
    'saddlebrown': (139, 69, 19),
    'salmon': (250, 128, 114),
    'sandybrown': (244, 164, 96),
    'seagreen': (46, 139, 87),
    'seashell': (255, 245, 238),
    'sienna': (160, 82, 45),
    'silver': (192, 192, 192),
    'skyblue': (135, 206, 235),
    'slateblue': (106, 90, 205),
    'slategray': (112, 128, 144),
    'slategrey': (112, 128, 144),
    'snow': (255, 250, 250),
    'springgreen': (0, 255, 127),
    'steelblue': (70, 130, 180),
    'tan': (210, 180, 140),
    'teal': (0, 128, 128),
    'thistle': (216, 191, 216),
    'tomato': (255, 99, 71),
    'turquoise': (64, 224, 208),
    'violet': (238, 130, 238),
    'wheat': (245, 222, 179),
    'white': (255, 255, 255),
    'whitesmoke': (245, 245, 245),
    'yellow': (255, 255, 0),
    'yellowgreen': (154, 205, 50)
}

def closest_color(requested_color):
    """
    PERBAIKAN: Fungsi yang lebih robust untuk mencari warna terdekat
    menggunakan Euclidean distance dalam ruang RGB
    """
    try:
        min_distance = float('inf')
        closest_name = 'unknown'
        
        r1, g1, b1 = requested_color
        
        for color_name, (r2, g2, b2) in CSS3_COLORS.items():
            # Hitung Euclidean distance
            distance = math.sqrt((r1 - r2)**2 + (g1 - g2)**2 + (b1 - b2)**2)
            
            if distance < min_distance:
                min_distance = distance
                closest_name = color_name
        
        return closest_name
    except Exception as e:
        print(f"❌ Error dalam closest_color: {e}")
        return 'unknown'

def get_color_name(rgb_color):
    """
    PERBAIKAN: Error handling yang lebih spesifik dan debugging info
    """
    try:
        # Validasi input
        if not isinstance(rgb_color, (tuple, list)) or len(rgb_color) != 3:
            print(f"❌ Format RGB tidak valid: {rgb_color}")
            return 'unknown'
        
        # Validasi range RGB (0-255)
        r, g, b = rgb_color
        if not all(0 <= val <= 255 for val in [r, g, b]):
            print(f"❌ Nilai RGB di luar range (0-255): {rgb_color}")
            return 'unknown'
        
        print(f"🎨 Mencari nama untuk warna RGB: {rgb_color}")
        color_name = closest_color(rgb_color)
        print(f"✅ Warna ditemukan: {color_name}")
        return color_name
        
    except Exception as e:
        print(f"❌ Error dalam get_color_name: {e}")
        return 'unknown'

def extract_hashtags_and_clean_caption(caption_text):
    """
    Fungsi untuk memisahkan hashtag dari caption
    Returns: (clean_caption, hashtags_string)
    """
    try:
        if not caption_text:
            return '', ''
        
        # Mencari semua hashtag dengan regex
        hashtags = re.findall(r'#\w+', caption_text)
        
        # Membuat string hashtag yang dipisahkan koma
        hashtags_string = ', '.join(hashtags) if hashtags else ''
        
        # Menghapus hashtag dari caption untuk mendapatkan caption bersih
        clean_caption = re.sub(r'#\w+', '', caption_text)
        
        # Membersihkan spasi berlebih
        clean_caption = ' '.join(clean_caption.split())
        
        return clean_caption.strip(), hashtags_string
        
    except Exception as e:
        print(f"❌ Error dalam extract_hashtags_and_clean_caption: {e}")
        return caption_text, ''

# === Setup Chrome ===
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# === Login manual ===
driver.get('https://www.instagram.com/')
print("🔑 Silakan login ke akun Instagram Anda di browser yang muncul")
time.sleep(30)

# === Profil target ===
username_target = 'danarhadi_id'
profile_url = f'https://www.instagram.com/{username_target}/'
driver.get(profile_url)
WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "section.xc3tme8")))
time.sleep(5)


# === PENGAMBILAN DATA PROFIL LENGKAP ===
print("\n🔍 Mengambil data detail profil...")

# -- REVISI: Username & Nama Pengguna --
username = username_target  # Username sudah diketahui
display_name = 'N/A'
try:
    # Ambil elemen `span` dari child pertama `section.xc3tme8`
    display_name_element = driver.find_element(By.CSS_SELECTOR, "section.xc3tme8 > div:nth-child(1) span")
    display_name = display_name_element.text.strip()
except Exception as e:
    print(f"⚠️ Gagal mengambil Nama Pengguna: {e}")

print(f"✅ Username: {username}")
print(f"✅ Nama Pengguna: {display_name}")


# -- Kategori Toko, Bio, Tautan --
kategori_profil = 'N/A'
bio = 'N/A'
tautan = 'N/A'
try:
    kategori_profil_element = driver.find_element(By.CSS_SELECTOR, "div._ap3a._aaco._aacu._aacy._aad6._aade")
    kategori_profil = kategori_profil_element.text
except: print("⚠️ Kategori toko tidak ditemukan.")
print(f"✅ Kategori Profil: {kategori_profil}")

bio = 'N/A'  # Default

try:
    bio_candidates = driver.find_elements(By.CSS_SELECTOR, "span._ap3a")
    
    for candidate in bio_candidates:
        text = candidate.text.strip()
        # Cek apakah kemungkinan besar itu bio
        if '@' in text or 'WA:' in text or 'Shopee' in text or len(text) > 20:
            bio = text.replace('\n', ' ')
            break

except Exception as e:
    print("⚠️ Bio tidak ditemukan:", str(e))

print(f"✅ Bio: {bio}")


try:
    tautan_element = driver.find_element(By.CSS_SELECTOR, "a[href*='l.instagram.com']")
    tautan = tautan_element.text
except: print("⚠️ Tautan tidak ditemukan.")
print(f"✅ Tautan: {tautan}")

# -- Statistik (Posts, Followers, Following) --
total_posts, total_followers, total_following = 'N/A', 'N/A', 'N/A'
try:
    stats_elements = driver.find_elements(By.CSS_SELECTOR, "ul.x78zum5 > li.xl565be")
    for stat in stats_elements:
        text = stat.text
        if 'posts' in text: total_posts = text.split(' ')[0]
        elif 'followers' in text:
            try: total_followers = stat.find_element(By.CSS_SELECTOR, "span[title]").get_attribute('title').replace(',', '')
            except: total_followers = text.split(' ')[0]
        elif 'following' in text: total_following = text.split(' ')[0]
except Exception as e: print(f"⚠️ Gagal mengambil statistik profil: {e}")
print(f"📊 Statistik Profil - Posts: {total_posts}, Followers: {total_followers}, Following: {total_following}")


# === Scroll dan ambil post secara bertahap ===
print("\n⏳ Mulai scroll dan ambil post...")
scroll_times = 10
post_links = []
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_attempts = 0
max_scroll_attempts = 10

for i in range(scroll_times):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    posts = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"], a[href*="/reel/"]')
    for post in posts:
        href = post.get_attribute('href')
        if href and href not in post_links: post_links.append(href)
        if len(post_links) >= 55: break
    print(f"   Scroll ke-{i+1}: total post sekarang {len(post_links)}")
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        print("   Telah mencapai bagian bawah halaman.")
        break
    last_height = new_height

print(f"\n✅ Total post ditemukan: {len(post_links)}")
post_links = post_links[:55] 
print(f"\nℹ️  Akan memproses {len(post_links)} post teratas.")

# === DATA OUTPUT ===
all_data = []

# === Loop setiap post ===
for idx, link in enumerate(post_links):
    print(f"\n--- Memproses Post {idx+1}/{len(post_links)}: {link} ---")
    driver.get(link)
    try: WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
    except TimeoutException:
        print("❌ Halaman post tidak termuat, melewati post ini.")
        continue
    time.sleep(5)

    id_post = link.split('/')[-2]
    media_type = 'reel' if '/reel/' in link else 'post'
    
    # === Memuat Komentar (Logika yang Anda inginkan dipertahankan) ===
    print("⏳ Memuat semua komentar...")

    while True:
        try:
            # XPath fleksibel untuk semua bahasa
            xpath = "//button[.//*[contains(@aria-label, 'Load more comments') or contains(@aria-label, 'Muat komentar lainnya')]]"
            
            # Temukan tombol
            load_more = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, xpath))
            )

            # Scroll agar bisa diklik
            driver.execute_script("arguments[0].scrollIntoView();", load_more)
            time.sleep(0.5)

            # Tunggu tombol bisa diklik
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )

            # Klik via JavaScript
            driver.execute_script("arguments[0].click();", load_more)
            print("✅ Tombol komentar diklik.")
            time.sleep(5)
        except Exception as e:
            print(f"⛔ Tidak ada tombol komentar lagi atau error: {e}")
            break


    # === PENGAMBILAN DATA POST ===
    caption, likes, media_url, upload_time = '', '0', '', ''
    try: caption = driver.find_element(By.TAG_NAME, 'h1').text.strip()
    except: pass
    try:
        like_element = driver.find_element(By.CSS_SELECTOR, "a[href*='/liked_by/'] span")
        likes = like_element.text.replace(' likes', '').replace(',', '').strip()
    except: pass
    try:
        video_elements = driver.find_elements(By.TAG_NAME, 'video')
        if video_elements: media_url = video_elements[0].get_attribute('src')
        else: media_url = driver.find_element(By.CSS_SELECTOR, "img.x5yr21d").get_attribute('src')
    except: pass
    try: upload_time = driver.find_element(By.TAG_NAME, 'time').get_attribute('datetime')
    except: pass
    content_category = 'danarhadi_id post'

    # === PEMISAHAN HASHTAG DARI CAPTION ===
    clean_caption, hashtags = extract_hashtags_and_clean_caption(caption)
    print(f"📝 Caption bersih: {clean_caption[:50]}...")
    print(f"🏷️ Hashtags: {hashtags}")

    # === Mengambil semua komentar langsung dengan Selenium ===
    comments_list = []
    print("🕵️  Mengambil teks dari semua komentar yang terlihat...")
    try:
        comment_elements = driver.find_elements(By.CSS_SELECTOR, "ul._a9ym span._aade")
        for element in comment_elements: comments_list.append(element.text.strip())
    except: print("❌ Gagal mengambil komentar dengan Selenium.")
    final_comments = comments_list[:50]
    comments_count_scraped = len(final_comments)
    print(f"✅ Berhasil mengambil {comments_count_scraped} komentar.")
    
    # --- PERBAIKAN: Analisis warna dengan debugging yang lebih baik ---
    dominant_color = ''
    color_name = ''
    if media_url and media_type == 'post':
        try:
            print(f"🌐 Mengunduh gambar dari: {media_url}")
            response = requests.get(media_url, timeout=10)
            response.raise_for_status()  # Raise exception untuk HTTP errors
            
            img = BytesIO(response.content)
            ct = ColorThief(img)
            dominant_color_rgb = ct.get_color(quality=1)
            dominant_color = str(dominant_color_rgb)
            
            print(f"🎨 Warna dominan RGB: {dominant_color_rgb}")
            color_name = get_color_name(dominant_color_rgb)
            print(f"✅ Nama warna: {color_name}")
            
        except requests.RequestException as e:
            print(f"❌ Error mengunduh gambar: {str(e)}")
            dominant_color = 'download_error'
            color_name = 'download_error'
        except Exception as e:
            print(f"❌ Error analisis warna: {str(e)}")
            dominant_color = 'analysis_error'
            color_name = 'analysis_error'
    else:
        print("⏭️ Melewati analisis warna (bukan post gambar atau tidak ada URL)")

    # --- Menyimpan data ke list utama (dengan kolom hashtag baru) ---
    data_profil = [username, display_name, kategori_profil, bio, tautan, total_posts, total_followers, total_following]
    if not final_comments:
        all_data.append(data_profil + [
            id_post, link, clean_caption, hashtags, likes, 0, media_url, media_type,
            dominant_color, color_name, content_category, upload_time, ''
        ])
    else:
        for comment_text in final_comments:
            all_data.append(data_profil + [
                id_post, link, clean_caption, hashtags, likes, comments_count_scraped, media_url, media_type,
                dominant_color, color_name, content_category, upload_time, comment_text
            ])
    time.sleep(1)

# === Save ke Excel (dengan kolom hashtag baru) ===
df = pd.DataFrame(all_data, columns=[
    'username', 'nama_pengguna', 'kategori_profil', 'bio', 'tautan', 
    'total_posts', 'total_followers', 'total_following',
    'id_post', 'url_post', 'caption', 'hashtags', 'likes', 'comments_count',
    'media_url', 'media_type', 'dominant_color', 'color_name',
    'content_category', 'upload_time', 'comment'
])

output_filename = f"hasil_scrape_{username_target}.xlsx"
df.to_excel(output_filename, index=False)
print(f"\n✅ Selesai! Data disimpan ke '{output_filename}'")

driver.quit()

🔑 Silakan login ke akun Instagram Anda di browser yang muncul

🔍 Mengambil data detail profil...
✅ Username: danarhadi_id
✅ Nama Pengguna: Batik Danar Hadi, Solo
✅ Kategori Profil: Clothing (Brand)
✅ Bio: Akun Resmi Batik Danar Hadi #DanarHadi57Anniversary Merangkai Keindahan Motif Untuk Semua Generasi 🇮🇩 Klik👇🏻 & nikmati...  more
✅ Tautan: linktr.ee/danarhadi_id
📊 Statistik Profil - Posts: 3,261, Followers: 87686, Following: 287

⏳ Mulai scroll dan ambil post...
   Scroll ke-1: total post sekarang 12
   Scroll ke-2: total post sekarang 24
   Scroll ke-3: total post sekarang 36
   Scroll ke-4: total post sekarang 48
   Scroll ke-5: total post sekarang 48
   Scroll ke-6: total post sekarang 55
   Scroll ke-7: total post sekarang 55
   Scroll ke-8: total post sekarang 55
   Scroll ke-9: total post sekarang 55
   Scroll ke-10: total post sekarang 55

✅ Total post ditemukan: 55

ℹ️  Akan memproses 55 post teratas.

--- Memproses Post 1/55: https://www.instagram.com/danarhadi_id/p/DMrtdwkRR