# In [None]:
import time
import requests
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from colorthief import ColorThief
import webcolors
import re

# === Fungsi helper: RGB ke nama warna ===
def closest_color(requested_color):
    """Return the CSS3 color name nearest to an RGB triple.

    Nearness is the squared Euclidean distance between the requested color
    and each named CSS3 color in RGB space.

    Args:
        requested_color: Sequence whose first three items are the red, green
            and blue components.

    Returns:
        The name of the closest CSS3 color.
    """
    hex_to_names = getattr(webcolors, 'CSS3_HEX_TO_NAMES', None)
    if hex_to_names is None:
        # Newer webcolors releases no longer expose CSS3_HEX_TO_NAMES; rebuild
        # an equivalent hex -> name mapping from the public lookup functions.
        hex_to_names = {}
        for css_name in webcolors.names('css3'):
            hex_value = webcolors.name_to_hex(css_name, spec='css3')
            hex_to_names[hex_value] = webcolors.hex_to_name(hex_value, spec='css3')

    best_name = None
    best_distance = None
    for hex_value, name in hex_to_names.items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(hex_value)
        distance = (
            (r_c - requested_color[0]) ** 2
            + (g_c - requested_color[1]) ** 2
            + (b_c - requested_color[2]) ** 2
        )
        # "<=" keeps the previous tie-break: among equally near colors the
        # one visited last wins.
        if best_distance is None or distance <= best_distance:
            best_distance = distance
            best_name = name
    return best_name

def get_color_name(rgb_color):
    """Name an RGB color, falling back to ``'unknown'``.

    Args:
        rgb_color: RGB value forwarded unchanged to ``closest_color``.

    Returns:
        Whatever ``closest_color`` returns, or ``'unknown'`` if it raises.
    """
    try:
        name = closest_color(rgb_color)
    except Exception:
        # Deliberate best-effort: a failed lookup must not abort the caller.
        name = 'unknown'
    return name

# === Fungsi untuk mengambil comments ===
def get_comments_data(driver, soup):
    """Collect comment-like texts from a loaded post page.

    Four best-effort heuristics run in order; each one only adds texts that
    no earlier heuristic already captured. A failure inside one heuristic is
    printed and the remaining heuristics still run.

    Args:
        driver: Selenium driver showing the post; queried with an XPath.
        soup: BeautifulSoup tree parsed from the same page.

    Returns:
        A ``(texts, count)`` tuple: the unique texts in discovery order and
        how many there are.
    """
    action_words = ('like', 'reply', 'follow', 'view')
    meta_words = action_words + ('ago', 'hour', 'day', 'week')

    comments_data = []
    seen = set()  # O(1) duplicate check instead of scanning the list

    def _remember(text):
        """Append text unless it has already been collected."""
        if text not in seen:
            seen.add(text)
            comments_data.append(text)

    # Heuristic 1: every <span dir="auto"> that is not the caption.
    try:
        for span in soup.find_all('span', dir='auto'):
            parent = span.parent
            if parent and parent.name == 'h1':  # caption lives in an <h1>
                continue

            text = span.get_text(strip=True)
            # Skip mentions and spans whose markup contains a link.
            if (len(text) > 1
                    and not text.startswith('@')
                    and 'href' not in str(span)):
                _remember(text)
    except Exception as e:
        print(f"Error metode 1: {e}")

    # Heuristic 2: spans nested inside role="button" containers (live DOM).
    try:
        comment_elements = driver.find_elements(
            By.XPATH, "//div[@role='button']//span[contains(@dir, 'auto')]")

        for element in comment_elements:
            text = element.text.strip()
            if len(text) > 1:
                lowered = text.lower()
                # Drop action labels such as like / reply / follow / view.
                if not any(keyword in lowered for keyword in action_words):
                    _remember(text)
    except Exception as e:
        print(f"Error metode 2: {e}")

    # Heuristic 3: any span below a <div> whose aria-label mentions "comment".
    try:
        comment_sections = soup.find_all('div', {'aria-label': re.compile(r'comment', re.I)})
        for section in comment_sections:
            for span in section.find_all('span'):
                text = span.get_text(strip=True)
                if len(text) > 1:
                    _remember(text)
    except Exception as e:
        print(f"Error metode 3: {e}")

    # Heuristic 4: any span holding at least two characters of direct text,
    # minus mentions, action labels and relative-time labels.
    try:
        for span in soup.find_all('span', string=re.compile(r'.{2,}')):
            text = span.get_text(strip=True)
            if len(text) > 1 and not text.startswith('@'):
                lowered = text.lower()
                if not any(keyword in lowered for keyword in meta_words):
                    _remember(text)
    except Exception as e:
        print(f"Error metode 4: {e}")

    return comments_data, len(comments_data)

# === Fungsi untuk mengambil likes dengan lebih akurat ===
def get_likes_count(soup):
    """Extract the like count shown on a post page.

    Three lookups are tried in order: an ``<a>`` whose text looks like
    "<number> ... like", a ``<button>`` whose aria-label mentions "like", and
    finally any ``<span>`` whose text mentions "like". A lookup that matches
    an element without a number falls through to the next one.

    Args:
        soup: BeautifulSoup tree parsed from the post page.

    Returns:
        The count as displayed text (digits with optional comma grouping,
        e.g. ``'1,234'``), or ``''`` when nothing is found or an error occurs.
    """
    # Must start with a digit so a stray comma in the text is never returned.
    number_pattern = re.compile(r'\d+(?:,\d+)*')

    def _first_number(text):
        """Return the first number found in text, or '' if there is none."""
        match = number_pattern.search(text)
        return match.group(0) if match else ''

    try:
        # Lookup 1: link text of the form "X likes".
        likes_link = soup.find('a', string=re.compile(r'\d+.*like', re.I))
        if likes_link:
            count = _first_number(likes_link.get_text())
            if count:
                return count

        # Lookup 2: button labelled "like".
        likes_button = soup.find('button', {'aria-label': re.compile(r'like', re.I)})
        if likes_button:
            count = _first_number(likes_button.get_text())
            if count:
                return count

        # Lookup 3: first span that mentions "like" and contains a number.
        for span in soup.find_all('span'):
            text = span.get_text().lower()
            if 'like' in text:
                count = _first_number(text)
                if count:
                    return count

    except Exception as e:
        print(f"Error getting likes: {e}")

    return ''

# === Setup Chrome dengan stabilitas yang lebih baik ===
def create_driver():
    """Launch a Chrome WebDriver configured for this scraper.

    The chromedriver binary is resolved through ``ChromeDriverManager``.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    switches = (
        "--start-maximized",
        "--disable-notifications",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-web-security",
        "--disable-extensions",
        "--disable-plugins",
    )
    # Content-setting prefs for notifications and media streams
    # (value 2 presumably means "block" -- confirm against Chrome prefs).
    content_prefs = {
        "profile.default_content_setting_values": {
            "notifications": 2,
            "media_stream": 2,
        }
    }
    experimental = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
        ("prefs", content_prefs),
    )

    chrome_options = Options()
    for switch in switches:
        chrome_options.add_argument(switch)
    for option_name, option_value in experimental:
        chrome_options.add_experimental_option(option_name, option_value)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )

# === Fungsi untuk restart driver jika error ===
def restart_driver_if_needed(driver):
    """Return a usable driver, replacing it when its session is dead.

    The driver is probed by reading ``current_url``. If that raises, the old
    driver is quit (best effort) and a new one from ``create_driver()`` is
    returned.

    NOTE(review): the replacement is a brand-new browser; a login performed
    in the old session is presumably not carried over -- confirm.

    Args:
        driver: The Selenium driver currently in use.

    Returns:
        ``driver`` itself when the probe succeeds, otherwise a new driver.
    """
    try:
        driver.current_url  # cheap probe; raises when the session is gone
    except Exception:
        # "except Exception" rather than a bare except, so Ctrl+C
        # (KeyboardInterrupt) and SystemExit still stop the script.
        print("🔄 Driver session bermasalah, restart driver...")
        try:
            driver.quit()
        except Exception:
            # The session is already broken; failing to quit it is expected.
            pass
        return create_driver()
    return driver

driver = create_driver()

# === Manual login: open the home page and wait for the user to sign in ===
driver.get('https://www.instagram.com/')
print("🔑 Silakan login manual dulu...")
time.sleep(40)

# === Target profile ===
username_target = 'batikkultur'
profile_url = f'https://www.instagram.com/{username_target}/'
driver.get(profile_url)
time.sleep(5)

# === Read the account category from the profile page ===
profile_soup = BeautifulSoup(driver.page_source, 'html.parser')
kategori_akun = ''
try:
    # Attempt 1: a <div> whose own text contains one of the known keywords.
    keyword_hits = profile_soup.find_all(
        'div',
        string=re.compile(r'Business|Shopping|Brand|Store|Art|Culture', re.I),
    )
    if keyword_hits:
        kategori_akun = keyword_hits[0].get_text().strip()

    # Attempt 2: first short, non-empty text in a <div> whose class
    # matches "_aa_c".
    if not kategori_akun:
        for bio_div in profile_soup.find_all('div', {'class': re.compile(r'_aa_c')}):
            bio_text = bio_div.get_text().strip()
            if bio_text and len(bio_text) < 100:  # categories are short
                kategori_akun = bio_text
                break

except Exception as e:
    kategori_akun = ''
    print(f"Error getting category: {e}")

print(f"✅ Kategori akun: {kategori_akun}")

# === Scroll to the bottom repeatedly so more posts get loaded ===
scroll_times = 50
scrolls_done = 0
while scrolls_done < scroll_times:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    scrolls_done += 1

# === Collect the unique post and reel links, keeping first-seen order ===
posts = driver.find_elements(By.XPATH, '//a[contains(@href, "/p/") or contains(@href, "/reel/")]')
post_links = list(dict.fromkeys(anchor.get_attribute('href') for anchor in posts))

print(f"✅ Total post ditemukan: {len(post_links)}")

# Cap the number of posts to process (first 100).
post_links = post_links[:100]

# === Output rows: one tuple per post ===
data = []

# Loop-invariant settings, defined once instead of on every iteration.
max_retries = 3
max_load_attempts = 10
# XPath candidates for the "load more comments" control (Indonesian/English).
load_more_selectors = [
    "//button[.//svg[@aria-label='Muat komentar lainnya']]",
    "//button[.//svg[@aria-label='Load more comments']]",
    "//button[contains(text(), 'View more comments')]",
    "//button[contains(text(), 'Lihat komentar lainnya')]",
    "//span[contains(text(), 'View more comments')]",
    "//span[contains(text(), 'Lihat komentar lainnya')]"
]

# === Visit every post, retrying each one on failure ===
for idx, link in enumerate(post_links):
    # These depend only on constants and the link itself. They are set before
    # the try so the failure row below never hits an undefined name or reuses
    # a value left over from the previous post.
    brand = username_target
    content_category = 'Batik Kultur post'
    if link and '/reel/' in link:
        media_type = 'reel'
    elif link and '/p/' in link:
        media_type = 'post'
    else:
        media_type = ''

    retry_count = 0

    while retry_count < max_retries:
        try:
            # Swap in a fresh driver if the current session has died.
            driver = restart_driver_if_needed(driver)

            driver.get(link)
            time.sleep(5)

            # Click "load more comments" until it disappears or the cap is hit.
            load_more_attempts = 0

            while load_more_attempts < max_load_attempts:
                try:
                    load_more_found = False
                    for selector in load_more_selectors:
                        try:
                            load_more = driver.find_element(By.XPATH, selector)
                            load_more.click()
                            load_more_found = True
                            time.sleep(2)
                            break
                        except NoSuchElementException:
                            continue

                    if not load_more_found:
                        break

                    load_more_attempts += 1

                except Exception as e:
                    print(f"Error loading more comments: {e}")
                    break

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            try:
                caption = ''

                # Attempt 1: <h1> carrying the specific caption class list.
                caption_h1 = soup.find('h1', class_='_ap3a _aaco _aacu _aacx _aad7 _aade')
                if caption_h1:
                    caption = caption_h1.get_text().strip()

                # Attempt 2: the page's meta description.
                if not caption:
                    meta_desc = soup.find('meta', {'name': 'description'})
                    if meta_desc:
                        caption = meta_desc.get('content', '')

                # Attempt 3: first <span dir="auto"> longer than 20 characters.
                if not caption:
                    spans = soup.find_all('span', dir='auto')
                    for span in spans:
                        text = span.get_text().strip()
                        if len(text) > 20:
                            caption = text
                            break

            except Exception as e:
                caption = ''
                print(f"Error getting caption: {e}")

            likes = get_likes_count(soup)

            comments_data, comments_count = get_comments_data(driver, soup)

            # NOTE(review): soup.find('img') returns the first <img> in the
            # document, which may not be the post's own media -- confirm.
            try:
                media = soup.find('img')
                if media:
                    media_url = media.get('src', '')
                else:
                    video = soup.find('video')
                    media_url = video.get('src', '') if video else ''
            except Exception:
                media_url = ''

            # Dominant color is only computed for image posts.
            dominant_color = ''
            color_name = ''
            if media_url and media_type == 'post':
                try:
                    # A timeout keeps one stalled download from hanging the
                    # whole run; HTTP errors are reported instead of being
                    # handed to the image decoder.
                    response = requests.get(media_url, timeout=30)
                    response.raise_for_status()
                    img = BytesIO(response.content)
                    ct = ColorThief(img)
                    dominant_color = ct.get_color(quality=1)
                    color_name = get_color_name(dominant_color)
                except Exception as e:
                    print(f"Error getting color: {e}")
                    dominant_color = ''
                    color_name = ''

            try:
                upload_time = soup.find('time')['datetime']
            except Exception:
                # No <time> element or no datetime attribute.
                upload_time = ''

            # Join all comments into a single cell value.
            all_comments = ' | '.join(comments_data) if comments_data else ''

            data.append((
                brand, link, caption, likes, comments_count, all_comments, media_url, media_type,
                str(dominant_color), color_name, content_category, upload_time, kategori_akun
            ))

            print(f"[{idx+1}] ✅ {link} | {media_type} | Comments: {comments_count} | Likes: {likes}")
            print(f"    📝 Comments preview: {all_comments[:100]}...")

            # Success: leave the retry loop.
            break

        except Exception as e:
            retry_count += 1
            print(f"❌ Error pada post {idx+1} (retry {retry_count}/{max_retries}): {e}")

            if retry_count < max_retries:
                print("🔄 Mencoba ulang...")
                time.sleep(10)  # wait longer before retrying
            else:
                print(f"❌ Gagal setelah {max_retries} percobaan, skip post ini")
                # Record an empty row so the failed post still appears.
                data.append((
                    brand, link, '', '', 0, '', '', media_type,
                    '', '', content_category, '', kategori_akun
                ))

    time.sleep(5)  # pause between posts
    

# === Save to Excel; fall back to CSV so scraped data is not lost ===
df = pd.DataFrame(data, columns=[
    'brand', 'url_post', 'caption', 'likes', 'comments_count', 'all_comments',
    'media_url', 'media_type', 'dominant_color', 'color_name',
    'content_category', 'upload_time', 'profile_category'
])

try:
    df.to_excel('konten_ig_Batikula_baru.xlsx', index=False)
    print("\n✅ Selesai! Data disimpan ke 'konten_ig_Batikula_baru.xlsx'")
except Exception as e:
    # The Excel write failed (for example no Excel writer engine installed,
    # or the target file cannot be written); keep the results as CSV instead.
    print(f"Error saving Excel: {e}")
    df.to_csv('konten_ig_Batikula_baru.csv', index=False)
    print("\n✅ Selesai! Data disimpan ke 'konten_ig_Batikula_baru.csv'")
finally:
    # Always close the browser, even when saving fails.
    driver.quit()