In [2]:
import json
import logging
import time
from datetime import datetime
from typing import Dict, List, Optional

from bs4 import BeautifulSoup
from bs4.element import PageElement
from pydantic import BaseModel
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.remote.webelement import WebElement

In [3]:
class CustomFormatter(logging.Formatter):
    log_format = "[%(asctime)s.%(msecs)03d] [%(levelname)s] %(message)s"

    def __init__(self):
        super().__init__(self.log_format, datefmt="%Y-%m-%d %H:%M:%S")


# remove any existing handlers to prevent double logging
if logging.getLogger().hasHandlers():
    logging.getLogger().handlers.clear()

handler = logging.StreamHandler()
handler.setFormatter(CustomFormatter())

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)


def log_func(func):
    def wrapper(*args, **kwargs):
        logger.info(f"[{func.__name__}] args: {args}, kwargs: {kwargs}")
        return func(*args, **kwargs)

    return wrapper


logger.info("Logger setup complete")

[2024-10-25 12:41:06.913] [INFO] Logger setup complete


In [4]:
class Post(BaseModel):
    caption: Optional[str]  # Caption can be None or a string
    comments: Optional[List[Optional[str]]]  # Comments can be None or a list of strings


class DatasetModel(BaseModel):
    data: Dict[str, Post]  # URL keys with Post values
    author: Optional[str] = (
        "Putu Widyantara Artanta Wibawa"
    )
    updated_at: Optional[str] = None


class Dataset:
    def __init__(self, data_dict=None):
        """Initialize with a dictionary and store it in self.data after validation."""
        if data_dict is None:
            data_dict = {}
        self.data = DatasetModel(data=data_dict)

    @classmethod
    def from_json(cls, json_file):
        """Load JSON file, validate it, and return a new Dataset instance."""
        try:
            with open(json_file, "r") as file:
                json_data = json.load(file)
                return cls(data_dict=json_data.get("data", {}))
        except FileNotFoundError:
            logging.error(f"Error: {json_file} not found.")
            return cls()
        except json.JSONDecodeError:
            logging.error(f"Error: Could not decode JSON from {json_file}.")
            return cls()
        except ValueError as e:
            logging.error(f"Validation error: {e}")
            return cls()

    def to_json(self, json_file):
        """Save self.data to a JSON file."""
        with open(json_file, "w") as file:
            self.data.updated_at = datetime.now().isoformat()
            json.dump(self.data.model_dump(), file, indent=4)

In [7]:
TWITTER_BASE_URL = "https://x.com"
INSTAGRAM_BASE_URL = "https://www.instagram.com"
FACEBOOK_BASE_URL = "https://www.facebook.com"
TIKTOK_BASE_URL = "https://www.tiktok.com"

# Instagram

In [5]:
webdriver = Chrome()



In [157]:
webdriver.get(INSTAGRAM_BASE_URL)

# need login first, so wait for user to login
# time.sleep(60)

In [6]:
@log_func
def show_first_post_ig(url: str):
    try:
        webdriver.get(url)
        time.sleep(2)
        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        divs = soup.find_all(
            "div",
            class_="x1lliihq x1n2onr6 xh8yej3 x4gyw5p xfllauq xo2y696 x11i5rnm x2pgyrj",
        )
        list_urls = []

        for div in divs:
            a_tag = div.find("a", recursive=False)
            if a_tag and "href" in a_tag.attrs:
                list_urls.append(a_tag["href"])

        element = webdriver.find_element(By.XPATH, f'//a[@href="{list_urls[0]}"]')
        element.click()
    except Exception as e:
        logger.error(str(e).split("\n")[0])

In [7]:
@log_func
def get_caption_ig() -> Optional[str]:
    try:
        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        divs = soup.find_all("div", class_="_a9zs")
        for div in divs:
            h1_tag = div.find(
                "h1", class_="_ap3a _aaco _aacu _aacx _aad7 _aade", recursive=False
            )
            for br in h1_tag.find_all("br"):
                br.replace_with("\n")
            if h1_tag:
                return h1_tag.text
        return None
    except Exception as e:
        logger.error(str(e).split("\n")[0])
        return None

In [11]:
def _get_element_xpath(element: WebElement) -> Optional[str]:
    try:
        full_xpath = webdriver.execute_script(
            """
            function getElementXPath(element) {
                if (element.id !== '') {
                    return 'id("' + element.id + '")';
                }
                if (element === document.body) {
                    return element.tagName.toLowerCase();
                }

                let ix = 0;
                const siblings = element.parentNode.childNodes;
                let sameTagSiblings = 0;

                for (let i = 0; i < siblings.length; i++) {
                    if (siblings[i].nodeType === 1 && siblings[i].tagName === element.tagName) {
                        sameTagSiblings++;
                    }
                }
                
                for (let i = 0; i < siblings.length; i++) {
                    const sibling = siblings[i];
                    if (sibling === element) {
                        let text = "";
                        
                        if (sameTagSiblings > 1) {
                            text = '[' + (ix + 1) + ']';
                        }
                        
                        return getElementXPath(element.parentNode) + '/' + element.tagName.toLowerCase() + text;
                    }

                    if (sibling.nodeType === 1 && sibling.tagName === element.tagName) {
                        ix++;
                    }
                }
            }
            return getElementXPath(arguments[0]);

        """,
            element,
        )
        result = f"/html/{full_xpath}"
        return result
    except Exception as e:
        logger.error(str(e).split("\n")[0])
        return None

In [9]:
@log_func
def load_more_comments_ig():
    try:
        title = webdriver.find_element(
            By.XPATH, "//*[contains(text(), 'Load more comments')]"
        )
        if title:
            is_found = True
            while is_found:
                try:
                    title = webdriver.find_element(
                        By.XPATH, "//*[contains(text(), 'Load more comments')]"
                    )
                    title_xpath = _get_element_xpath(title)
                    button_xpath = title_xpath[
                        : title_xpath.rfind("button") + len("button")
                    ]
                    try:
                        button_element = webdriver.find_element(By.XPATH, button_xpath)
                        button_element.click()
                    except Exception as e:
                        logger.error(str(e).split("\n")[0])
                        is_found = False
                except Exception as e:
                    logger.error(str(e).split("\n")[0])
                    is_found = False
    except Exception as e:
        logger.error(str(e).split("\n")[0])

In [10]:
@log_func
def show_replies_ig():
    try:

        button = webdriver.find_elements(
            "xpath", "//button[contains(@class, '_acan _acao _acas _aj1- _ap30')]"
        )
        result_button = [
            b
            for b in button
            if (b.text.startswith("View replies") or b.text.startswith("View all"))
        ]
        total_button = len(result_button)
        if total_button > 0:
            for b in result_button:
                b.click()
            logger.info(f"Total button clicked: {total_button}")
        else:
            logger.warning("No replies found")
    except Exception as e:
        logger.error(str(e).split("\n")[0])

In [11]:
@log_func
def get_comments_ig() -> list[str]:
    try:
        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        comments = soup.find_all("div", class_="_a9zs")
        result = []
        for div in comments:
            span_tag = div.find(
                "span", class_="_ap3a _aaco _aacu _aacx _aad7 _aade", recursive=False
            )
            if span_tag:
                result.append(span_tag.text)
        logger.info(f"Total comments found: {len(result)}")
        return result
    except Exception as e:
        logger.error(str(e).split("\n")[0])
        return list()

In [12]:
@log_func
def next_post_ig():
    try:
        button = webdriver.find_element(
            By.XPATH,
            f'//span[@style="display: inline-block; transform: rotate(90deg);"]',
        )
        button.click()
    except Exception as e:
        logger.error(str(e).split("\n")[0])


@log_func
def has_next_post_ig() -> bool:
    try:
        webdriver.find_element(
            By.XPATH,
            f'//span[@style="display: inline-block; transform: rotate(90deg);"]',
        )
        return True
    except Exception as e:
        logger.error(str(e).split("\n")[0])
        return False

In [13]:
@log_func
def _get_single_post_data_ig() -> Post:
    load_more_comments_ig()
    show_replies_ig()
    caption = get_caption_ig()
    comments = get_comments_ig()
    return Post(caption=caption, comments=comments)

In [168]:
def scraping_instagram(username: str, max_posts: Optional[int] = -1) -> Dataset:
    try:
        if max_posts == 0:
            return Dataset()

        result = Dataset()
        url = f"{INSTAGRAM_BASE_URL}/{username}/"
        show_first_post_ig(url)

        # get data
        post_data = _get_single_post_data_ig()
        # 'https://www.instagram.com/p/:POST_ID/?img_index=1'
        post_id = webdriver.current_url
        result.data.data.update({post_id: post_data})

        max_posts -= 1 # because we already get the first post
        if max_posts == -1:
            while has_next_post_ig():
                next_post_ig()
                time.sleep(2)
                post_data = _get_single_post_data_ig()
                post_id = webdriver.current_url
                result.data.data.update({post_id: post_data})
        else:
            while max_posts and has_next_post_ig():
                next_post_ig()
                max_posts -= 1
                time.sleep(2)
                post_data = _get_single_post_data_ig()
                post_id = webdriver.current_url
                result.data.data.update({post_id: post_data})
            if max_posts:
                logger.warning("Total post less than expected")

        # stats
        scraped_posts = len(result.data.data)
        scraped_comments = sum(
            len(post.comments) for post in result.data.data.values()
        )
        logger.info(f"Total post scraped: {scraped_posts}")
        logger.info(f"Total comments scraped: {scraped_comments}")

        return result
    except Exception as e:
        logger.error(str(e).split("\n")[0])
        return Dataset()

In [169]:
instagram_dataset = scraping_instagram("putu_waw")

[2024-10-17 22:06:13.642] [INFO] [show_first_post]
[2024-10-17 22:06:17.310] [INFO] [_get_single_post_data]
[2024-10-17 22:06:17.312] [INFO] [load_more_comments]
[2024-10-17 22:06:17.339] [ERROR] Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[contains(text(), 'Load more comments')]"}
[2024-10-17 22:06:17.340] [INFO] [show_replies]
[2024-10-17 22:06:17.385] [INFO] [get_caption]
[2024-10-17 22:06:17.900] [INFO] [get_comments]
[2024-10-17 22:06:18.430] [INFO] Total comments found: 0
[2024-10-17 22:06:18.439] [INFO] [has_next_post]
[2024-10-17 22:06:18.453] [INFO] [next_post]
[2024-10-17 22:06:20.585] [INFO] [_get_single_post_data]
[2024-10-17 22:06:20.586] [INFO] [load_more_comments]
[2024-10-17 22:06:20.604] [ERROR] Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[contains(text(), 'Load more comments')]"}
[2024-10-17 22:06:20.605] [INFO] [show_replies]
[2024-10-17 22:06:20.704] [INFO] Total button clicked: 1
[202

In [173]:
instagram_dataset.data.model_dump()

{'data': {'https://www.instagram.com/p/C3W-SslrQpp/': {'caption': 'Halo seluruh mahasiswa Indonesia. Saya siap mengikuti Magang dan Studi Independen Bersertifikat Angkatan 6!',
   'comments': []},
  'https://www.instagram.com/p/Cv4iiXPLDl6/': {'caption': 'Halo! Saya Putu Widyantara Artanta Wibawa dari Universitas Udayana siap mengikuti National Onboarding MSIB Angkatan 5!\n\n#BerprosesLebihBaik #KampusMerdeka #MSIB5 #MagangMerdeka #MagangBersertifikat #BukanMagangdanStudiBiasa #MSIB5',
   'comments': ['Mangaaat', '🔥', 'Great My son😍', 'Semangat frenn🔥']},
  'https://www.instagram.com/p/CZqhWPWlNYN/': {'caption': '[SAYA SIAP MENGIKUTI MAHASISYA UPANAYANA XIX]\n\nOm Swastyastu 🙏\n"Om Ano Bhadrah Kratavo Yantu Visvatah" - (Yajur Veda XXV. 14)\n(Semoga pikiran yang baik datang dari segala penjuru)\n\nMahasisya Upanayana merupakan upacara penyucian diri dengan tujuan memohon doa restu secara niskala tatkala seorang mahasiswa akan menuntut ilmu dan berguru di Universitas Udayana.\n\nSaya Put

In [180]:
# current_dataset = Dataset.from_json("aneh.json")
# current_dataset.data.data.update(instagram_dataset.data.data)
# current_dataset.to_json("aneh.json")

# Twitter

In [None]:
webdriver = Chrome()

In [181]:
webdriver.get(TWITTER_BASE_URL)

In [182]:
@log_func
def _scraping_profile_tweet(dataset: Dataset, post: Optional[int] = -1):
    history = list()

    while True:
        webdriver.execute_script("window.scrollBy(0, 300);")
        time.sleep(0.3)

        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        divs = soup.find_all("div", attrs={"data-testid": "tweetText"})

        url_list = []
        div_elements = soup.find_all("div", class_="css-175oi2r r-18u37iz r-1q142lx")
        for div_element in div_elements:
            a_tag = div_element.find("a", recursive=False)
            if a_tag and "href" in a_tag.attrs:
                url_list.append(a_tag["href"])

        min_idx = min(len(divs), len(url_list))
        for idx in range(min_idx):
            url = f"{TWITTER_BASE_URL}{url_list[idx]}"
            dataset.data.data.update(
                {url: Post(caption=divs[idx].text, comments=[])}
            )

        length_data = len(dataset.data.data)
        logger.info(f"Total tweets scraped: {length_data}")
        if post != -1 and length_data >= post:
            break
        history.append(length_data)

        if len(history) > 10:
            if history[-10] == history[-1]:
                logger.info("No new tweets found")
                break

In [183]:
@log_func
def _scraping_tweet_comment():
    result = list()
    history = list()

    while True:
        webdriver.execute_script("window.scrollBy(0, 300);")
        time.sleep(0.3)

        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        divs = soup.find_all("div", attrs={"data-testid": "tweetText"})

        for div in divs:
            if div.text not in result:
                result.append(div.text)

        logger.info(f"Total comment tweets scraped: {len((result))}")
        history.append(len(result))

        if len(history) > 10:
            if history[-10] == history[-1]:
                logger.info("No new comment found")
                break
    return result

In [184]:
def scraping_twitter(username: str, max_posts: Optional[int] = -1) -> Dataset:
    dataset = Dataset()
    webdriver.get(f"{TWITTER_BASE_URL}/{username}")
    _scraping_profile_tweet(dataset, max_posts)

    for url in dataset.data.data.keys():
        webdriver.get(f"{url}")
        logger.info(f"Scraping comments for tweet: {url}")
        time.sleep(4)

        comments = _scraping_tweet_comment()

        current_caption = dataset.data.data.get(url).caption
        comments.remove(current_caption)

        logger.info(f"Final comments scraped: {len(comments)}")
        dataset.data.data.get(url).comments = comments
    return dataset

In [190]:
twitter_dataset = scraping_twitter("putu_waw")

[2024-10-17 22:19:59.310] [INFO] [_scraping_profile_tweet]
[2024-10-17 22:19:59.887] [INFO] Total tweets scraped: 0
[2024-10-17 22:20:00.298] [INFO] Total tweets scraped: 0
[2024-10-17 22:20:00.677] [INFO] Total tweets scraped: 0
[2024-10-17 22:20:01.057] [INFO] Total tweets scraped: 0
[2024-10-17 22:20:01.579] [INFO] Total tweets scraped: 0
[2024-10-17 22:20:02.445] [INFO] Total tweets scraped: 3
[2024-10-17 22:20:02.957] [INFO] Total tweets scraped: 3
[2024-10-17 22:20:03.372] [INFO] Total tweets scraped: 4
[2024-10-17 22:20:03.753] [INFO] Total tweets scraped: 4
[2024-10-17 22:20:04.146] [INFO] Total tweets scraped: 4
[2024-10-17 22:20:04.577] [INFO] Total tweets scraped: 4
[2024-10-17 22:20:05.024] [INFO] Total tweets scraped: 5
[2024-10-17 22:20:05.493] [INFO] Total tweets scraped: 6
[2024-10-17 22:20:05.899] [INFO] Total tweets scraped: 6
[2024-10-17 22:20:06.389] [INFO] Total tweets scraped: 7
[2024-10-17 22:20:06.869] [INFO] Total tweets scraped: 7
[2024-10-17 22:20:07.337] [IN

In [191]:
twitter_dataset.data.model_dump()

{'data': {'https://x.com/putu_waw/status/1747615071537361226': {'caption': "The waiting is over! I'm very happy because I have already get the final transcript for Bangkit 2023. I hope I can become one of the Bangkit distinct graduation \n#lifeatbangkit",
   'comments': []},
  'https://x.com/putu_waw/status/1747614627301867992': {'caption': 'Finally, I completed 2 optional courses given by Banfkit about TensorFlow Advance Technique and NLP. Next is completing the Dicoding course hehe\n\n#lifeatbangkit',
   'comments': []},
  'https://x.com/putu_waw/status/1747613877423206496': {'caption': "Hi everyone, I'm very happy to share with you that finally I completed all of the course at Bangkit. Letsgoo\n#lifeatbangkit",
   'comments': []},
  'https://x.com/streamlit/status/1690045031640375296': {'caption': " Putu Widyantara Artanta Wibawa \n\n@putu_waw's @CockroachDB Connection! The demo app shows how to build the connection and query the database.\n\n Connection: https://buff.ly/4412DWs\n A

# Facebook

In [5]:
webdriver = Chrome()

In [6]:
webdriver.get(FACEBOOK_BASE_URL)

In [12]:
def _get_xpath_from_bs4_element(element: PageElement) -> Optional[str]:
    try:
        components = []
        while element:
            siblings = element.find_previous_siblings(element.name)
            if siblings:  # only add index if there are siblings
                index = len(siblings) + 1
                components.append(f"{element.name}[{index}]")
            else:
                components.append(f"{element.name}")
            element = element.parent
        result = "/" + "/".join(reversed(components))
        result = result.replace("/[document]", "")
        return result
    except Exception as e:
        logger.error(str(e))
        return None

In [190]:
@log_func
def _get_reels_post_id_fb() -> set:
    result = set()
    try:
        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        reels_anchor = soup.find_all("a", class_="x1i10hfl x1qjc9v5 xjbqb8w xjqpnuy xa49m3k xqeqjp1 x2hbi6w x13fuv20 xu3j5b3 x1q0q8m5 x26u7qi x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xdl72j9 x2lah0s xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r x2lwn1j xeuugli xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1ja2u2z x1t137rt x1o1ewxj x3x9cwd x1e5q0jg x13rtm0m x1q0g3np x87ps6o x1lku1pv x1rg5ohu x1a2a7pz x1n2onr6 xh8yej3")
        for a in reels_anchor:
            url = "/".join(a['href'].split("/")[:3])
            url = f"{FACEBOOK_BASE_URL}{url}"
            result.add(url)
        return result
    except Exception as e:
        logger.error(str(e))
        return result

In [191]:
@log_func
def _show_reels_caption_fb():
    see_more_element = webdriver.find_elements(
        by=By.XPATH, value="//div[contains(text(), 'See more')]"
    )
    for element in see_more_element:
        try:
            element.click()
        except Exception as e:
            logger.error(str(e).split("\n")[0])

In [192]:
@log_func
def get_reels_caption_fb():
    # show full caption
    _show_reels_caption_fb()
    time.sleep(1)

    caption = ""
    soup = BeautifulSoup(webdriver.page_source, "html.parser")
    outer_div = soup.find_all("div", class_="xyamay9 x1pi30zi x1swvt13 xjkvuk6")
    for div in outer_div:
        spans = div.find_all("span", class_="x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm x17z8epw")
        for span in spans:
            caption += span.text
    return caption

In [193]:
def _show_replied_more_comments_fb():
    replied_element = webdriver.find_elements(
        by=By.XPATH, value="//span[contains(text(), 'replied')]"
    )
    for element in replied_element:
        try:
            element.click()
        except Exception as e:
            logger.error(str(e).split("\n")[0])

        # show more comment
    more_comment_element = webdriver.find_elements(
        by=By.XPATH, value="//span[contains(text(), 'more comments')]"
    )
    for element in more_comment_element:
        try:
            element.click()
        except Exception as e:
            logger.error(str(e).split("\n")[0])

In [194]:
@log_func
def _show_reels_comment_fb():
    comment_button = webdriver.find_elements(
        by=By.XPATH, value="//div[@aria-label='Comment']"
    )
    for element in comment_button:
        try:
            element.click()
        except Exception as e:
            logger.error(str(e).split("\n")[0])

In [195]:
@log_func
def get_reels_comment_fb(
    show_comment: bool = True,
    previous_comments: Optional[int] = 0,
    current_iteration: Optional[int] = 1,
    max_iteration: Optional[int] = 20,
):
    # show comment section
    if show_comment:
        _show_reels_comment_fb()
        time.sleep(2)

    # get captions reels
    result = []
    soup = BeautifulSoup(webdriver.page_source, "html.parser")
    outer_span = soup.find_all(
        "span",
        class_="x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u",
    )
    for div in outer_span:
        divs = div.find_all("div", class_="xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs")
        for d in divs:
            anchors = d.find_all("a")
            replied_username = None
            for a in anchors:
                if a:
                    replied_username = a.text
            comment: str = (" ".join(div.stripped_strings))
            if replied_username and comment.startswith(replied_username):
                comment = comment[len(replied_username)+1:] # +1 to remove space
                result.append(comment)
            else:
                result.append(comment)

    if len(result) > previous_comments and current_iteration < max_iteration:
        _show_replied_more_comments_fb()
        time.sleep(3)
        return get_reels_comment_fb(
            show_comment=False,
            previous_comments=len(result),
            current_iteration=current_iteration + 1,
        )

    return result

In [196]:
@log_func
def _get_video_caption_fb():
    caption = ""
    new_soup = BeautifulSoup(webdriver.page_source, "html.parser")
    outer_divs = new_soup.find_all("div", class_="x1swvt13 x1pi30zi xyamay9")
    for outer_div in outer_divs:
        spans = outer_div.find_all(
            "span",
            class_="x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u",
        )
        for span in spans:
            divs = span.find_all(
                "div", class_="xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs"
            )
            for div in divs:
                caption += div.text
    return caption

In [197]:
@log_func
def _get_video_comment_fb(
    previous_comments: Optional[int] = 0,
    current_iteration: Optional[int] = 1,
    max_iteration: Optional[int] = 20,
):
    result = []
    soup = BeautifulSoup(webdriver.page_source, "html.parser")
    outer_span = soup.find_all(
        "span",
        class_="x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u",
    )
    for span in outer_span:
        divs = span.find_all(
            "div",
            class_="xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs",
        )
        for div in divs:
            anchors = div.find_all("a")
            replied_username = None
            for a in anchors:
                if a:
                    replied_username = a.text
            comment: str = " ".join(div.stripped_strings)
            if replied_username and comment.startswith(replied_username):
                comment = comment[len(replied_username) + 1 :]  # +1 to remove space
                result.append(comment)
            else:
                result.append(comment)

    logger.info(f"Total comments found: {len(result)} vs {previous_comments}")
    if len(result) > previous_comments and current_iteration < max_iteration:
        _show_replied_more_comments_fb()
        time.sleep(3)
        return _get_video_comment_fb(
            previous_comments=len(result), current_iteration=current_iteration + 1
        )

    return result

In [198]:
@log_func
def get_caption_fb():
    if "/videos" in webdriver.current_url or "/watch" in webdriver.current_url:
        # can't be used outside, because /videos give different UI result
        caption = _get_video_caption_fb()
        return caption

    if "/reel" in webdriver.current_url:
        caption = get_reels_caption_fb()
        return caption

    caption = ""
    new_soup = BeautifulSoup(webdriver.page_source, "html.parser")
    outer_divs = new_soup.find_all(
        "div",
        class_="x1l90r2v x1pi30zi x1swvt13 x1iorvi4",
        attrs={"data-ad-preview": "message"},
    )
    for d in outer_divs:
        new_divs = d.find_all("div", class_="xu06os2 x1ok221b")
        for div in new_divs:
            span = div.find_all(
                "span",
                class_="x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u x1yc453h",
            )
            for s in span:
                caption += s.text
    return caption

In [199]:
@log_func
def get_post_id_fb(max_posts: int = -1) -> List[str]:
    result = set()
    history = list()

    while True:
        webdriver.execute_script("window.scrollBy(0, 300);")
        time.sleep(0.3)

        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        spans = soup.find_all(
            "span",
            class_="x4k7w5x x1h91t0o x1h9r5lt x1jfb8zj xv2umb2 x1beo9mf xaigb6o x12ejxvf x3igimt xarpa2k xedcshv x1lytzrv x1t2pt76 x7ja8zs x1qrby5j",
        )
        for span in spans:
            a_tags = span.find_all(
                "a",
                class_="x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1sur9pj xkrqix3 xi81zsa x1s688f",
            )
            for a_tag in a_tags:
                if "href" in a_tag.attrs:
                    post_id: str = a_tag["href"]
                    if post_id.startswith("https"):
                        params_idx = post_id.find("?")
                        if params_idx != -1:
                            result.add(post_id[:params_idx])
                        else:
                            result.add(post_id)
                    else:
                        # href not converted into post id
                        # need to hover on the link to make it change
                        logger.warning("Found href not converted into post id")
                        try:
                            xpath = _get_xpath_from_bs4_element(a_tag)
                            element = webdriver.find_element(By.XPATH, xpath)
                            action = ActionChains(webdriver)
                            action.move_to_element(element).perform()
                            time.sleep(0.3)
                        except Exception as e:
                            logger.error(str(e).split("\n")[0])

        # search for reels id
        reels_id = _get_reels_post_id_fb()
        result.update(reels_id)

        history.append(len(result))
        if max_posts != -1 and len(result) >= max_posts:
            logger.info("Break because max posts reached")
            break

        logger.info(f"Total post id scraped: {len(result)}")
        if len(history) > 5:
            if history[-5] == history[-1]:
                logger.info("No new post found")
                break

    return list(result)

In [200]:
@log_func
def get_comments_fb():
    result = list()
    history = list()

    if "/videos" in webdriver.current_url or "/watch" in webdriver.current_url:
        # can't be used outside, because /videos give different UI result
        caption = _get_video_comment_fb()
        return caption

    if "/reel" in webdriver.current_url:
        result = get_reels_comment_fb()
        return result
    
    repeat = True
    while repeat:
        webdriver.execute_script("window.scrollBy(0, 300);")

        # click all replied
        try:
            replied_buttons = webdriver.find_elements(By.XPATH, "//span[contains(text(), 'replied')]")
            for element in replied_buttons:
                element.click()

            more_comments_button = webdriver.find_elements(By.XPATH, "//span[contains(text(), 'more comments')]")
            for element in more_comments_button:
                element.click()
        except Exception as e:
            logger.error(str(e).split("\n")[0])
        time.sleep(0.3)

        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        divs = soup.find_all("div", class_="xwib8y2 xn6708d x1ye3gou x1y1aw1k")

        history.append(len(divs))
        logger.info(f"Searching more comments, found: {len(divs)}")
        if len(history) > 10:
            if history[-10] == history[-1]:
                logger.info("No new comments found")
                break

    logger.info("Start scrapping comments")
    soup = BeautifulSoup(webdriver.page_source, "html.parser")
    spans = soup.find_all("span", class_="x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u")
    for span in spans:
        divs = span.find_all("div", class_="xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs")
        for div in divs:
            anchors = div.find_all("a")
            replied_username = None
            for a in anchors:
                if a:
                    replied_username = a.text
            comment: str = (" ".join(div.stripped_strings))
            if replied_username and comment.startswith(replied_username):
                comment = comment[len(replied_username)+1:] # +1 to remove space
                result.append(comment)
            else:
                result.append(comment)
    logger.info(f"Total comments scraped: {len(result)}")
    
    return result

In [201]:
def scraping_facebook(user_id, max_posts=-1):
    dataset = Dataset()

    webdriver.get(f"{FACEBOOK_BASE_URL}/{user_id}")
    list_post_id = get_post_id_fb(max_posts=max_posts)
    for url in list_post_id:
        logger.info(f"Scraping post: {url}")
        webdriver.get(url)
        time.sleep(5)

        caption = get_caption_fb()
        comments = get_comments_fb()
        post = Post(caption=caption, comments=comments)
        dataset.data.data.update({url: post})

    return dataset

In [202]:
facebook_dataset = scraping_facebook("putu.widyantara.3")

[2024-10-24 00:41:29.559] [INFO] [get_post_id_fb] args: (), kwargs: {'max_posts': -1}
[2024-10-24 00:41:30.994] [INFO] [_get_reels_post_id_fb] args: (), kwargs: {}
[2024-10-24 00:41:31.373] [INFO] Total post id scraped: 1
[2024-10-24 00:41:33.341] [INFO] [_get_reels_post_id_fb] args: (), kwargs: {}
[2024-10-24 00:41:33.792] [INFO] Total post id scraped: 2
[2024-10-24 00:41:35.272] [INFO] [_get_reels_post_id_fb] args: (), kwargs: {}
[2024-10-24 00:41:35.690] [INFO] Total post id scraped: 4
[2024-10-24 00:41:37.027] [INFO] [_get_reels_post_id_fb] args: (), kwargs: {}
[2024-10-24 00:41:37.455] [INFO] Total post id scraped: 4
[2024-10-24 00:41:38.297] [INFO] [_get_reels_post_id_fb] args: (), kwargs: {}
[2024-10-24 00:41:38.672] [INFO] Total post id scraped: 5
[2024-10-24 00:41:39.393] [INFO] [_get_reels_post_id_fb] args: (), kwargs: {}
[2024-10-24 00:41:39.777] [INFO] Total post id scraped: 5
[2024-10-24 00:41:40.610] [INFO] [_get_reels_post_id_fb] args: (), kwargs: {}
[2024-10-24 00:41:41

In [203]:
facebook_dataset.data.model_dump()

{'data': {'https://web.facebook.com/putu.widyantara.3/posts/pfbid06ykeRc5eYQ39ovDYB3vBeS1tCYAK6Q5bNZBUm6FxRikd9w586bGuZzzhrg6GSXpQl': {'caption': 'Melepas rasa penat setelah UAS 1... Traveling to Lovina',
   'comments': []},
  'https://web.facebook.com/putu.widyantara.3/posts/pfbid023zyuMmC2RpMeFtEGa71jgTbvrBGru9kNX1QHJXDQWej8cQF9mgUm9FGA6uEfQcSBl': {'caption': '[Late Post]Serah terima jabatan kepengurusan OSIS SMA Negeri 1 Seririt Masa Bhakti 2017/2018 ke OSIS SMA Negeri 1 Seririt Masa Bhakti 2018/2019. Good luck!',
   'comments': ['Adikku mn kok gk klhtn yah hehe',
    'Ada kok Bu Herlina Wati , no 8 dari kanan',
    'bes cenik2 sing tpuk',
    'Pt jadi osis y...',
    'Ndak Om Artana Putu , itu dokumentasi dr pelantikan OSIS masa bhakti 2018/2019, nnti klo di tahunnya Putu 2019/2020.',
    'yy...mudah2n nti trpilih jdi osis...ikuti j kgiatn2 osis...']},
  'https://web.facebook.com/tuti.andayani/posts/pfbid0d7mdbWNL2LSrnXwXDRZ76K78UXe2v18GafUXj5JrJSgLTcRu39CUu3M2jAigyDrZl': {'caption

# TikTok

In [181]:
webdriver = Chrome()

In [182]:
webdriver.get(TIKTOK_BASE_URL)

In [183]:
def _is_need_captcha_tiktok():
    try:
        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        captcha = soup.find("div", class_="TUXModal captcha-verify-container")
        if captcha:
            logger.warning("CAPTCHA detected")
            return True
        return False
    except Exception as e:
        logger.error(str(e))
        return False

In [184]:
@log_func
def _await_for_captcha_resolved_tiktok():
    while _is_need_captcha_tiktok():
        logger.warning("Pending process. Please resolve CAPTCHA...")
        time.sleep(5)
    logger.info("CAPTCHA resolved")

In [185]:
@log_func
def get_caption_tiktok():
    _await_for_captcha_resolved_tiktok()
    soup = BeautifulSoup(webdriver.page_source, "html.parser")
    h1 = soup.find_all("h1", class_="css-1fbzdvh-H1Container ejg0rhn1")
    caption = ""
    for h in h1:
        caption += h.text
    return caption

In [186]:
@log_func
def get_comments_tiktok():
    result = set()
    history = list()

    while True:
        try:
            _await_for_captcha_resolved_tiktok()

            # scrolling
            div_elements = webdriver.find_element(
                By.CSS_SELECTOR, "div.css-1qp5gj2-DivCommentListContainer.ekjxngi3"
            )
            webdriver.execute_script("arguments[0].scrollTop += 300;", div_elements)
            time.sleep(2)

            # view replies
            replies = webdriver.find_elements(
                By.CSS_SELECTOR, "p.css-1flplee-PReplyActionText.eo72wou4"
            )
            for r in replies:
                try:
                    button_status = r.get_attribute("data-e2e")
                    if button_status != "comment-hide":
                        r.click()
                except Exception as e:
                    logger.error(str(e).split("\n")[0])
        except Exception as e:
            logger.error(str(e).split("\n")[0])

        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        paragraphs = soup.find_all("p", class_="css-xm2h10-PCommentText e1g2efjf6")
        for p in paragraphs:
            result.add(p.text)

        logger.info(f"Total comment scraped: {len((result))}")
        history.append(len(result))

        if len(history) > 10:
            if history[-10] == history[-1]:
                logger.info("No new comment found")
                break
    return list(result)

In [187]:
@log_func
def next_post_tiktok():
    try:
        next_button = webdriver.find_element(
            By.CSS_SELECTOR,
            "button.css-1s9jpf8-ButtonBasicButtonContainer-StyledVideoSwitch.e11s2kul11",
        )
        next_button.click()
    except Exception as e:
        logger.error(str(e).split("\n")[0])


@log_func
def has_next_post_tiktok() -> bool:
    try:
        # next button
        next_button = webdriver.find_element(
            By.CSS_SELECTOR,
            "button.css-1s9jpf8-ButtonBasicButtonContainer-StyledVideoSwitch.e11s2kul11",
        )
        # return True if disabled attribute is None
        return next_button.get_attribute("disabled") == None
    except Exception as e:
        logger.error(str(e).split("\n")[0])
        return False

In [203]:
@log_func
def show_first_post_tiktok():
    try:
        _await_for_captcha_resolved_tiktok()
        soup = BeautifulSoup(webdriver.page_source, "html.parser")
        divs = soup.find_all("div", class_="css-1uqux2o-DivItemContainerV2 e19c29qe17")

        xpath =_get_xpath_from_bs4_element(divs[0])
        element = webdriver.find_elements(By.XPATH, xpath)
        element[0].click()
    except Exception as e:
        logger.error(str(e).split("\n")[0])

In [189]:
def _get_single_post_data_tiktok(dataset: Dataset):
    _await_for_captcha_resolved_tiktok()
    caption = get_caption_tiktok()
    comments = get_comments_tiktok()
    post = Post(caption=caption, comments=comments)
    dataset.data.data.update({webdriver.current_url: post})

In [204]:
def scraping_tiktok(username: str, max_posts: Optional[int] = -1) -> Dataset:
    try:
        if max_posts == 0:
            return Dataset()

        result = Dataset()
        url = f"{TIKTOK_BASE_URL}/@{username}"
        webdriver.get(url)
        time.sleep(5)

        show_first_post_tiktok()
        _get_single_post_data_tiktok(result)
        max_posts -= 1
        
        if max_posts == -1:
            while has_next_post_tiktok():
                next_post_tiktok()
                time.sleep(2)
                _get_single_post_data_tiktok(result)
        else:
            while max_posts and has_next_post_tiktok():
                next_post_tiktok()
                max_posts -= 1
                time.sleep(2)
                _get_single_post_data_tiktok(result)
            if max_posts:
                logger.warning("Total post less than expected")

        # stats
        scraped_posts = len(result.data.data)
        scraped_comments = sum(len(post.comments) for post in result.data.data.values())

        logger.info(f"Total post scraped: {scraped_posts}")
        logger.info(f"Total comments scraped: {scraped_comments}")
        return result
    except Exception as e:
        logger.error(str(e).split("\n")[0])
        return Dataset()

In [205]:
tiktok_dataset = scraping_tiktok("lanmalajah.id", max_posts=5)

[2024-10-25 16:23:46.845] [INFO] [show_first_post_tiktok] args: (), kwargs: {}
[2024-10-25 16:23:46.846] [INFO] [_await_for_captcha_resolved_tiktok] args: (), kwargs: {}
[2024-10-25 16:23:57.101] [INFO] CAPTCHA resolved
[2024-10-25 16:23:57.510] [INFO] [_await_for_captcha_resolved_tiktok] args: (), kwargs: {}
[2024-10-25 16:23:57.656] [INFO] CAPTCHA resolved
[2024-10-25 16:23:57.657] [INFO] [get_caption_tiktok] args: (), kwargs: {}
[2024-10-25 16:23:57.658] [INFO] [_await_for_captcha_resolved_tiktok] args: (), kwargs: {}
[2024-10-25 16:23:57.754] [INFO] CAPTCHA resolved
[2024-10-25 16:23:57.860] [INFO] [get_comments_tiktok] args: (), kwargs: {}
[2024-10-25 16:23:57.860] [INFO] [_await_for_captcha_resolved_tiktok] args: (), kwargs: {}
[2024-10-25 16:23:58.030] [INFO] CAPTCHA resolved
[2024-10-25 16:24:00.205] [INFO] Total comment scraped: 12
[2024-10-25 16:24:00.205] [INFO] [_await_for_captcha_resolved_tiktok] args: (), kwargs: {}
[2024-10-25 16:24:00.424] [INFO] CAPTCHA resolved
[2024-

In [206]:
tiktok_dataset.data.model_dump()

{'data': {'https://www.tiktok.com/@lanmalajah.id/video/7428950379175496966': {'caption': 'Wenten gatra becik semeton, sarengin nggih 😇🙏',
   'comments': ['mntap',
    'Ikut mb',
    'daftar siki',
    '🥰',
    'ikut geg 😁',
    'milu',
    'mbok ayu cantik salam rahayu cantik',
    '🙏🙏🙏',
    'rahayu mbok. dimogi state ngemolihan kerahayuan',
    'klo ngomong bali halus pke subtittle donk biar ngerti',
    'ikut',
    'Mbok, niki yang dados daftar sane sampun numbas buku elektronik manten nggih?🥺']},
  'https://www.tiktok.com/@lanmalajah.id/video/7425981264143928581': {'caption': 'Ngamargiang swadharma ring krama, nyarengin Kecamatan Denpasar Timur. Matur suksma, dumogi sida state mapikenoh 😇🙏🏻',
   'comments': ['🙏', 'rahayu🙏']},
  'https://www.tiktok.com/@lanmalajah.id/video/7423733793925696774': {'caption': 'Sapunapi semeton? durus komen nggih 😇  #bali  #budaya  #bahasabali  #belajar  ',
   'comments': ['swastyastu, mbok.\nampura niki, yening dados nunas tata cara sane patut nyobahay

In [207]:
current_dataset = Dataset.from_json("dataset.json")
current_dataset.data.data.update(tiktok_dataset.data.data)
current_dataset.to_json("dataset.json")