In [193]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
import time
from dotenv import load_dotenv
import sqlite3
import os
import random
from assets import HEADLESS_OPTIONS, USER_AGENTS

In [194]:
def create_table(con, cursor):
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS tiktok_posts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                video_url TEXT UNIQUE,
                video_caption TEXT,
                author_username TEXT,
                keyword TEXT,
                hashtag TEXT)
    ''')

    cursor.execute('''
    CREATE TABLE IF NOT EXISTS authors (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE,
                follower_count REAL,
                following_count REAL,
                like_count REAL)
    ''')
    con.commit()

In [195]:
def scroll_to_bottom(driver):
    prev_height = driver.execute_script("return document.body.scrollHeight;")
    count = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2.1, 2.8))
        new_height = driver.execute_script("return document.body.scrollHeight;")
        if new_height == prev_height:
            count += 1
            if count == 3:
                break
        prev_height = new_height

In [196]:
def detect_captcha_and_wait(driver):
    try:
        captcha = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="TUXModal captcha-verify-container"]'))
        )
        if captcha:
            print("Captcha found")
            time.sleep(30)
    except:
        pass

In [197]:
def scrape_keywords(keywords, driver, con, cursor):
    print("Scraping Keywords")
    search_placeholder = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Search"]'))
    )
    search_placeholder.click()
    for keyword in keywords:
        time.sleep(5)
        try:
            clear_button = driver.find_element(By.XPATH, '//div[@data-e2e="reset-search-form"]')
            clear_button.click()
        except:
            pass
        search_placeholder.send_keys(keyword)
        search_placeholder.send_keys(Keys.RETURN)
        time.sleep(15)

        detect_captcha_and_wait(driver)

        video_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@aria-controls="tabs-0-panel-search_video"]'))
        )
        video_button.click()

        scroll_to_bottom(driver)

        video_elements = driver.find_elements(By.XPATH, '//div[@class="css-1soki6-DivItemContainerForSearch e19c29qe19"]')
        for video_element in video_elements:
            video_url = video_element.find_element(By.XPATH, './/a[contains(@class, "AVideoContainer")]').get_attribute('href')
            is_existing_video = cursor.execute('SELECT * FROM tiktok_posts WHERE video_url = ?', (video_url,)).fetchone()
            if is_existing_video:
                continue
            try:
                video_caption = video_element.find_element(By.XPATH, './/h1/span[1]').text
            except:
                video_caption = None
            try:
                author_username = video_element.find_element(By.XPATH, './/div/a[starts-with(@href, "/@")]').text
            except:
                author_username = None
            is_existing_author = cursor.execute('SELECT * FROM authors WHERE username = ?', (author_username,)).fetchone()
            try:
                if not is_existing_author:
                    cursor.execute('INSERT INTO authors (username) VALUES (?)', (author_username,))
                    # con.commit()
                cursor.execute('INSERT INTO tiktok_posts (video_url, video_caption, author_username, keyword) VALUES (?, ?, ?, ?)', (video_url, video_caption, author_username, keyword))
            except:
                print("Unable to insert", "author_username", author_username, "video_url", video_url)
                pass
    con.commit()
        # break

In [198]:
def scrape_hashtags(hashtags, main_url, driver, con, cursor):
    print("Scraping hashtags")
    for hashtag in hashtags:
        driver.get(main_url + f'tag/{hashtag.replace("#", "")}')
        time.sleep(5)
        detect_captcha_and_wait(driver)

        scroll_to_bottom(driver)

        video_elements = driver.find_elements(By.XPATH, '//div[@class="css-x6y88p-DivItemContainerV2 e19c29qe17"]')
        for video_element in video_elements:
            video_url = video_element.find_element(By.XPATH, './/a[contains(@class, "AVideoContainer")]').get_attribute('href')
            is_existing_video = cursor.execute('SELECT * FROM tiktok_posts WHERE video_url = ?', (video_url,)).fetchone()
            if is_existing_video:
                continue
            try:
                video_caption = video_element.find_element(By.XPATH, './/h1/span[1]').text
            except:
                video_caption = None
            try:
                author_username = video_element.find_element(By.XPATH, './/p[contains(@class, "user-name")]').text
            except:
                author_username = None
            is_existing_author = cursor.execute('SELECT * FROM authors WHERE username = ?', (author_username,)).fetchone()
            try:
                if not is_existing_author:
                    cursor.execute('INSERT INTO authors (username) VALUES (?)', (author_username,))
                    # con.commit()
                cursor.execute('INSERT INTO tiktok_posts (video_url, video_caption, author_username, hashtag) VALUES (?, ?, ?, ?)', (video_url, video_caption, author_username, hashtag))
            except:
                print("Unable to insert", "author_username", author_username, "video_url", video_url)
                pass
    con.commit()
        # break

In [199]:
def scrape_authors(main_url, driver, con, cursor):
    print("Scraping authors")
    authors = cursor.execute('SELECT username FROM authors ORDER BY username').fetchall()
    authors = [author[0] for author in authors]
    # print(authors)
    for author in authors:
        driver.get(main_url + f'@{author}')
        time.sleep(2)
        try:
            following_count = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//strong[@title="Following"]'))
            ).text
            # following_count = driver.find_element(By.XPATH, '//strong[@title="Following"]').text
            if 'M' in following_count:
                following_count = float(following_count.replace('M', '')) * 1000000
            elif 'K' in following_count:
                following_count = float(following_count.replace('K', '')) * 1000
            else:
                following_count = float(following_count)

        except Exception as e:
            print(e)
            following_count = None
        try:
            follower_count = driver.find_element(By.XPATH, '//strong[@title="Followers"]').text
            if 'M' in follower_count:
                follower_count = float(follower_count.replace('M', '')) * 1000000
            elif 'K' in follower_count:
                follower_count = float(follower_count.replace('K', '')) * 1000
            else:
                follower_count = float(follower_count)
        except:
            follower_count = None
        try:
            like_count = driver.find_element(By.XPATH, '//strong[@title="Likes"]').text
            if 'M' in like_count:
                like_count = float(like_count.replace('M', '')) * 1000000
            elif 'K' in like_count:
                like_count = float(like_count.replace('K', '')) * 1000
            else:
                like_count = float(like_count)
        except:
            like_count = None

        try:
            cursor.execute('UPDATE authors SET following_count = ?, follower_count = ?, like_count = ? WHERE username = ?', (following_count, follower_count, like_count, author))
            # con.commit()
        except:
            print("Unable to update", "author", author)
            pass
        # break
    con.commit()
        # break

In [200]:
def like_comment(driver):

    video_urls = [
        'https://www.tiktok.com/@travelwith_sagor/video/7418896636786658567?q=beautiful%20destinations&t=1728116286598',
        'https://www.tiktok.com/@misschocolate67/video/7333254822818270466?q=beautiful%20destinations&t=1728116286598'
    ]
    login_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//button[@id="header-login-button"]'))
    )
    login_button.click()
    login_with_phone_email_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//div[text()="Use phone / email / username"]'))
    )
    login_with_phone_email_button.click()
    login_with_email_button = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//a[@href="/login/phone-or-email/email"]'))
    )
    login_with_email_button.click()
    username_input = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//input[@placeholder="Email or username"]'))
    )
    username_input.send_keys(os.getenv('TIKTOK_USERNAME'))
    password_input = driver.find_element(By.XPATH, '//input[@placeholder="Password"]')
    password_input.send_keys(os.getenv('TIKTOK_PASSWORD'))

    login_button = driver.find_element(By.XPATH, '//button[@data-e2e="login-button"]')
    login_button.click()

    time.sleep(10)
    for video_url in video_urls:
        driver.get(video_url)
        time.sleep(5)
        like_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//button[contains(@aria-label, "Likes")]'))
        )
        like_button.click()
        time.sleep(5)


In [201]:
def find_top_influencers(con):
    top_influencers_df = pd.read_sql_query('SELECT * FROM authors WHERE follower_count >= 100000 AND like_count > 1000000', con)
    top_influencers_df.to_csv('top_influencers.csv', index=False)

In [202]:
def main():
    load_dotenv()
    # Initialize keywords and hashtags
    keywords = [
        "beautiful destinations",
        "places to visit",
        "places to travel",
        "places that don't feel real",
        "travel hacks"
    ]

    hashtags = [
        "#traveltok",
        "#wanderlust",
        "#backpackingadventures",
        "#luxurytravel",
        "#hiddengems",
        "#solotravel",
        "#roadtripvibes",
        "#travelhacks",
        "#foodietravel",
        "#sustainabletravel"
    ]
    # Create database connection
    con = sqlite3.connect("data.db")
    cursor = con.cursor()
    create_table(con, cursor)

    # Initialize the WebDriver
    service = Service(ChromeDriverManager().install())
    options = Options()
    for option in HEADLESS_OPTIONS:
        options.add_argument(option)
    options.add_argument(f'user-agent={random.choice(USER_AGENTS)}')

    driver = webdriver.Chrome(service=service, options=options)
    driver.maximize_window()

    main_url = 'https://www.tiktok.com/'
    driver.get(main_url)
    time.sleep(5)

    # scrape_keywords(keywords, driver, con, cursor)

    # scrape_hashtags(hashtags, main_url, driver, con, cursor)

    scrape_authors(main_url, driver, con, cursor)

    # like_comment(driver)

    find_top_influencers(con)

    # driver.quit()



In [203]:
main()

Scraping authors
Message: 
Stacktrace:
0   chromedriver                        0x000000010ceacd18 chromedriver + 4996376
1   chromedriver                        0x000000010cea45da chromedriver + 4961754
2   chromedriver                        0x000000010ca47d10 chromedriver + 388368
3   chromedriver                        0x000000010ca9430f chromedriver + 701199
4   chromedriver                        0x000000010ca943f1 chromedriver + 701425
5   chromedriver                        0x000000010cad9464 chromedriver + 984164
6   chromedriver                        0x000000010cab89dd chromedriver + 850397
7   chromedriver                        0x000000010cad6a00 chromedriver + 973312
8   chromedriver                        0x000000010cab8753 chromedriver + 849747
9   chromedriver                        0x000000010ca87635 chromedriver + 648757
10  chromedriver                        0x000000010ca87e5e chromedriver + 650846
11  chromedriver                        0x000000010ce73000 chromedri