In [6]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests

In [7]:
import os
import random
import time

from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import (
    MoveTargetOutOfBoundsException,
    TimeoutException,
    WebDriverException,
)

import chromedriver_binary

In [3]:
os.makedirs('kmdb/', exist_ok=True)

In [4]:
def open_browser():
    """
    Opens a new automated browser window with all tell-tales of automated browser disabled
    """
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    
    # remove all signs of this being an automated browser
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    # open the browser with the new options
    driver = webdriver.Chrome(options=options)
    return driver

In [5]:
driver = open_browser()

In [6]:
# setting up the url as one allowing to enter straight onto the politcs forum of the website
url = 'https://www.kmdb.or.kr/main'
driver.get(url)

In [7]:
magnifier = driver.find_element(By.XPATH,
                               './/button[@id="search-btn-m"]')
magnifier.click()

In [8]:
search_box = driver.find_element(
    By.XPATH, 
    './/input[@id="headerDesktopInput"]'
)

In [9]:
search_term = '조선족'
search_box.send_keys(search_term)

In [10]:
def press_enter(driver):
    """
    Sends the ENTER to a webdriver instance.
    """
    actions = ActionChains(driver)
    actions.send_keys(Keys.ENTER)
    actions.perform()

In [11]:
press_enter(driver)

In [12]:
show_more = driver.find_element(By.XPATH, 
                               './/a[@class="iMore1"]')
show_more.click()

In [13]:
page_source = driver.page_source

# Save the page source to a file
with open('kmdb/page_source.html', 'w', encoding='utf-8') as file:
    file.write(page_source)

In [14]:
def get_results_on_page(driver, fn_out):
    """
    Scrolls to load all listings and then saves them to `fn_out`.
    If you found a better approach, replace this function
    """
    N = 0
    while True:
        # get all the listings, and scroll to the last one, then wait two seconds.
        posts = driver.find_elements(By.XPATH, './/li[@class="detail-box"]')
        last_post = posts[-1]

        # you can use selenium to issue JavaScript commands:
        driver.execute_script("arguments[0].scrollIntoView();", last_post)
        N_posts = len(posts)
        if N_posts == N:
            break
        N = N_posts
        time.sleep(2)
        
    # how to save what the emulator sees
    with open(fn_out, 'w') as f:
        f.write(driver.page_source)

In [18]:
    fn_out = 'kmdb/kmbd_choshun_page_1.html'
    get_results_on_page(driver, fn_out)
    next_page = driver.find_element(By.XPATH, ".//a[@href = \"javascript:goPage('10');\"]")
    next_page.click()
    fn_out = 'kmdb/kmbd_choshun_page_2.html'
    get_results_on_page(driver, fn_out)
    next_page = driver.find_element(By.XPATH, ".//a[@href = \"javascript:goPage('20');\"]")
    next_page.click()
    fn_out = 'kmdb/kmbd_choshun_page_3.html'
    get_results_on_page(driver, fn_out)
    next_page = driver.find_element(By.XPATH, ".//a[@href = \"javascript:goPage('30');\"]")
    next_page.click()
    fn_out = 'kmdb/kmbd_choshun_page_4.html'
    get_results_on_page(driver, fn_out)
    next_page = driver.find_element(By.XPATH, ".//a[@href = \"javascript:goPage('40');\"]")
    next_page.click()
    fn_out = 'kmdb/kmbd_choshun_page_5.html'
    get_results_on_page(driver, fn_out)
    next_page_button = driver.find_element(By.XPATH,
                                              './/a[@class="btn next"]')
    next_page_button.click()
    fn_out = 'kmdb/kmbd_choshun_page_6.html'
    get_results_on_page(driver, fn_out)

In [19]:
driver.close()

### To parse and extract data from multiple HTML files using BeautifulSoup, you can follow these steps:

1. Read the HTML files: Read the content of each HTML file and store it as a string.

2. Create BeautifulSoup objects: Convert each HTML string into a BeautifulSoup object.

3. Extract data: Use BeautifulSoup methods to navigate and extract data from each BeautifulSoup object.

Here's a Python code example to demonstrate this process:

```python
from bs4 import BeautifulSoup

# Assuming you have a list of file paths for the HTML files
html_files = [
    'file1.html',
    'file2.html',
    'file3.html',
    # Add more file paths here as needed
]

def extract_data_from_html(html_content):
    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract and process data from the BeautifulSoup object
    # Modify this section based on the structure of your HTML files
    # For example:
    # data = soup.find('div', class_='content').text
    # return data

# Loop through each HTML file and process its content
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        data = extract_data_from_html(html_content)
        # Do something with the extracted data, e.g., store it in a list or database
```

In the above code, we first define a function `extract_data_from_html` to extract the required data from each BeautifulSoup object. You will need to modify this function based on the specific structure of your HTML files.

Then, we loop through each HTML file, read its content, create a BeautifulSoup object for each file, and call the `extract_data_from_html` function to extract the data.

Please note that the exact parsing and data extraction will depend on the structure of your HTML files and the specific data you want to extract. Modify the `extract_data_from_html` function according to the HTML structure and data you are interested in.

In [22]:
import os
from bs4 import BeautifulSoup

# Folder path containing the HTML files
folder_path = 'kmdb'

# Assuming you have a list of file paths for the HTML files
html_files = []
for p in range(1,7):
    html = f'kmbd_choshun_page_{p}.html'
    html_files.append(html)

def extract_data_from_html(html_content):
    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    big_box = soup.select("ul")
    detail_boxes = big_box.select("li")
    box1 = detail_boxes.select(".details")
    for b in box1:
        return
        eng_title = b.select("a div.movie-tt-eng searchFont").text
        genre = b.select("div a span").text
        etc = b.select("div span").text
        
# Get a list of file names in the 'kmdb' folder
html_files = [f for f in os.listdir(folder_path) if f.endswith('.html')]

# Loop through each HTML file and process its content
for file_path in html_files:
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        data = extract_data_from_html(html_content)
        # Do something with the extracted data, e.g., store it in a list or database


FileNotFoundError: [Errno 2] No such file or directory: 'kmbd_choshun_page_4.html'

### Yes, there is an issue with your `extract_data_from_html` function. The problem is that you are returning immediately after entering the `for` loop, so the function will terminate without executing any code after the `return` statement. As a result, nothing will be returned from the function.

To fix this, you need to remove the `return` statement that appears at the beginning of the `for` loop. Instead, you should create a data structure (e.g., a list or dictionary) to store the extracted data from each iteration of the loop and return that data structure after the loop completes.

Here's the updated `extract_data_from_html` function:

```python
def extract_data_from_html(html_content):
    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    big_box = soup.select("ul")
    detail_boxes = big_box.select("li")
    
    # Create a list to store the extracted data from each detail box
    extracted_data = []
    
    for b in detail_boxes:
        # Extract data from each detail box
        eng_title = b.select("a div.movie-tt-eng.searchFont").text.strip()
        genre = b.select("div a span").text.strip()
        etc = b.select("div span").text.strip()
        
        # Create a dictionary to store the extracted data from this detail box
        data_dict = {
            'eng_title': eng_title,
            'genre': genre,
            'etc': etc
        }
        
        # Append the data_dict to the extracted_data list
        extracted_data.append(data_dict)
    
    # Return the list containing all the extracted data from the HTML
    return extracted_data
```

With this updated function, it will now properly extract and store the data from each detail box in the `extracted_data` list. The `extracted_data` list will be returned as the result of the function, and you can use it to store or process the data as needed in the loop that calls this function.

In [38]:
import os
from bs4 import BeautifulSoup

def extract_data_from_html(html_content):
    # Create a BeautifulSoup object
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find all the li elements with class "detail-box"
    detail_boxes = soup.select("li.detail-box")
    
    # Create a list to store the extracted data from each detail box
    extracted_data = []
    
    # Loop through each li element (detail box)
    for box in detail_boxes:
        eng_title = box.select_one("a div.movie-tt-eng").text.strip()
        genre_elements = box.select("div.national span")
        genre = genre_elements[0].text.strip() if genre_elements else ""
        year_elements = box.select("div.year span")
        year = year_elements[1].text.strip() if year_elements else ""
        duration_element = box.select_one("div.year span:last-of-type")
        duration = duration_element.text.strip() if duration_element else "" 
        director_element = box.select_one("div.director a")
        director = director_element.text.strip() if director_element else ""
        yd = box.select("div.year span")
        year2 = yd[0].text.strip()
        duration2 = yd[1].text.strip()



    # Create a dictionary to store the extracted data from this detail box
        data_dict = {
            'eng_title': eng_title,
            'genre': genre,
            'year': year,
            'duration': duration,
        }
        
        # Append the data_dict to the extracted_data list
        extracted_data.append(data_dict)
    
    # Return the list containing all the extracted data from the HTML
    return extracted_data


# Folder path containing the HTML files
folder_path = os.path.abspath('kmdb')

# Get a list of file names in the 'kmdb' folder
html_files = [f for f in os.listdir(folder_path) if f.endswith('.html')]

# Loop through each HTML file in the 'kmdb' folder and process its content
for file_name in html_files:
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        data = extract_data_from_html(html_content)
        # Do something with the extracted data, e.g., store it in a list or database


1. The `select` and `select_one` methods in BeautifulSoup are used to find elements that match a specified CSS selector. `select` returns a list of all elements that match the selector, while `select_one` returns only the first element that matches the selector. These methods are helpful when dealing with complex HTML structures where you need to find specific elements based on their class names, tags, or other attributes.

2. In the provided HTML structure, there are multiple `<span>` elements with class "national" and "country". For example, `<span>다큐멘터리</span>` and `<span>대한민국</span>` are two separate `<span>` elements with class "national". Similarly, there are other elements with the same class name. When using `box.select("div.national span")`, it returns a list of all the `<span>` elements that match the selector. To extract the text content of these elements, we use list indexing like `[0]` to access the first element, which corresponds to the genre, and `[1]` to access the second element, which corresponds to the country. The list indexing allows us to target specific elements from the list of matched elements.

3. The `if genre_elements else ""` is a ternary operator in Python. It is used to provide a default value when `genre_elements` is an empty list. If `genre_elements` is not empty, it will take the value of `genre_elements[0].text.strip()`, which represents the text content of the first element in the list. If `genre_elements` is an empty list, it will take the value of an empty string `""`. This is useful to avoid errors when trying to access elements from an empty list.

4. The selector `"div.year span:nth-of-type(2)"` is used to target the second `<span>` element within the `<div>` element with class "year". The `nth-of-type(2)` is a CSS pseudo-class selector that matches the second occurrence of the specified element type within its parent. In this case, it is used to target the second `<span>` element inside the `<div>` with class "year". This second `<span>` element contains the duration information, which is extracted using `box.select_one("div.year span:nth-of-type(2)")`.

### Apologies for the confusion in my previous response. I understand now why you encountered an `IndexError`. It seems that the HTML structure you provided is different from what I assumed. You can use list indexing to access the elements, but the issue is that there are only two `<span>` elements inside the `<div class="year">`, not four as I initially mentioned.

To extract the year and duration correctly using list indexing, you should use index `0` for the year and index `1` for the duration, like this:

```python
yd = box.select("div.year span")
year = yd[0].text.strip()
duration = yd[1].text.strip()
```

The reason you encountered an `IndexError` is that you attempted to access the third and fourth elements of the `yd` list using index `3`, which don't exist in this case.

Regarding your question about `nth-of-type(2)`, it is a CSS pseudo-class that can be used to select the second element of a particular type within its parent container. For example, if you had multiple `<div class="year">` elements on the page, and you wanted to select the second one and then access its child `<span>` elements, you could use `nth-of-type(2)` as follows:

```python
second_year_div = b.select("div.year:nth-of-type(2)")
yd = second_year_div[0].select("span")
year = yd[0].text.strip()
duration = yd[1].text.strip()
```

However, based on the HTML structure you provided, it seems that using `div.year span` with list indexing should work fine to access the year and duration elements.

사실 이거 안 되고... 안 되는 이유는 `span`택이 얼핏 4개로 보여도 사실 첫번째, 세번째 태그는 `class="comma'`라서 쉼표로 생각됨. List indexing으로 접근되긴 되는데 사실 `span`이 2개 뿐인거라 [0], [1] 이렇게로만 접근할 수 있고 그러면 결과는 year:',', duration:2017년 이렇게밖에 안 나옴.. 그래서 무조건 `nth-of-type(2)` 이거 써야 함

In [39]:
data

[{'eng_title': 'Pollock (Myeong-tae)',
  'genre': '극영화',
  'year': '2017년',
  'duration': '23분'},
 {'eng_title': 'Sea Fog (Hae-mu)',
  'genre': '극영화',
  'year': '2014년',
  'duration': '110분'},
 {'eng_title': 'Night Journey ( Yahaeng )',
  'genre': '극영화',
  'year': '2014년',
  'duration': '14분'},
 {'eng_title': 'Sleeping Beauty',
  'genre': '극영화',
  'year': '2007년',
  'duration': '109분'},
 {'eng_title': 'The Abortion',
  'genre': '극영화',
  'year': '2011년',
  'duration': '32분'},
 {'eng_title': 'FAREWELL (Jak-byul-deul)',
  'genre': '극영화',
  'year': '2011년',
  'duration': '93분'},
 {'eng_title': 'Tuning Fork',
  'genre': '극영화',
  'year': '2014년',
  'duration': '105분'},
 {'eng_title': 'PHISHING (Pisingjakjeon)',
  'genre': '극영화',
  'year': '2021년',
  'duration': '20분'}]