# 패키지 설치

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning None
# into os.environ raises an opaque "TypeError: str expected, not NoneType".
# The key itself is never printed, so it cannot leak into the cell output.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if _openai_api_key is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the .env file or the environment."
    )
os.environ['OPENAI_API_KEY'] = _openai_api_key

In [2]:
# print('hello world')
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.runnables import ConfigurableField
from langchain_community.vectorstores.faiss import FAISS

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [3]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

# Chat model used for a quick connectivity check, with temperature fixed at 0.0.
llm = ChatOpenAI(
    temperature=0.0,
    model="gpt-3.5-turbo",
)

# One-variable template; {question} is substituted when the prompt is formatted.
prompt = PromptTemplate.from_template("Question: {question}\nAnswer:")

Question = '당신은 누구십니까?'

# Render the template to a plain string first, then send that string to the model.
rendered_prompt = prompt.format(question=Question)
response = llm.invoke(rendered_prompt)

In [4]:
response

AIMessage(content='저는 인공지능 챗봇입니다.', response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 25, 'total_tokens': 40}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-4d465ffc-8806-4f67-bedb-2b6fa95c36bc-0', usage_metadata={'input_tokens': 25, 'output_tokens': 15, 'total_tokens': 40})

In [5]:
import requests

# Raw GitHub URLs of the helper modules, keyed by the local filename to save them under.
urls = {
    "namu_crawler.py":"https://raw.githubusercontent.com/lymanstudio/proj_artist_info_gen/main/namu_crawler.py",
    "namu_loader.py":"https://raw.githubusercontent.com/lymanstudio/proj_artist_info_gen/main/namu_loader.py"
}

# NOTE(review): the saved files are imported (and therefore executed) by later
# cells. They are fetched from the moving `main` branch; consider pinning a
# commit hash so the executed code cannot change unnoticed.
for key, val in urls.items():
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(val, timeout=30)
    # Stop on HTTP errors (404, 5xx, ...) instead of silently saving the error
    # page as a .py module that would fail later at import time.
    r.raise_for_status()

    # Save the downloaded file into the current directory.
    with open(key, "w", encoding='utf-8') as file:
        file.write(r.text)


In [1]:
# Import the crawler class from the downloaded module (namu_crawler.py).
from namu_crawler import NamuCrawler

# Create a NamuCrawler instance for the EXO-CBX page and call its methods.
# NOTE(review): the meaning of the second argument (0) is defined in
# namu_crawler.py, which is not visible here -- confirm before changing it.
my_instance = NamuCrawler("https://namu.wiki/w/EXO-CBX", 0)
my_instance.construct_toc()
# Last expression of the cell: its value is displayed as the cell output.
my_instance.get_doc_title()

'EXO-CBX'

In [5]:
from namu_loader import NamuLoader

# Build a loader for the EXO-CBX page.
# NOTE(review): the meaning of the second argument (0) is defined in
# namu_loader.py, which is not visible here -- confirm before changing it.
my_loader = NamuLoader("https://namu.wiki/w/EXO-CBX",0)
# `docs` is iterated by a later cell, which reads `.page_content` and
# `.metadata` from each element.
# NOTE(review): if lazy_load() returns a one-shot iterator/generator, `docs`
# can be consumed only once and this cell must be re-run before iterating
# again -- confirm against namu_loader.py.
docs = my_loader.lazy_load()

In [11]:
import os
import json
import time
from namu_loader import NamuLoader
import textwrap

from google.cloud import bigquery

import openai
from langchain_community.embeddings import OpenAIEmbeddings

In [12]:
!gcloud auth application-default login
# %pip install --upgrade google-cloud-bigquery pandas db-dtypes

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=Z8E0DTsvcp3fh75yJSskp1aMss9TEV&access_type=offline&code_challenge=YAsFxyFMWRph8AWZVu8h6BHJt2Cw3K48RhUWeZQ714w&code_challenge_method=S256


Credentials saved to file: [C:\Users\sanghyoon\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "wev-dev-analytics" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


In [13]:
# BigQuery
def load_data_to_bigquery(client, json_data, project_id, dataset_id, table_id, region, write_disposition, artist_info, page_url):
    """Load crawled document rows into a BigQuery table and wait for the job.

    Each input item becomes one row: ``abs_page_toc_item`` and ``toc_item``
    are copied out of the item's ``metadata`` dict to top-level keys,
    ``metadata`` itself is serialized to a JSON string, and the caller-supplied
    ``artist_info`` / ``page_url`` values are attached.

    The rows are built as copies, so ``json_data`` is left unmodified and the
    same list can safely be loaded again. (Rewriting the items in place turned
    ``metadata`` into a string, which made a second call on the same data fail
    when indexing ``item['metadata']['abs_page_toc_item']``.)

    Args:
        client: BigQuery client providing ``dataset()`` and
            ``load_table_from_json()``.
        json_data: Sized iterable of dicts; each must hold a ``metadata`` dict
            with the keys ``abs_page_toc_item`` and ``toc_item``.
        project_id: Project that owns the destination dataset.
        dataset_id: Destination dataset id.
        table_id: Destination table id.
        region: Location passed to the load job.
        write_disposition: Write disposition set on the load job config
            (e.g. append vs. truncate).
        artist_info: Value stored in every row's ``artist_info`` field; it does
            not come from the crawled data, so the caller supplies it.
        page_url: Value stored in every row's ``page_url`` field; supplied by
            the caller for the same reason.

    Raises:
        KeyError: If an item lacks ``metadata`` or one of the two TOC keys.
    """
    rows = []
    for item in json_data:
        metadata = item['metadata']
        rows.append({
            **item,
            'abs_page_toc_item': metadata['abs_page_toc_item'],
            'toc_item': metadata['toc_item'],
            # metadata contains Korean text, so ensure_ascii=False keeps it
            # readable instead of escaping it to \uXXXX sequences.
            'metadata': json.dumps(metadata, ensure_ascii=False),
            'artist_info': artist_info,
            'page_url': page_url,
        })

    table_ref = client.dataset(dataset_id, project=project_id).table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = write_disposition

    load_job = client.load_table_from_json(
        rows, table_ref, location=region, job_config=job_config
    )

    # Block until the load job finishes; a failed job raises here.
    load_job.result()
    print(f'Loaded {len(rows)} rows into {project_id}:{dataset_id}.{table_id}')

In [14]:
# Flatten the loaded docs into plain dicts that can be sent to BigQuery as JSON rows.
documents = [
    {
        "page_content": doc.page_content,
        "metadata": doc.metadata,
    }
    for doc in docs
]

# Target GCP project, region, dataset and table for this run.
PROJECT_ID = "wev-dev-analytics"
REGION = "asia-northeast3"
DATASET_ID = "namu_wiki"
TABLE_ID = "namu_string_result_03"

client = bigquery.Client(project=PROJECT_ID)

# Schema of the destination table; every column is stored as STRING.
schema = [
    bigquery.SchemaField("page_url", "STRING"),
    bigquery.SchemaField("artist_info", "STRING"),
    bigquery.SchemaField("metadata", "STRING"),
    bigquery.SchemaField("page_content", "STRING"),
    bigquery.SchemaField("toc_item", "STRING"),
    bigquery.SchemaField("abs_page_toc_item", "STRING"),
    ]

dataset_ref = client.dataset(DATASET_ID)
dataset = bigquery.Dataset(dataset_ref)
dataset.location = REGION

# Create the dataset unless it already exists. exists_ok=True ignores only the
# "already exists" conflict, so authentication, permission or network errors
# surface as exceptions instead of being reported as "already exists".
client.create_dataset(dataset, exists_ok=True)
print(f"Dataset {DATASET_ID} is ready in {REGION}")

# Create the table unless it already exists (same reasoning as for the dataset).
table_ref = dataset_ref.table(TABLE_ID)
table = bigquery.Table(table_ref, schema=schema)
client.create_table(table, exists_ok=True)
print(f"Table {TABLE_ID} is ready in dataset {DATASET_ID}")

# Values stored in the artist_info / page_url columns of every loaded row.
ARTIST_INFO = 'EXO-CBX'
PAGE_URL = 'https://namu.wiki/w/EXO-CBX'

# WRITE_APPEND appends the rows to the table; WRITE_TRUNCATE overwrites the table.
load_data_to_bigquery(
    client,
    documents,
    PROJECT_ID,
    DATASET_ID,
    TABLE_ID,
    REGION,
    bigquery.WriteDisposition.WRITE_APPEND,
    ARTIST_INFO,
    PAGE_URL,
)

Dataset namu_wiki already exists in asia-northeast3
Created table namu_string_result_03 in dataset namu_wiki
Loaded 33 rows into wev-dev-analytics:namu_wiki.namu_string_result_03


In [2]:
html_snippet = """
<table class="wiki-table" data-dark-style="background-color:#1f2023; color:#fff;" data-v-76cbfa86="" style="background-color:#fff; color:#000; width:100%; border:2px solid #000;">
  <tbody data-v-76cbfa86="">
    <tr data-v-76cbfa86="" style="color:#fff;">
      <td data-v-76cbfa86="" style="background-color:#00d406; text-align:center;">
        <div class="wiki-paragraph" data-v-76cbfa86="">
          <strong data-v-76cbfa86="">날짜</strong>
        </div>
      </td>
      <td data-v-76cbfa86="" style="background-color:#01acd8; width:30%; text-align:center;">
        <div class="wiki-paragraph" data-v-76cbfa86="">
          <strong data-v-76cbfa86="">방송사</strong>
        </div>
      </td>
      <td data-v-76cbfa86="" style="background-color:#01acd8; width:30%; text-align:center;">
        <div class="wiki-paragraph" data-v-76cbfa86="">
          <strong data-v-76cbfa86="">곡명</strong>
        </div>
      </td>
      <td data-v-76cbfa86="" style="background-color:#ff090a; width:20%; text-align:center;">
        <div class="wiki-paragraph" data-v-76cbfa86="">
          <strong data-v-76cbfa86="">비고</strong>
        </div>
      </td>
    </tr>
    <tr data-v-76cbfa86="">
      <td colspan="4" data-v-76cbfa86="" rowspan="1" style="height:32px; text-align:center; vertical-align:top;">
        <div class="wiki-paragraph" data-v-76cbfa86="">
          <div data-v-76cbfa86="" style="margin:0px -10px -5px;word-break:keep-all">
            <dl class="wiki-folding" data-v-76cbfa86="">
              <dt data-v-76cbfa86=""> [ 2016년 ]</dt>
              <dd data-v-76cbfa86="">
                <div data-v-76cbfa86="" style="margin:-6px 0px -11px">
                  <div class="wiki-table-wrap" data-v-76cbfa86="" style="width:100%">
                    <table class="wiki-table" data-dark-style="background-color:#1f2023; color:#ddd;" data-v-76cbfa86="" style="background-color:#fff; color:#373a3c; width:100%;">
                      <tbody data-v-76cbfa86="">
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">10월 30일</div>
                          </td>
                          <td data-v-76cbfa86="" style="width:30%; text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">SBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/SBS%20%EC%9D%B8%EA%B8%B0%EA%B0%80%EC%9A%94" title="SBS 인기가요">SBS 인기가요</a>
                              <a class="wiki-fn-content" data-v-76cbfa86="" href="#fn-23" title="부산 원 아시아 페스티벌 특집">
                                <span data-v-76cbfa86="" id="rfn-23"></span>[23] </a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="width:30%; text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1206905" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1206905">너를 위해</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="width:20%; text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 3일</div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">M.net <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EC%97%A0%20%EC%B9%B4%EC%9A%B4%ED%8A%B8%EB%8B%A4%EC%9A%B4" title="엠 카운트다운">엠 카운트다운</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1217169" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1217169">The one</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <div data-v-76cbfa86="" style="margin:0px -10px">
                                <strong data-v-76cbfa86="">유닛 데뷔</strong>
                                <br data-v-76cbfa86="" />
                                <span class="wiki-size-down-2" data-v-76cbfa86="">
                                  <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/unbR8Y-90Fg" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/unbR8Y-90Fg">직캠_The one</a>
                                  <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/lUFqfsPTLfA" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/lUFqfsPTLfA">직캠_Hey Mama!</a>
                                  <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/uIEpUNx7tgk" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/uIEpUNx7tgk">미니팬미팅</a>
                                </span>
                              </div>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1217179" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1217179">Hey Mama!</a>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 4일</div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">KBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EB%AE%A4%EC%A7%81%EB%B1%85%ED%81%AC" title="뮤직뱅크">뮤직뱅크</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/GhXlttJ4rI4" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/GhXlttJ4rI4">The one</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/8BDqxQXTC88" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/8BDqxQXTC88">Hey Mama!</a>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 5일</div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">MBC <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EC%87%BC!%20%EC%9D%8C%EC%95%85%EC%A4%91%EC%8B%AC" title="쇼! 음악중심">쇼! 음악중심</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1221167" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1221167">The one</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <span class="wiki-size-down-2" data-v-76cbfa86="">
                                <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/lUFqfsPTLfA" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/lUFqfsPTLfA">직캠_Hey Mama!</a>
                                <a class="wiki-link-external" data-v-76cbfa86="" href="https://www.vlive.tv/video/16354" rel="nofollow noopener ugc" target="_blank" title="https://www.vlive.tv/video/16354">5분 딜레이</a>
                                <a class="wiki-fn-content" data-v-76cbfa86="" href="#fn-24" title="59분에 등장.">
                                  <span data-v-76cbfa86="" id="rfn-24"></span>[24] </a>
                              </span>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1221168" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1221168">Hey Mama!</a>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 6일</div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">SBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/SBS%20%EC%9D%B8%EA%B8%B0%EA%B0%80%EC%9A%94" title="SBS 인기가요">SBS 인기가요</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1222326" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1222326">The one</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" rowspan="2" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1222320" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1222320">Hey Mama!</a>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 11일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">KBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EB%AE%A4%EC%A7%81%EB%B1%85%ED%81%AC" title="뮤직뱅크">뮤직뱅크</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/z6DxhRpuTBs" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/z6DxhRpuTBs">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 12일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">MBC <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EC%87%BC!%20%EC%9D%8C%EC%95%85%EC%A4%91%EC%8B%AC" title="쇼! 음악중심">쇼! 음악중심</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1235696" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1235696">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <span class="wiki-size-down-2" data-v-76cbfa86="">
                                <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/fYLx7ds16ec" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/fYLx7ds16ec">직캠</a>
                                <a class="wiki-link-external" data-v-76cbfa86="" href="https://www.vlive.tv/video/16720" rel="nofollow noopener ugc" target="_blank" title="https://www.vlive.tv/video/16720">5분 딜레이</a>
                                <a class="wiki-fn-content" data-v-76cbfa86="" href="#fn-25" title="1시간 5분 40초에 등장.">
                                  <span data-v-76cbfa86="" id="rfn-25"></span>[25] </a>
                              </span>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 13일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">SBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/SBS%20%EC%9D%B8%EA%B8%B0%EA%B0%80%EC%9A%94" title="SBS 인기가요">SBS 인기가요</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1236728" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1236728">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 15일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">SBS MTV <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EB%8D%94%20%EC%87%BC" title="더 쇼">더 쇼</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1242106" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1242106">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <strong data-v-76cbfa86="">1위</strong>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 18일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">KBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EB%AE%A4%EC%A7%81%EB%B1%85%ED%81%AC" title="뮤직뱅크">뮤직뱅크</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/VyqbhgVI4uc" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/VyqbhgVI4uc">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">11월 20일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">SBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/SBS%20%EC%9D%B8%EA%B8%B0%EA%B0%80%EC%9A%94" title="SBS 인기가요">SBS 인기가요</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1252548" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1252548">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">12월 24일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">MBC <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EC%87%BC!%20%EC%9D%8C%EC%95%85%EC%A4%91%EC%8B%AC" title="쇼! 음악중심">쇼! 음악중심</a>
                              <a class="wiki-fn-content" data-v-76cbfa86="" href="#fn-26" title="크리스마스 특집">
                                <span data-v-76cbfa86="" id="rfn-26"></span>[26] </a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1329319" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1329319">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">12월 29일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">KBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/KBS%20%EA%B0%80%EC%9A%94%EB%8C%80%EC%B6%95%EC%A0%9C" title="KBS 가요대축제">가요대축제</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/1340896" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/1340896">Hey Mama!</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                      </tbody>
                    </table>
                  </div>
                </div>
              </dd>
            </dl>
          </div>
        </div>
      </td>
    </tr>
    <tr data-v-76cbfa86="">
      <td colspan="4" data-v-76cbfa86="" rowspan="1" style="height:32px; text-align:center; vertical-align:top;">
        <div class="wiki-paragraph" data-v-76cbfa86="">
          <div data-v-76cbfa86="" style="margin:0px -10px -5px;word-break:keep-all">
            <dl class="wiki-folding" data-v-76cbfa86="">
              <dt data-v-76cbfa86=""> [ 2018년 ]</dt>
              <dd data-v-76cbfa86="">
                <div data-v-76cbfa86="" style="margin:-6px 0px -11px">
                  <div class="wiki-table-wrap" data-v-76cbfa86="" style="width:100%">
                    <table class="wiki-table" data-dark-style="background-color:#1f2023; color:#ddd;" data-v-76cbfa86="" style="background-color:#fff; color:#373a3c; width:100%;">
                      <tbody data-v-76cbfa86="">
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">4월 12일</div>
                          </td>
                          <td data-v-76cbfa86="" style="width:30%; text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">M.net <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EC%97%A0%20%EC%B9%B4%EC%9A%B4%ED%8A%B8%EB%8B%A4%EC%9A%B4" title="엠 카운트다운">엠 카운트다운</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="width:30%; text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/3030347" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/3030347">花요일 (Blooming Day)</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="width:20%; text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <strong data-v-76cbfa86="">컴백</strong>
                              <br data-v-76cbfa86="" />
                              <span class="wiki-size-down-2" data-v-76cbfa86="">
                                <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/sD4qIdQwfxs" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/sD4qIdQwfxs">직캠</a>
                                <div data-v-76cbfa86="" style="margin:0px -10px">
                                  <dl class="wiki-folding" data-v-76cbfa86="">
                                    <dt data-v-76cbfa86=""> [개인 직캠]</dt>
                                    <dd data-v-76cbfa86="">
                                      <span class="wiki-size-down-3" data-v-76cbfa86="">
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/CphibUNqUXs" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/CphibUNqUXs">시우민</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/aRS_D2UNKeI" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/aRS_D2UNKeI">백현</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/WxhAX0dtMXw" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/WxhAX0dtMXw">첸</a>
                                      </span>
                                    </dd>
                                  </dl>
                                </div>
                              </span>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">4월 13일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">KBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EB%AE%A4%EC%A7%81%EB%B1%85%ED%81%AC" title="뮤직뱅크">뮤직뱅크</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/BUjnqktA3tY" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/BUjnqktA3tY">花요일 (Blooming Day)</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <div data-v-76cbfa86="" style="margin:0px -10px">
                                <span class="wiki-size-down-2" data-v-76cbfa86="">
                                  <dl class="wiki-folding" data-v-76cbfa86="">
                                    <dt data-v-76cbfa86=""> [개인 직캠]</dt>
                                    <dd data-v-76cbfa86="">
                                      <span class="wiki-size-down-3" data-v-76cbfa86="">
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/3wNLHyseg24" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/3wNLHyseg24">시우민</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/Vs9BqzAlEHo" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/Vs9BqzAlEHo">백현</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/yyjV5H9-Cvo" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/yyjV5H9-Cvo">첸</a>
                                      </span>
                                    </dd>
                                  </dl>
                                </span>
                              </div>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">4월 14일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">MBC <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EC%87%BC!%20%EC%9D%8C%EC%95%85%EC%A4%91%EC%8B%AC" title="쇼! 음악중심">쇼! 음악중심</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/3040418" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/3040418">花요일 (Blooming Day)</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <span class="wiki-size-down-2" data-v-76cbfa86="">
                                <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/Vnd6aKsn2CQ" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/Vnd6aKsn2CQ">직캠</a>
                                <div data-v-76cbfa86="" style="margin:0px -10px">
                                  <dl class="wiki-folding" data-v-76cbfa86="">
                                    <dt data-v-76cbfa86=""> [개인 직캠]</dt>
                                    <dd data-v-76cbfa86="">
                                      <span class="wiki-size-down-3" data-v-76cbfa86="">
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/aK45XM7v8v8" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/aK45XM7v8v8">시우민</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/7b50BR9jUfU" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/7b50BR9jUfU">백현</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/ZMYz9WWyoLk" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/ZMYz9WWyoLk">첸</a>
                                      </span>
                                    </dd>
                                  </dl>
                                </div>
                              </span>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">4월 15일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">SBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/SBS%20%EC%9D%B8%EA%B8%B0%EA%B0%80%EC%9A%94" title="SBS 인기가요">SBS 인기가요</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/3043447" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/3043447">花요일 (Blooming Day)</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">4월 20일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">KBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EB%AE%A4%EC%A7%81%EB%B1%85%ED%81%AC" title="뮤직뱅크">뮤직뱅크</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/sU7xWdnHmFw" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/sU7xWdnHmFw">花요일 (Blooming Day)</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">4월 21일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">MBC <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/%EC%87%BC!%20%EC%9D%8C%EC%95%85%EC%A4%91%EC%8B%AC" title="쇼! 음악중심">쇼! 음악중심</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/3080769" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/3080769">花요일 (Blooming Day)</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <span class="wiki-size-down-2" data-v-76cbfa86="">
                                <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/wlQks_A6qFo" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/wlQks_A6qFo">직캠</a>
                                <div data-v-76cbfa86="" style="margin:0px -10px">
                                  <dl class="wiki-folding" data-v-76cbfa86="">
                                    <dt data-v-76cbfa86=""> [개인 직캠]</dt>
                                    <dd data-v-76cbfa86="">
                                      <span class="wiki-size-down-3" data-v-76cbfa86="">
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/jRIfjXToNgo" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/jRIfjXToNgo">시우민</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/wX_89NZRA5w" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/wX_89NZRA5w">백현</a>
                                        <a class="wiki-link-external" data-v-76cbfa86="" href="https://youtu.be/LxFrfHGGG6k" rel="nofollow noopener ugc" target="_blank" title="https://youtu.be/LxFrfHGGG6k">첸</a>
                                      </span>
                                    </dd>
                                  </dl>
                                </div>
                              </span>
                            </div>
                          </td>
                        </tr>
                        <tr data-v-76cbfa86="">
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">4월 22일</div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">SBS <a class="wiki-link-internal" data-v-76cbfa86="" href="/w/SBS%20%EC%9D%B8%EA%B8%B0%EA%B0%80%EC%9A%94" title="SBS 인기가요">SBS 인기가요</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:center;">
                            <div class="wiki-paragraph" data-v-76cbfa86="">
                              <a class="wiki-link-external" data-v-76cbfa86="" href="https://tv.naver.com/v/3083946" rel="nofollow noopener ugc" target="_blank" title="https://tv.naver.com/v/3083946">花요일 (Blooming Day)</a>
                            </div>
                          </td>
                          <td data-v-76cbfa86="" style="text-align:right;">
                            <div class="wiki-paragraph" data-v-76cbfa86=""></div>
                          </td>
                        </tr>
                      </tbody>
                    </table>
                  </div>
                </div>
              </dd>
            </dl>
          </div>
        </div>
      </td>
    </tr>
  </tbody>
</table>
"""

In [4]:
from bs4 import BeautifulSoup
# Parse the raw HTML snippet into a tree using the built-in html.parser backend.
soup = BeautifulSoup(html_snippet, 'html.parser')

# NOTE(review): transform_nested_table is defined in namu_crawler.py, which is
# not visible here; presumably it flattens nested tables - confirm.
my_instance.transform_nested_table(soup)


<table class="wiki-table" data-dark-style="background-color:#1f2023; color:#fff;" data-v-76cbfa86="" style="background-color:#fff; color:#000; width:100%; border:2px solid #000;">
<tbody data-v-76cbfa86="">
<tr data-v-76cbfa86="" style="color:#fff;">
<td data-v-76cbfa86="" style="background-color:#00d406; text-align:center;">
<div class="wiki-paragraph" data-v-76cbfa86="">
<strong data-v-76cbfa86="">날짜</strong>
</div>
</td>
<td data-v-76cbfa86="" style="background-color:#01acd8; width:30%; text-align:center;">
<div class="wiki-paragraph" data-v-76cbfa86="">
<strong data-v-76cbfa86="">방송사</strong>
</div>
</td>
<td data-v-76cbfa86="" style="background-color:#01acd8; width:30%; text-align:center;">
<div class="wiki-paragraph" data-v-76cbfa86="">
<strong data-v-76cbfa86="">곡명</strong>
</div>
</td>
<td data-v-76cbfa86="" style="background-color:#ff090a; width:20%; text-align:center;">
<div class="wiki-paragraph" data-v-76cbfa86="">
<strong data-v-76cbfa86="">비고</strong>
</div>
</td>
</tr>


In [None]:
#TODO
1. 전략: 불필요한 태그 삭제 후 텍스트 추출

tr과 td의 조합 > td가 중첩이 심함.
tr이 중첩된 경우
    첫 번째 tr > row
    그 뒤 tr > json 중첩 시킴
td
    첫 번째 td > key
    그 뒤 td > values

2. dt 하위 태그 삭제
3. href="#fn" 삭제
4. strong을 잘 쓰면 될 것 같은데...
99. colspan, rowspan을 어떻게 잘 쓸 수 있을지?

In [None]:
#목표는 JSON으로 파싱
#컬럼명은 최대한 보존하는 것으로
{
    "table_name": "profile_table",
    "columns": ["결성일", "데뷔일", "데뷔 음반", "장르", "캡틴", "소속사", "레이블", "유통사", "팬덤", "응원봉", "공식사이트", "링크"],
    "data": [
        {
            "결성일": "2017년 9월 29일",
            "데뷔일": "2018년 1월 24일",
            "데뷔 음반": "To. Heart",
            "장르": "K-POP, 댄스, 발라드, 어쿠스틱",
            "캡틴":"이새롬",
            "소속사":"플레디스엔터테인먼트",
            "레이블":"HYBE LABELS,YG PLUS,게펜 레코드",
            "유통사":"YG PLUS, 유니버셜 뮤직 재팬, 유니버셜 뮤직그룹",
            "팬덤":"flover(플로버)",
            "응원봉":"플로봉",
            "공식사이트":"한국,일본",
            "링크":"https://~~~"
        }
    ]
}

In [89]:
def simplify_html(html_content, stopwords=("행정구", "속령")):
    """Strip presentational markup from an HTML fragment.

    The fragment is parsed with html.parser, then:

    1. every <div> that has an inline ``style`` but no ``class`` is unwrapped
       (its children are kept in place),
    2. every <span> is unwrapped,
    3. every <img> is unwrapped (an <img> has no children, so it disappears),
    4. every element whose direct text contains one of ``stopwords`` is
       removed together with its whole subtree.

    Args:
        html_content: HTML markup to simplify.
        stopwords: Iterable of substrings; the parent element of any text
            node containing one of them is deleted.

    Returns:
        The simplified markup as a string with all newline characters removed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Layout-only wrappers: styled but class-less <div> elements.
    for div in soup.find_all('div'):
        if div.get('style') and not div.get('class'):
            div.unwrap()

    # <span> elements are dropped while their content is kept.
    for span in soup.find_all('span'):
        span.unwrap()

    # <img> elements carry no content, so unwrapping removes them entirely.
    for img in soup.find_all('img'):
        img.unwrap()

    for word in stopwords:
        matches = soup.find_all(string=lambda text: word in text)

        for node in matches:
            # decompose() wipes every node of the removed subtree, so a later
            # match that lived under an already-removed parent no longer has
            # a usable ``parent`` attribute; skip it instead of crashing.
            parent = getattr(node, 'parent', None)
            if parent is None:
                continue
            if parent is soup:
                # A match sitting directly under the document root: remove
                # only the text, destroying the root would break str(soup).
                node.extract()
            else:
                parent.decompose()

    return str(soup).replace('\n', '')

# Simplify the original HTML (html_origin is defined in another cell).
html_snippet = simplify_html(html_origin)

In [40]:
from html2image import Html2Image
# Renderer used to turn an HTML string into an image file.
hti = Html2Image()

# HTML string containing only the part you want to convert
# html_snippet = profile_table

# Convert the HTML snippet to an image; the cell output shows the call
# returning a list with the saved file's path.
hti.screenshot(html_str=html_snippet, save_as='specific_part.png')

['c:\\Users\\sanghyoon\\Documents\\Github\\idol_info_finder\\idol_info_finder-2\\specific_part.png']

In [24]:
from bs4 import BeautifulSoup

# Parse the snippet into a BeautifulSoup tree.
soup = BeautifulSoup(html_snippet, 'html.parser')

# The first <table> found in the document is treated as the outer table.
outer_table = soup.find("table")

# Walk every <dl> under the outer table and hoist the rows of a table
# nested inside it.
for dl in outer_table.find_all("dl"):
    nested_table = dl.find("table")
    if nested_table is None:
        # Nothing nested in this <dl>; move on.
        continue

    # Collect the nested rows and locate the <tr> that encloses this <dl>.
    nested_rows = nested_table.find_all("tr")
    parent_tr = dl.find_parent("tr")

    # Re-parent each nested row onto the enclosing <tr>.
    for tr in nested_rows:
        parent_tr.append(tr)

    # Remove what is left of the nested <table>.
    nested_table.decompose()

# Show the transformed HTML.
print(soup.prettify())

<table class="wiki-table" data-dark-style="background-color:#1f2023; color:#fff;" data-v-76cbfa86="" style="background-color:#fff; color:#000; width:100%; border:2px solid #000;">
 <tbody data-v-76cbfa86="">
  <tr data-v-76cbfa86="" style="color:#fff;">
   <td data-v-76cbfa86="" style="background-color:#00d406; text-align:center;">
    <div class="wiki-paragraph" data-v-76cbfa86="">
     <strong data-v-76cbfa86="">
      날짜
     </strong>
    </div>
   </td>
   <td data-v-76cbfa86="" style="background-color:#01acd8; width:30%; text-align:center;">
    <div class="wiki-paragraph" data-v-76cbfa86="">
     <strong data-v-76cbfa86="">
      방송사
     </strong>
    </div>
   </td>
   <td data-v-76cbfa86="" style="background-color:#01acd8; width:30%; text-align:center;">
    <div class="wiki-paragraph" data-v-76cbfa86="">
     <strong data-v-76cbfa86="">
      곡명
     </strong>
    </div>
   </td>
   <td data-v-76cbfa86="" style="background-color:#ff090a; width:20%; text-align:center;">
    <

In [2]:
from namu_crawler import NamuCrawler

# Create a NamuCrawler for the EXO-CBX page and call its methods.
# NOTE(review): the meaning of the second positional argument (0) is not
# visible here - confirm against namu_crawler.py.
my_instance = NamuCrawler("https://namu.wiki/w/EXO-CBX", 0)
my_instance.construct_toc()
# Last expression of the cell: its value (the document title) is displayed.
my_instance.get_doc_title()

'EXO-CBX'

In [None]:
#프나 기준 유효 테이블 idx 
#(개요)
#17: 프로필
#20: 멤버
#23: 역대 로고
#30: 음반
#31: 뮤비 성적
#(음반)

In [7]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [8]:
import os

# Set up the API key.
# os.getenv() only reads os.environ, so a missing key cannot be recovered by
# copying the value back onto itself (that assigns None and raises TypeError).
# Load it from a .env file instead and fail with a clear message if it is
# still absent.
if not os.environ.get("OPENAI_API_KEY"):
    from dotenv import load_dotenv

    load_dotenv()
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it or add it to a .env file."
        )

llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [9]:
# Configure LangChain tracing for this session.
# Writing None into os.environ raises an opaque TypeError, so check the key
# first and report a missing value explicitly.
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if langchain_api_key is None:
    raise RuntimeError(
        "LANGCHAIN_API_KEY is not set; export it or add it to a .env file."
    )
os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "langchain-academy"

In [167]:
# Flatten t1 into text: cells of a row are joined with commas, rows are joined
# with a blank line followed by a space.
# NOTE(review): cells that themselves contain commas are not escaped, so column
# boundaries become ambiguous in the joined text - confirm this is acceptable.
append_row = [",".join(row) for row in t1]
str_table = "\n\n ".join(append_row)

In [168]:
# Display the joined table string for inspection.
str_table

'SMTOWN WEEKf(x) + EXO : Christmas Wonderland\n\n \n\n 일시,장소\n\n 2013년 12월 24일,킨텍스\xa0제1전시장 5홀\n\n 2013년 12월 25일,킨텍스\xa0제1전시장 5홀\n\n [ 세트리스트 펼치기 ]순서곡명(원곡자)[1]참여 가수비고오프닝 VCR4Let Out The BeastEXO5Black PearlEXO멘트6HISTORYEXO10피터팬EXO-K113.6.5EXO-M12LuckyEXOVCR13Have Yourself A Merry Little Christmas(Judy Garland)첸, 루나14첫 눈EXO15Christmas DayEXO17Baby, Don`t Cry수호, 백현, 찬열, 디오18Thrift Shop(Macklemore & Ryan Lewis)크리스, 엠버19Dance Break카이, 세훈, 레이, 빅토리아, 크리스탈2012월의 기적백현, 첸, 디오, 루한21Goodbye Summer디오, 크리스탈, 루나22캔디 + 행복(H.O.T.)EXO-M + EXO-KSMTOWN VCR25늑대와 미녀EXO[2]26MAMAEXO멘트27마법의 성(더 클래식)EXO, f(x)앵콜29으르렁EXO[3]30Jingle Bell Rock(Bobby Helms)EXO, f(x)엔딩엔딩 VCR,순서,곡명(원곡자)[1],참여 가수,비고,오프닝 VCR,4,Let Out The Beast,EXO,,5,Black Pearl,EXO,,멘트,6,HISTORY,EXO,,10,피터팬,EXO-K,,11,3.6.5,EXO-M,,12,Lucky,EXO,,VCR,13,Have Yourself A Merry Little Christmas(Judy Garland),첸, 루나,14,첫 눈,EXO,,15,Christmas Day,EXO,,17,Baby, Don`t Cry,수호, 백현, 찬열, 디오,,18,Thrift Shop(Macklemore & Ryan Lewis),크리스, 엠버,,19,Dance Break,카이, 세훈, 레이, 

In [171]:
from operator import itemgetter
from langchain_core.output_parsers import JsonOutputParser

# Parser applied to the model's reply as the last step of the chain.
parser = JsonOutputParser()

# Two-message prompt: a system message with the parsing rules and a human
# message carrying the table text through the {table} template variable.
# The backslash in "[\\d+]" is escaped so the prompt text stays "[\d+]"
# without relying on an invalid escape sequence.
# NOTE: the '\n\n' inside the human message is not escaped, so the model sees
# real line breaks between the quotes.
prompt = ChatPromptTemplate.from_messages(
    [
        # role, message
        ("system", """You are an expert in data parsing and JSON conversion. Your task is to analyze and transform structured text data into properly formatted JSON objects. The data may have the following characteristics:

                1. Some rows may be entirely empty. Ignore these rows and do not include them in the final output.
                2. If a row has a different number of columns than the previous rows, it indicates merged cells or inconsistent formatting. Interpret these cases carefully to maintain data consistency.
                3. Records may be structured either by rows or by columns. You need to analyze the entire dataset to determine whether the primary record structure should be row-based or column-based before generating the JSON.
                4. Strings in the form of [\\d+] are footnotes, so they do not need to be included in the output.

                Your job is to accurately identify and handle these variations to create a clean, valid JSON output that reflects the data structure.
                Output only the final JSON representation, without any additional explanations or text.
                If the output format is a code block, please output it in JSON format with the code block removed.
                  """),
        ("human", """I have a structured string representing tabular data where each row is separated by '\n\n' and columns by commas. 
        Here is the structured string:
        <string>{table}</string>
        Parse the input, then generate JSON where each subsequent row corresponds to a JSON object with the respective header keys.
""")
    ]
)

# Inject the parser's format instructions into the prompt (currently disabled).
# prompt = prompt.partial(format_instructions=parser.get_format_instructions())

# Chain: pick 'table' from the input dict -> fill the prompt -> call the model
# -> parse the reply as JSON.
chain = {
          'table': itemgetter('table') | RunnablePassthrough()
} | prompt | llm | parser

In [172]:
# Run the chain on the flattened table text; the chain's last step is the
# JSON output parser.
result = chain.invoke({"table":str_table})

In [173]:
import pprint

# Pretty-print the parsed result.
pprint.pprint(result)

{'date': [{'date': '2013년 12월 24일', 'location': '킨텍스 제1전시장 5홀'},
          {'date': '2013년 12월 25일', 'location': '킨텍스 제1전시장 5홀'}],
 'event': 'SMTOWN WEEKf(x) + EXO : Christmas Wonderland',
 'setlist': [{'artist': 'EXO',
              'note': '',
              'order': 4,
              'title': 'Let Out The Beast'},
             {'artist': 'EXO', 'note': '', 'order': 5, 'title': 'Black Pearl'},
             {'artist': 'EXO', 'note': '', 'order': 6, 'title': 'HISTORY'},
             {'artist': 'EXO-K', 'note': '', 'order': 10, 'title': '피터팬'},
             {'artist': 'EXO-M', 'note': '', 'order': 11, 'title': '3.6.5'},
             {'artist': 'EXO', 'note': '', 'order': 12, 'title': 'Lucky'},
             {'artist': '첸, 루나',
              'note': '(Judy Garland)',
              'order': 13,
              'title': 'Have Yourself A Merry Little Christmas'},
             {'artist': 'EXO', 'note': '', 'order': 14, 'title': '첫 눈'},
             {'artist': 'EXO',
              'note': '',
    