In [7]:
!scrapy crawl bond

^C


In [6]:
!scrapy crawl bond --logfile=log.txt

^C


In [1]:
!scrapy crawl bond -s LOG_LEVEL=INFO

2024-08-19 13:42:11 [scrapy.utils.log] INFO: Scrapy 2.11.2 started (bot: bond)
2024-08-19 13:42:11 [scrapy.utils.log] INFO: Versions: lxml 4.9.3.0, libxml2 2.10.4, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 24.0.0 (OpenSSL 3.0.13 30 Jan 2024), cryptography 42.0.2, Platform Windows-10-10.0.18363-SP0
2024-08-19 13:42:11 [scrapy.addons] INFO: Enabled addons:
[]
2024-08-19 13:42:11 [scrapy.extensions.telnet] INFO: Telnet Password: bfa312124b96ff69
2024-08-19 13:42:11 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2024-08-19 13:42:11 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'bond',
 'FEED_EXPORT_ENCODING': 'utf-8',
 'LOG_LEVEL': 'INFO',
 'NEWSPIDER_MODULE': 'bond.spiders',
 'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
 'S

In [1]:
import re
import os
import json
import pdfplumber
from multiprocessing import Pool, cpu_count
from functools import partial

# PDF 파일들이 저장된 디렉토리
pdf_dir = "downloaded_pdfs"
json_file_path = "pdf_texts.json"
temp_json_path = "temp_json.json"

# 비정상 텍스트를 필터링하는 함수
def filter_text(text):
    cleaned_text = re.sub(r'[^\w\s.,;!?\'"()\-]', '', text)
    return cleaned_text

# PDF 파일을 처리하는 함수
def process_pdf(pdf_file, pdf_dir, temp_json_path):
    pdf_path = os.path.join(pdf_dir, pdf_file)
    result = {}
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            for page in pdf.pages:
                text += page.extract_text() or ""
        
        cleaned_text = filter_text(text)
        announce_date = pdf_file.split("_")[0]
        
        if cleaned_text.strip():
            result[announce_date] = cleaned_text
        
        os.remove(pdf_path)
    
    except Exception as e:
        print(f"Error processing file {pdf_file}: {e}")
    
    # 결과를 임시 JSON 파일에 저장
    if result:
        with open(temp_json_path, 'a', encoding='utf-8') as file:
            json.dump(result, file, ensure_ascii=False, indent=4)
            file.write('\n')  # 각 JSON 객체를 새 줄로 구분

# PDF 파일 목록을 가져옵니다
pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith(".pdf")]

# 멀티프로세싱을 사용하여 PDF 파일을 처리하는 함수
def process_files_in_batches(pdf_files, batch_size):
    with Pool(processes=cpu_count()) as pool:
        for i in range(0, len(pdf_files), batch_size):
            batch = pdf_files[i:i + batch_size]
            pool.map(partial(process_pdf, pdf_dir=pdf_dir, temp_json_path=temp_json_path), batch)
    
    # 모든 파일 처리가 끝난 후 JSON 파일에 합치기
    merge_json_files(temp_json_path, json_file_path)

# JSON 파일 합치기
def merge_json_files(temp_json_path, json_file_path):
    combined_data = {}
    with open(temp_json_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip():  # 비어있지 않은 줄만 처리
                data = json.loads(line)
                combined_data.update(data)
    
    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(combined_data, file, ensure_ascii=False, indent=4)
    
    os.remove(temp_json_path)

# 배치 크기를 설정하여 처리
process_files_in_batches(pdf_files, batch_size=50)

In [4]:
import fitz  # PyMuPDF
import os
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

def filter_text(text):
    cleaned_text = re.sub(r'[^\w\s.,;!?\'"()\-]', '', text)
    return cleaned_text

# PDF 파일에서 텍스트를 추출하는 함수
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        cleaned_text = filter_text(text)
        doc.close()
        return os.path.basename(pdf_path), cleaned_text
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return os.path.basename(pdf_path), None
    


# PDF 폴더 경로
pdf_folder = 'downloaded_pdfs'
output_json_file = 'pdf_texts.json'
max_threads = 6  # 적절한 스레드 수로 조정하세요

def process_pdfs(pdf_folder, output_json_file, max_threads):
    pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
    
    # 결과를 저장할 딕셔너리
    results = []

    with ThreadPoolExecutor(max_threads) as executor:
        # 스레드 풀을 통해 PDF 파일 처리
        future_to_pdf = {executor.submit(extract_text_from_pdf, pdf): pdf for pdf in pdf_files}
        for future in as_completed(future_to_pdf):
            pdf_file = future_to_pdf[future]
            try:
                pdf_name, text = future.result()
                print(pdf_name)
                if text:
                    pdf_name = pdf_name.split("_")[0]
                    result = {
                        "date": pdf_name,  # 파일의 마지막 수정 시간
                        "text": text
                    }
                    results.append(result)
            except Exception as e:
                print(f"Error processing {pdf_file}: {e}")

    # JSON 파일로 저장
    with open(output_json_file, 'w', encoding='utf-8') as json_file:
        json.dump(results, json_file, ensure_ascii=False, indent=4)

    print(f"Text extraction complete. Results saved to {output_json_file}")

# 실행
process_pdfs(pdf_folder, output_json_file, max_threads)

2008-06-12_18.pdf
2008-07-10_8.pdf
2008-07-10_5.pdf
2008-08-07_1.pdf
2008-06-12_7.pdf
2008-06-12.pdf
2008-07-10.pdf
2008-06-12_4.pdf
2008-07-10_7.pdf
2008-07-10_22.pdf
2008-05-08_3.pdf
2008-08-07_13.pdf
2008-06-12_17.pdf
2008-06-12_16.pdf
2008-07-10_2.pdf
2008-06-12_14.pdf
2008-06-12_1.pdf
2008-06-12_15.pdf
2008-08-07_11.pdf
2008-07-10_4.pdf
2008-07-10_1.pdf
2008-07-10_20.pdf
2008-07-10_14.pdf
2008-06-12_12.pdf
2008-07-10_21.pdf
2008-07-10_10.pdf
2008-07-10_12.pdf
2008-07-10_16.pdf
2008-06-12_2.pdf
2008-07-10_9.pdf
2008-07-10_6.pdf
2008-07-10_11.pdf
2008-07-10_15.pdf
2008-08-07.pdf
2008-06-12_11.pdf
2008-06-12_10.pdf
2008-07-10_19.pdf
2008-08-07_10.pdf
2008-06-12_9.pdf
2008-06-12_6.pdf
2008-06-12_3.pdf
2008-06-12_13.pdf
2008-05-08_1.pdf
2008-07-10_18.pdf
2008-07-10_17.pdf
2008-07-10_13.pdf
2008-06-12_5.pdf
2008-06-12_8.pdf
2008-07-10_3.pdf
2008-05-08_2.pdf
2008-05-08.pdf
2008-08-07_16.pdf
2008-08-07_17.pdf
2008-08-07_12.pdf
2008-08-07_19.pdf
2008-08-07_18.pdf
2008-08-07_21.pdf
2008-08-

In [3]:
 pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith('.pdf')]
pdf_files

['downloaded_pdfs\\2008-05-08.pdf',
 'downloaded_pdfs\\2008-05-08_1.pdf',
 'downloaded_pdfs\\2008-05-08_2.pdf',
 'downloaded_pdfs\\2008-05-08_3.pdf',
 'downloaded_pdfs\\2008-06-12.pdf',
 'downloaded_pdfs\\2008-06-12_1.pdf',
 'downloaded_pdfs\\2008-06-12_10.pdf',
 'downloaded_pdfs\\2008-06-12_11.pdf',
 'downloaded_pdfs\\2008-06-12_12.pdf',
 'downloaded_pdfs\\2008-06-12_13.pdf',
 'downloaded_pdfs\\2008-06-12_14.pdf',
 'downloaded_pdfs\\2008-06-12_15.pdf',
 'downloaded_pdfs\\2008-06-12_16.pdf',
 'downloaded_pdfs\\2008-06-12_17.pdf',
 'downloaded_pdfs\\2008-06-12_18.pdf',
 'downloaded_pdfs\\2008-06-12_2.pdf',
 'downloaded_pdfs\\2008-06-12_3.pdf',
 'downloaded_pdfs\\2008-06-12_4.pdf',
 'downloaded_pdfs\\2008-06-12_5.pdf',
 'downloaded_pdfs\\2008-06-12_6.pdf',
 'downloaded_pdfs\\2008-06-12_7.pdf',
 'downloaded_pdfs\\2008-06-12_8.pdf',
 'downloaded_pdfs\\2008-06-12_9.pdf',
 'downloaded_pdfs\\2008-07-10.pdf',
 'downloaded_pdfs\\2008-07-10_1.pdf',
 'downloaded_pdfs\\2008-07-10_10.pdf',
 'downlo

In [7]:
import pandas as pd
import json

fp = 'pdf_texts.json'
df = pd.read_json(fp)

df.rename(columns={'text': 'content'}, inplace=True)
df['content'] = df['content'].str.replace('\n', ' ', regex=False)

In [9]:
df.to_csv('bond.csv', index = False)