In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import pandas as pd
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, ElementNotInteractableException  #(클릭시 없을때, 엘리멘트 자체가 없을떄, 엘리멘트가 상호작용을 못할때 )
import os
import shutil 
from urllib.request import urlretrieve
import re



# Settings so the scrape proceeds without opening a browser window.
chrome_options = webdriver.ChromeOptions()
for _flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_flag)

### get_politician_info(dir_name, wd) : 국회의원 정보 스크래핑

In [2]:
def get_politician_info(dir_name, wd):
    """Read the profile fields of the politician page currently loaded in ``wd``.

    The profile section is located by CSS selector.  The text of every ``dt``
    tag inside it is used as a key and the text of every ``dd`` tag as a value,
    paired by position.

    Args:
        dir_name: Not used by this function.
        wd: Selenium WebDriver showing a politician detail page.

    Returns:
        dict mapping ``dt`` text to ``dd`` text.  When the two lists differ in
        length, the surplus entries are dropped by ``zip``.
    """
    section = wd.find_element(
        By.CSS_SELECTOR,
        '#contents > div.sp-person.contents > section.person-info.spacial-page.cl',
    )
    labels = [dt.text for dt in section.find_elements(By.TAG_NAME, 'dt')]
    values = [dd.text for dd in section.find_elements(By.TAG_NAME, 'dd')]
    return dict(zip(labels, values))

### get_news_quote(dir_name, name, wd): 뉴스 인용문 스크래핑

In [3]:
def get_news_quote(dir_name, name, wd, news_max = 3):
    """Write up to ``news_max`` news quotes from the current page to a text file.

    Quotes are the ``title`` elements inside ``#newsInQuotList``.  Each quote is
    written on its own line to ``<dir_name>/<name>뉴스 인용문.txt``.  Paging goes
    through the "next" link of ``#newsInQuotListPaging`` and stops when the
    limit is reached or when that link has no ``onclick`` attribute.

    Args:
        dir_name: Directory the output file is created in (must already exist).
        name: Politician name, used as the file-name prefix.
        wd: Selenium WebDriver with the news-quote tab of a detail page open.
        news_max: Stop after this many quotes have been written.

    Raises:
        Whatever the initial total-count lookup or ``open`` raises.  Errors
        raised while paging are printed and end the scrape (best effort).
    """
    news_count = 0
    # Counter text with its first character and last two characters removed;
    # only used for the progress output below.
    news_total = wd.find_element(By.ID, 'newsInQoutTotalCount').text[1:-2]
    file_path = os.path.join(dir_name, name + '뉴스 인용문.txt')
    next_selector = '#newsInQuotListPaging > a.page-next.page-link'

    # `with` guarantees the file is closed even when the run is interrupted
    # (KeyboardInterrupt is not an `Exception`, so the handler below misses it).
    with open(file_path, 'w', encoding='UTF8') as news_file:
        page_no = 0
        while True:
            try:
                page_no += 1
                print(f"---------(뉴스 인용문 {page_no} 페이지)----------")
                time.sleep(1)

                box_list = wd.find_element(By.ID, 'newsInQuotList')
                for quote in box_list.find_elements(By.CLASS_NAME, 'title'):
                    news_file.write(quote.text + '\n')
                    news_count += 1
                    print(f"{news_count}/{news_total}", [quote.text])
                    print()
                    if news_count >= news_max:
                        break

                # Check the limit BEFORE paging so we do not load (and wait for)
                # a page whose content will never be read.
                if news_count >= news_max:
                    break

                paging = wd.find_element(By.ID, 'newsInQuotListPaging')
                next_button = paging.find_element(By.CSS_SELECTOR, next_selector)
                # A next link without an onclick handler is treated as "last page".
                if next_button.get_attribute('onclick') is None:
                    break

                next_button.click()
                time.sleep(3)

            except Exception as e:
                # Best effort: report the problem and keep what was written so far.
                print(e)
                break

### get_record_quote(dir_name, name, wd) : 국회의원 회의록 인용문 스크래핑

In [4]:
def get_record_quote(dir_name, name, wd, record_max = 3):
    """Write the quotes of up to ``record_max`` meeting records to a text file.

    For every link in ``#recordTableList`` the link is clicked and the text of
    each ``li`` in ``#birefTableList`` is written on its own line to
    ``<dir_name>/<name>회의록 인용문.txt``.  After a page is finished the scraper
    moves on through the record pager (``#recordTableListPaging``) and stops
    when the limit is reached or the next link has no ``onclick`` attribute.

    Args:
        dir_name: Directory the output file is created in (must already exist).
        name: Politician name, used as the file-name prefix.
        wd: Selenium WebDriver with the meeting-record tab of a detail page open.
        record_max: Stop after this many records have been processed.

    Raises:
        Whatever the initial pager/table lookups, ``int`` conversion or ``open``
        raise.  Errors raised while scraping are printed and end the scrape.
    """
    record_count = 0
    total_page = int(wd.find_element(By.CSS_SELECTOR, '#recordTableListPaging > span.total').text)
    rows_on_page = len(wd.find_element(By.ID, 'recordTableList').find_elements(By.TAG_NAME, 'tr'))
    # Progress denominator only: pages x rows currently shown, so it is an estimate.
    totals = total_page * rows_on_page

    # implicitly_wait sets the driver's element-lookup timeout; it does not
    # pause execution, so configuring it once is sufficient.
    wd.implicitly_wait(10)

    file_path = os.path.join(dir_name, name + '회의록 인용문.txt')
    # NOTE(review): assumes the record pager's next link uses the same classes
    # as the news pager's (`a.page-next.page-link`) - confirm against the page.
    next_selector = '#recordTableListPaging > a.page-next.page-link'

    # `with` guarantees the file is closed even when the run is interrupted.
    with open(file_path, 'w', encoding='UTF8') as record_file:
        page_no = 0
        while True:
            try:
                page_no += 1
                print(f"---------(회의록 인용문 {page_no} 페이지)----------")

                record_table = wd.find_element(By.ID, 'recordTableList')
                for record in record_table.find_elements(By.TAG_NAME, 'a'):
                    record.click()

                    quote_list = wd.find_element(By.ID, 'birefTableList')
                    quote_texts = [item.text for item in quote_list.find_elements(By.TAG_NAME, 'li')]
                    for text in quote_texts:
                        record_file.write(text + '\n')

                    record_count += 1
                    # Show the last quote of this record; empty list when it has none.
                    print(f"{record_count}/{totals}", quote_texts[-1:])
                    print()

                    if record_count >= record_max:
                        break

                if record_count >= record_max:
                    break

                # Page only after every record of the current page was handled,
                # and use the record pager rather than the news pager.
                paging = wd.find_element(By.ID, 'recordTableListPaging')
                next_button = paging.find_element(By.CSS_SELECTOR, next_selector)
                # A next link without an onclick handler is treated as "last page".
                if next_button.get_attribute('onclick') is None:
                    break

                next_button.click()
                time.sleep(3)

            except Exception as e:
                # Best effort: report the problem and keep what was written so far.
                print(e)
                break

### scraping(): 스크래핑 함수

In [5]:
def scraping(dir_name, politician_max=3):
    """Scrape profile, photo, news quotes and record quotes for politicians.

    Opens Chrome with the module-level ``chrome_options`` and uses two tabs:
    the first holds the paginated politician list, the second loads each
    politician's detail page.  For every politician the profile image is
    downloaded to ``<dir_name>/<name>.jpg``, the profile fields are collected
    with ``get_politician_info`` and the quote files are written by
    ``get_news_quote`` and ``get_record_quote``.

    Args:
        dir_name: Existing directory that receives the images and quote files.
        politician_max: Stop after this many politicians have been processed.

    Returns:
        pandas.DataFrame with one row per politician whose profile was read.

    Notes:
        ``AttributeError`` and ``NoSuchElementException`` are printed and end the
        scrape; other exceptions propagate.  The browser is shut down either way.
    """
    page_no = 0
    politician_count = 0
    # Collect plain dicts and build the frame once at the end;
    # DataFrame.append no longer exists in pandas 2.x.
    politician_records = []

    # The driver executable is resolved by Selenium itself; passing the path
    # positionally is rejected by current Selenium 4 releases.
    wd = webdriver.Chrome(options=chrome_options)
    try:
        wd.execute_script('window.open("about:blank", "_blank");')
        tabs = wd.window_handles
        list_tab, detail_tab = tabs[0], tabs[1]

        while True:
            try:
                page_no += 1
                wd.switch_to.window(list_tab)

                url = f"https://www.bigkinds.or.kr/v2/depthAnalysis/assembly.do?page={page_no}"
                wd.get(url)

                politician_total = wd.find_element(By.XPATH, '//*[@id="contents"]/section[1]/div/div/div[1]/div[2]').text
                politician_items = wd.find_elements(By.CSS_SELECTOR, '#contents > section.spacial-person.spacial-page > div > ul > li')

                # An empty list page means we ran past the last page.
                if not politician_items:
                    break

                for item in politician_items:
                    # The list elements belong to the first tab; switch back before touching them.
                    wd.switch_to.window(list_tab)

                    politician_count += 1

                    print('--------------------------------------')
                    print(f"[국회의원 {politician_count}/{politician_total}명]" )
                    name = item.find_element(By.CLASS_NAME, 'sp-kname').text
                    print(name)

                    detail_link = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    wd.switch_to.window(detail_tab)
                    time.sleep(1)
                    wd.get(detail_link)
                    time.sleep(1)

                    print("[프로필 이미지 다운로드]")
                    profile_image = wd.find_element(By.CLASS_NAME, 'thumb')
                    image_src = profile_image.find_element(By.TAG_NAME, 'img').get_attribute('src')
                    file_name = os.path.join(dir_name, name + '.jpg')
                    print(file_name)
                    urlretrieve(image_src, file_name)

                    print("[기본 정보 스크래핑]")
                    politician_dic = get_politician_info(dir_name, wd)
                    print("    ",politician_dic)
                    politician_records.append(politician_dic)

                    print("[뉴스 인용문 스크래핑]")
                    wd.find_element(By.CSS_SELECTOR, 'li.analysisTab-01.ui-state-active').find_element(By.TAG_NAME, 'a').click()
                    time.sleep(1)
                    get_news_quote(dir_name, name, wd)

                    print("[회의록 인용문 스크래핑]")
                    wd.find_element(By.CSS_SELECTOR, '#contents > div.sp-person.contents > section.person-desc.spacial-page > div > ul > li:nth-child(2) > a').send_keys(Keys.ENTER)
                    time.sleep(3)
                    wd.implicitly_wait(10)
                    get_record_quote(dir_name, name, wd)

                    if politician_count >= politician_max:
                        break

                if politician_count >= politician_max:
                    break

            except (AttributeError, NoSuchElementException) as e:
                print(e)
                break
    finally:
        # quit() ends the whole browser session (both tabs and the driver
        # process); close() would only close the current window.
        wd.quit()

    return pd.DataFrame(politician_records)


### 스크래핑 시작

In [6]:
# Prepare a fresh output directory, then run the scraper.
dir_name = './politicial' # output folder to create
if os.path.isdir(dir_name): # if the folder already exists
    shutil.rmtree(dir_name) # delete it together with everything inside

os.makedirs(dir_name)
print(f"'DIR_NAME': {dir_name} 디렉토리 생성")

# Uses the default politician_max of scraping(); the result is one DataFrame.
politician_df = scraping(dir_name)

'DIR_NAME': ./politicial 디렉토리 생성
--------------------------------------
[국회의원 1/295명]
강기윤(姜起潤)
[프로필 이미지 다운로드]
./politicial/강기윤(姜起潤).jpg
[기본 정보 스크래핑]
     {'지역구': '경남 창원시성산구', '당선기록': '재선(19대, 21대)', '소속위원회': '보건복지위원회', '사무실전화': '02-784-1751', '보좌관': '김홍광 , 한영애', '이메일주소': 'ggotop@naver.com', '홈페이지': 'http://blog.naver.com/ggotop', '경력': '[학력] 마산공고(26회) 창원대학교 행정학과 중앙대학교 행정대학원 지방의회과 석사 창원대학교 대학원 행정학 박사 [경력] 현) 국회 보건복지위원회 국민의힘 간사 현) 국민의힘 소상공인살리기 특별위원회 부위원장 현) 국민의힘 코로나19 대책 특별위원회 위원 미래통합당 경남도당 민생특위 위원장 제19대 국회의원 (새누리당/경남 창원시 성산구) 새누리당 원내부대표'}
[뉴스 인용문 스크래핑]
---------(뉴스 인용문 1 페이지)----------


KeyboardInterrupt: 

In [None]:
# Display the DataFrame returned by scraping() in the previous cell.
politician_df