# Earth System Science Data 크롤러

- 2009 ~ 2019년 () 게재 논문 저자와 이메일을 크롤링하는 크롤러입니다.
- 크롤링 결과는 'Platform', 'Year', 'Volume', 'Author', 'Email', 'URL'의 6개 칼럼으로 구성된 데이터프레임으로 정리하였습니다.


In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pandas as pd

## 1. URL 모으기

- volume 별(`vol_list`), issue 별(`issues`) url을 모으는 부분. 이후에 크롤링 부분에서 여기서 모은 issue 별 url에 접속하여 개별 article의 url을 `article_url` 변수에 저장하여 활용하였음.

In [None]:
root = 'https://www.earth-syst-sci-data.net/volumes.html'

driver = webdriver.Chrome('../chromedriver.exe')

driver.get(root)

vol_list = driver.find_elements_by_class_name('grid-100.volumes')

issues = []
for vol in vol_list:
    issues.extend(vol.find_elements_by_css_selector('ul > li > span > a'))
    
issues = [issue.get_attribute('href') for issue in issues]

issues = issues[1:]   #2020년 issue 제외

## 2. 크롤링

In [99]:
df = pd.DataFrame(columns = ['Platform', 'Year', 'Volume', 'Author', 'Email', 'URL'])

#driver = webdriver.Chrome('../chromedriver.exe')
#wait = WebDriverWait(driver, 10)

for issue in issues:
    driver.get(issue)
    article_list = driver.find_elements_by_class_name('article-title')
    article_url = [a.get_attribute('href') for a in article_list]
    for article in article_url:
        driver.get(article)
        short = driver.find_element_by_class_name('authors-short')
        short.click()
        try:
            correspondence = driver.find_element_by_class_name('correspondence-statement').text
            author = correspondence.split('(')[0].replace('Correspondence: ', '').strip()
            email = correspondence.split('(')[-1].replace(')', '')
            year = article.split('/')[-2]
            volume = article.split('/')[-4]
            df = df.append({'Platform': 'Earth System Science Data', 'Year': year, 'Volume': volume, 'Author': author, 'Email': email, 'URL': article}, ignore_index=True)
        except:
            pass

print('finished')

finished


## 3. 확인 후 저장

In [100]:
df

Unnamed: 0,Platform,Year,Volume,Author,Email,URL
0,Earth System Science Data,2019,11,Michal Mikolaj,mikolaj@gfz-potsdam.de,https://www.earth-syst-sci-data.net/11/1501/2019/
1,Earth System Science Data,2019,11,Panagiotis Athanasiou,panos.athanasiou@deltares.nl,https://www.earth-syst-sci-data.net/11/1515/2019/
2,Earth System Science Data,2019,11,Cristian Lussana,critianl@met.no,https://www.earth-syst-sci-data.net/11/1531/2019/
3,Earth System Science Data,2019,11,Matthew Gard,matthew.gard@adelaide.edu.au,https://www.earth-syst-sci-data.net/11/1553/2019/
4,Earth System Science Data,2019,11,Adam S. Ward,adamward@indiana.edu,https://www.earth-syst-sci-data.net/11/1567/2019/
...,...,...,...,...,...,...
362,Earth System Science Data,2015,7,M. P. Humphreys,m.p.humphreys@soton.ac.uk,https://www.earth-syst-sci-data.net/7/127/2015/
363,Earth System Science Data,2015,7,T. W. Estilow,thomas.estilow@rutgers.edu,https://www.earth-syst-sci-data.net/7/137/2015/
364,Earth System Science Data,2015,7,M. Tanguy,malngu@ceh.ac.uk,https://www.earth-syst-sci-data.net/7/143/2015/
365,Earth System Science Data,2014,6,U. Löptien,uloeptien@geomar.de,https://www.earth-syst-sci-data.net/6/367/2014/


In [101]:
df.to_csv('../csv/Earth_System_Science_Data.txt', encoding = 'utf-8')