# 1. 주제선정
* 실습 주제: Kaggle 대회 데이터 크롤링
* 목표: Kaggle 사이트에서 개최한 대회(competition)의 제목, 설명, 개최 연도, 상금 등의 데이터를 크롤링해 메타데이터를 얻는다.
* 대상 사이트: [Kaggle Competitions](https://www.kaggle.com/competitions)

# 2. 데이터수집

## 데이터 종류 정의
* https://www.kaggle.com/competitions/
    * 제목
    * URL
    * 설명
    * 상금

* https://www.kaggle.com/c/{대회}
    * 시작 날짜
    * 주최 기관

In [1]:
# Accumulators filled from the competition listing pages
# (one entry per competition, same order in every list).
title_list, url_list, desc_list, prize_list = [], [], [], []

# Accumulators filled from each competition's own page.
date_list, org_list = [], []

## 페이지 크롤링 및 정보 추출
* https://www.kaggle.com/competitions
* 제목, URL, 설명, 상금
* selenium을 이용하여 크롤링

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [3]:
# Location of the ChromeDriver executable; change this to match your machine.
driver_path = r'C:\JupyterLab\chromedriver.exe'
# Site root, used to turn relative links into absolute URLs.
base_url = 'https://www.kaggle.com'
# Competition listing page, derived from the site root.
competitions_url = f'{base_url}/competitions'

In [4]:
# Start a Chrome session driven by the ChromeDriver binary at driver_path.
# NOTE(review): passing the driver path positionally is the Selenium 3 calling
# convention; newer Selenium releases expect a Service object instead —
# confirm the installed version before upgrading.
driver = webdriver.Chrome(driver_path)

In [5]:
# Open the competition listing page and print the page title as a quick
# check of which page was loaded.
driver.get(competitions_url)
print(driver.title)

Kaggle Competitions


### 1페이지 크롤링
* https://www.kaggle.com/competitions?page=1 이
* https://www.kaggle.com/competitions 로 리다이렉트됨
* 1페이지는 따로 크롤링

In [6]:
# Click the "All competitions" button.
# The site's CSS class names are auto-generated, so the selectors below rely
# on document structure instead of class names.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Locate with a (By, selector) pair: the find_element_by_* helpers were removed
# from recent Selenium releases, while By-based locators work in old and new
# versions alike. Waiting for "clickable" also avoids clicking before the
# button has been rendered.
all_competitions_button = (
    By.CSS_SELECTOR,
    "#site-content > div:nth-child(4) > div > div > div:nth-child(2) > div > button:nth-child(1)")
elem = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(all_competitions_button))
elem.click()

# Wait (up to 10 seconds) until the first entry of the results list is present.
_ = WebDriverWait(driver, 10).until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, '#site-content > div:nth-child(4) > div > div > div > ul > li:nth-child(1) > div > a > div > div')))

In [7]:
from bs4 import BeautifulSoup

# 데이터 추출 함수 정의
# 전역 변수 list에 append()하여 저장
def extract_info(page):
    """Parse one competition listing page and record every competition on it.

    For each list item in the results list, the title, absolute URL,
    description and prize text are appended to the module-level lists
    ``title_list``, ``url_list``, ``desc_list`` and ``prize_list``.

    Args:
        page: HTML source of a competition listing page.

    Returns:
        None. Results are stored in the module-level lists.

    Raises:
        ValueError: If the results list cannot be found in ``page``.
    """
    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')
    ul = soup.select_one('#site-content > div:nth-child(4) > div > div > div > ul')
    if ul is None:
        raise ValueError('competition results list not found in page')

    # Iterating a Tag directly also yields text nodes (e.g. whitespace between
    # items), which cannot be queried with selectors; visit only the direct
    # <li> children.
    for li in ul.find_all('li', recursive=False):
        title = li.select_one('div > a > div:nth-child(2) > div').get_text()
        # The href is site-relative, so prefix it with the site root.
        url = f"{base_url}{li.select_one('div > a')['href']}"
        desc = li.select_one('div > a > div:nth-child(2) > span').get_text()
        prize = li.select_one('div > a > div > div > div').get_text()

        title_list.append(title)
        url_list.append(url)
        desc_list.append(desc)
        prize_list.append(prize)

In [8]:
# Record the competitions shown on the page currently loaded in the browser.
extract_info(driver.page_source)

### 2페이지~끝 크롤링
* https://www.kaggle.com/competitions?page={번호}

In [9]:
import time

url = 'https://www.kaggle.com/competitions?page={}'
# Start at page 25 to avoid excessive crawling; set this to 2 to crawl every
# remaining page.
page_no = 25

while True:
    driver.get(url.format(page_no))
    # Give the results time to render. Pages with and without results are laid
    # out differently, so a fixed sleep is used instead of WebDriverWait.
    time.sleep(3)

    # Read the page source once so the end check and the extraction below see
    # the same snapshot.
    page_source = driver.page_source

    # Past the last page the site reports that nothing was found: stop.
    if "No results found" in page_source:
        break

    # Extract the data. Also stop when a page contributes nothing, so that a
    # change in the end-of-results message cannot make this loop run forever.
    count_before = len(title_list)
    extract_info(page_source)
    if len(title_list) == count_before:
        break

    page_no += 1

    # Pause between requests to go easy on the server.
    time.sleep(1)

# Every field list must have received at least one entry. Raise explicitly
# rather than assert, because assert statements are skipped under `python -O`.
for collected in [title_list, url_list, desc_list, prize_list]:
    if not collected:
        raise RuntimeError('no competition data was collected')

### 결과물 확인

In [10]:
# Preview the first five scraped competitions, one field per line, with a
# dashed rule before the first entry and after each entry.
separator = '-' * 50
print(separator)
preview = zip(title_list[:5], url_list[:5], desc_list[:5], prize_list[:5])
for title, url, desc, prize in preview:
    print(title, url, desc, prize, sep='\n')
    print(separator)

--------------------------------------------------
Wikipedia - Image/Caption Matching
https://www.kaggle.com/c/wikipedia-image-caption
Retrieve captions based on images
Swag
--------------------------------------------------
Tabular Playground Series - Sep 2021
https://www.kaggle.com/c/tabular-playground-series-sep-2021
Practice your ML skills on this approachable dataset!
Swag
--------------------------------------------------
Lux AI
https://www.kaggle.com/c/lux-ai-2021
Gather the most resources and survive the night!
$10,000
--------------------------------------------------
chaii - Hindi and Tamil Question Answering
https://www.kaggle.com/c/chaii-hindi-and-tamil-question-answering
Identify the answer to questions found in Indian language passages
$10,000
--------------------------------------------------
Google Landmark Retrieval 2021
https://www.kaggle.com/c/landmark-retrieval-2021
Given an image, can you find all of the same landmarks in a dataset?
Swag
---------------------------

## URL 이용하여 추가정보 수집
* https://www.kaggle.com/c/{대회}
* 시작 날짜, 주최 기관
* 마감, 종료 날짜는 구성이 달라 찾기 어려운 페이지가 있어서 포기

In [11]:
for url in url_list:
    driver.get(url)
    # Wait (up to 10 seconds) until the competition overview is present.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '.competition-overview')))

    # Name the parser explicitly so the parse does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Start date: the tooltip text of the first matching timeline point.
    # Record None when it is missing so that one unusual page neither aborts
    # the crawl nor leaves date_list out of step with url_list.
    date_tag = soup.select_one('div.horizontal-timeline__point > div.horizontal-timeline__point-tooltip-wrapper > span')
    date = date_tag.get('data-tooltip') if date_tag is not None else None
    date_list.append(date)

    # Deadline and close dates were abandoned: some pages are structured
    # differently, which makes them hard to locate. The attempted selectors
    # are kept below for reference.
#     try:
#         deadline = soup.select_one('div.horizontal-timeline__milestone-placeholder > div > div.horizontal-timeline__point-tooltip-wrapper > span')['data-tooltip']
#     except:
#         deadline = None
#     close = soup.select_one('div.horizontal-timeline__point--future > div.horizontal-timeline__point-tooltip-wrapper > span')['data-tooltip']

    # Organization name: record None when the header does not contain one.
    # An explicit None check replaces the former bare `except:`, which would
    # also have hidden unrelated errors.
    org_tag = soup.select_one('#site-content > div.competition > div > div.competition-header__container > div > div > div.competition-header__top > div.pageheader__top--safe > div.competition-header__details > div > ul > li:nth-child(1) > span > span.competition-header__organization-name')
    org = org_tag.get_text() if org_tag is not None else None
    org_list.append(org)

    # Pause between requests to go easy on the server.
    time.sleep(1)

## DataFrame 생성

In [12]:
import pandas as pd

# Assemble the scraped fields into one table; each list supplies one column.
df = pd.DataFrame(dict(
    title=title_list,
    url=url_list,
    description=desc_list,
    organization=org_list,
    prize=prize_list,
    start_date=date_list,
))
df

Unnamed: 0,title,url,description,organization,prize,start_date
0,Wikipedia - Image/Caption Matching,https://www.kaggle.com/c/wikipedia-image-caption,Retrieve captions based on images,Wikimedia Foundation,Swag,"Sep 14, 2021"
1,Tabular Playground Series - Sep 2021,https://www.kaggle.com/c/tabular-playground-se...,Practice your ML skills on this approachable d...,Kaggle,Swag,"Sep 1, 2021"
2,Lux AI,https://www.kaggle.com/c/lux-ai-2021,Gather the most resources and survive the night!,Lux AI Challenge,"$10,000","Aug 17, 2021"
3,chaii - Hindi and Tamil Question Answering,https://www.kaggle.com/c/chaii-hindi-and-tamil...,Identify the answer to questions found in Indi...,Google,"$10,000","Aug 12, 2021"
4,Google Landmark Retrieval 2021,https://www.kaggle.com/c/landmark-retrieval-2021,"Given an image, can you find all of the same l...",Google,Swag,"Aug 11, 2021"
5,Google Landmark Recognition 2021,https://www.kaggle.com/c/landmark-recognition-...,"Label famous, and not-so-famous, landmarks in ...",Google,Swag,"Aug 11, 2021"
6,NFL Health & Safety - Helmet Assignment,https://www.kaggle.com/c/nfl-health-and-safety...,Segment and label helmets in video footage,The National Football League,"$100,000","Aug 11, 2021"
7,LearnPlatform COVID-19 Impact on Digital Learning,https://www.kaggle.com/c/learnplatform-covid19...,Use digital learning data to analyze the impac...,LearnPlatform,"$20,000","Aug 3, 2021"
8,Tabular Playground Series - Aug 2021,https://www.kaggle.com/c/tabular-playground-se...,Practice your ML skills on this approachable d...,Kaggle,Swag,"Aug 1, 2021"
9,RSNA-MICCAI Brain Tumor Radiogenomic Classific...,https://www.kaggle.com/c/rsna-miccai-brain-tum...,Predict the status of a genetic biomarker impo...,Radiological Society of North America,"$30,000","Jul 14, 2021"


## 마무리

In [13]:
# End the WebDriver session. quit() closes every window and shuts down the
# driver process, whereas close() only closes the current window and would
# leave the driver process running.
driver.quit()

# Open API
* 한국환경공단 에어코리아 미세먼지 경보 발령 현황
* https://www.data.go.kr/tcs/dss/selectApiDataDetailView.do?publicDataPk=15073885

## 데이터 가져오기
* requests 모듈로 API 호출
* xmltodict, json 모듈로 데이터 파싱

In [1]:
import os

import requests

# NOTE(security): a real service key is committed in this file. Prefer setting
# the AIRKOREA_SERVICE_KEY environment variable; the literal below is kept only
# as a fallback so the notebook still runs, and should be rotated and removed.
# The key is stored already percent-encoded (the environment variable must be
# too), so it is formatted into the URL as-is instead of being passed through
# `params`, which would encode it a second time.
serviceKey = os.environ.get(
    'AIRKOREA_SERVICE_KEY',
    '7bWg%2BZ5MQoQsRHO2iRbkApDKVivKsEO5Gj3cx2hoBBDDpBEH7BAd7ggQxo6yGhKGUu0WhoZf1k2KUC%2BmD9e38A%3D%3D')
year = 2021

url = 'http://apis.data.go.kr/B552584/UlfptcaAlarmInqireSvc/getUlfptcaAlarmInfo?serviceKey={}&year={}'.format(serviceKey, year)
# requests waits indefinitely by default; bound the wait so a stalled server
# cannot hang the notebook.
res = requests.get(url, timeout=10)
# Fail here on an HTTP error status instead of later, while parsing the body.
res.raise_for_status()
res

<Response [200]>

In [2]:
# Show the first 1000 characters of the response body to inspect its format.
res.text[:1000]

'<?xml version="1.0" encoding="UTF-8"?>\r\n<response>\n  <header>\n    <resultCode>00</resultCode>\n    <resultMsg>NORMAL_CODE</resultMsg>\n  </header>\n  <body>\n    <items>\n      <item>\n        <clearVal>72</clearVal>\n        <sn>3483</sn>\n        <districtName>경남</districtName>\n        <dataDate>2021-05-25</dataDate>\n        <issueVal>161</issueVal>\n        <issueTime>11:00</issueTime>\n        <clearDate>2021-05-25</clearDate>\n        <issueDate>2021-05-25</issueDate>\n        <moveName>하동권역</moveName>\n        <clearTime>14:00</clearTime>\n        <issueGbn>주의보</issueGbn>\n        <itemCode>PM10</itemCode>\n      </item>\n      <item>\n        <clearVal>81</clearVal>\n        <sn>3475</sn>\n        <districtName>강원</districtName>\n        <dataDate>2021-05-25</dataDate>\n        <issueVal>182</issueVal>\n        <issueTime>02:00</issueTime>\n        <clearDate>2021-05-25</clearDate>\n        <issueDate>2021-05-25</issueDate>\n        <moveName>영동남부</moveName>\n        <cle

In [3]:
import json
import xmltodict

# Parse the XML response body, then round-trip the result through JSON so that
# `data` consists only of built-in containers and scalar values.
data = json.loads(json.dumps(xmltodict.parse(res.text)))
data

{'response': {'header': {'resultCode': '00', 'resultMsg': 'NORMAL_CODE'},
  'body': {'items': {'item': [{'clearVal': '72',
      'sn': '3483',
      'districtName': '경남',
      'dataDate': '2021-05-25',
      'issueVal': '161',
      'issueTime': '11:00',
      'clearDate': '2021-05-25',
      'issueDate': '2021-05-25',
      'moveName': '하동권역',
      'clearTime': '14:00',
      'issueGbn': '주의보',
      'itemCode': 'PM10'},
     {'clearVal': '81',
      'sn': '3475',
      'districtName': '강원',
      'dataDate': '2021-05-25',
      'issueVal': '182',
      'issueTime': '02:00',
      'clearDate': '2021-05-25',
      'issueDate': '2021-05-25',
      'moveName': '영동남부',
      'clearTime': '13:00',
      'issueGbn': '주의보',
      'itemCode': 'PM10'},
     {'clearVal': '94',
      'sn': '3481',
      'districtName': '경남',
      'dataDate': '2021-05-25',
      'issueVal': '154',
      'issueTime': '11:00',
      'clearDate': '2021-05-25',
      'issueDate': '2021-05-25',
      'moveName': '고

In [4]:
import pandas as pd

# xmltodict represents a repeated element as a list, a single element as a
# dict, and an empty element as None. Normalize all three shapes to a list of
# records so that a response with zero or one <item> still yields a DataFrame.
items = data['response']['body'].get('items') or {}
records = items.get('item') or []
if isinstance(records, dict):
    records = [records]

df = pd.DataFrame(records)
df

Unnamed: 0,clearVal,sn,districtName,dataDate,issueVal,issueTime,clearDate,issueDate,moveName,clearTime,issueGbn,itemCode
0,72,3483,경남,2021-05-25,161,11:00,2021-05-25,2021-05-25,하동권역,14:00,주의보,PM10
1,81,3475,강원,2021-05-25,182,02:00,2021-05-25,2021-05-25,영동남부,13:00,주의보,PM10
2,94,3481,경남,2021-05-25,154,11:00,2021-05-25,2021-05-25,고성권역,14:00,주의보,PM10
3,76,3482,전남,2021-05-25,157,11:00,2021-05-25,2021-05-25,동부권역,14:00,주의보,PM10
4,90,3484,경남,2021-05-25,152,12:00,2021-05-25,2021-05-25,사천권역,14:00,주의보,PM10
5,96,3477,경기,2021-05-25,109,08:00,2021-05-25,2021-05-25,동부권,09:00,주의보,PM10
6,63,3486,경북,2021-05-25,182,14:00,2021-05-25,2021-05-25,울릉권역,15:00,주의보,PM10
7,62,3478,경남,2021-05-25,174,08:00,2021-05-25,2021-05-25,남해권역,15:00,주의보,PM10
8,80,3487,부산,2021-05-25,151,14:00,2021-05-25,2021-05-25,중부권역,16:00,주의보,PM10
9,66,3479,경남,2021-05-25,180,10:00,2021-05-25,2021-05-25,통영권역,15:00,주의보,PM10


In [5]:
# Inspect column dtypes and non-null counts before any type conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   clearVal      10 non-null     object
 1   sn            10 non-null     object
 2   districtName  10 non-null     object
 3   dataDate      10 non-null     object
 4   issueVal      10 non-null     object
 5   issueTime     10 non-null     object
 6   clearDate     10 non-null     object
 7   issueDate     10 non-null     object
 8   moveName      10 non-null     object
 9   clearTime     10 non-null     object
 10  issueGbn      10 non-null     object
 11  itemCode      10 non-null     object
dtypes: object(12)
memory usage: 1.1+ KB


## 데이터 전처리
* 데이터 타입 바꾸기
* 날짜 -> pandas datetime
* sn(관리번호) -> int

In [6]:
# Convert the date columns from 'YYYY-MM-DD' strings to pandas datetimes.
# Parsing each column directly avoids the temporary year/month/day columns
# and lets missing values (None/NaN) become NaT instead of failing on
# str.split.
for key in ['dataDate', 'clearDate', 'issueDate']:
    df[key] = pd.to_datetime(df[key], format='%Y-%m-%d')

df

Unnamed: 0,clearVal,sn,districtName,dataDate,issueVal,issueTime,clearDate,issueDate,moveName,clearTime,issueGbn,itemCode
0,72,3483,경남,2021-05-25,161,11:00,2021-05-25,2021-05-25,하동권역,14:00,주의보,PM10
1,81,3475,강원,2021-05-25,182,02:00,2021-05-25,2021-05-25,영동남부,13:00,주의보,PM10
2,94,3481,경남,2021-05-25,154,11:00,2021-05-25,2021-05-25,고성권역,14:00,주의보,PM10
3,76,3482,전남,2021-05-25,157,11:00,2021-05-25,2021-05-25,동부권역,14:00,주의보,PM10
4,90,3484,경남,2021-05-25,152,12:00,2021-05-25,2021-05-25,사천권역,14:00,주의보,PM10
5,96,3477,경기,2021-05-25,109,08:00,2021-05-25,2021-05-25,동부권,09:00,주의보,PM10
6,63,3486,경북,2021-05-25,182,14:00,2021-05-25,2021-05-25,울릉권역,15:00,주의보,PM10
7,62,3478,경남,2021-05-25,174,08:00,2021-05-25,2021-05-25,남해권역,15:00,주의보,PM10
8,80,3487,부산,2021-05-25,151,14:00,2021-05-25,2021-05-25,중부권역,16:00,주의보,PM10
9,66,3479,경남,2021-05-25,180,10:00,2021-05-25,2021-05-25,통영권역,15:00,주의보,PM10


In [7]:
# Display the dataDate column to check the result of the date conversion.
df['dataDate']

0   2021-05-25
1   2021-05-25
2   2021-05-25
3   2021-05-25
4   2021-05-25
5   2021-05-25
6   2021-05-25
7   2021-05-25
8   2021-05-25
9   2021-05-25
Name: dataDate, dtype: datetime64[ns]

In [8]:
# Convert the sn (management number) column from strings to integers.
# NOTE(review): the integer width produced by `int` is platform-dependent (the
# recorded run shows int32); pass an explicit dtype if a fixed width matters.
df['sn'] = df['sn'].astype(int)
df['sn']

0    3483
1    3475
2    3481
3    3482
4    3484
5    3477
6    3486
7    3478
8    3487
9    3479
Name: sn, dtype: int32

In [9]:
# Inspect column dtypes again to confirm the date and sn conversions.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   clearVal      10 non-null     object        
 1   sn            10 non-null     int32         
 2   districtName  10 non-null     object        
 3   dataDate      10 non-null     datetime64[ns]
 4   issueVal      10 non-null     object        
 5   issueTime     10 non-null     object        
 6   clearDate     10 non-null     datetime64[ns]
 7   issueDate     10 non-null     datetime64[ns]
 8   moveName      10 non-null     object        
 9   clearTime     10 non-null     object        
 10  issueGbn      10 non-null     object        
 11  itemCode      10 non-null     object        
dtypes: datetime64[ns](3), int32(1), object(8)
memory usage: 1.0+ KB
