# Web Scraping

# JSON format
- json: JavaScript Object Notation (자바 스크립트 객체 표기법)
- 데이터를 쉽게 '교환' 하고 '저장' 하기 위한 텍스트 기반의 데이터 교환 표준

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import json
from bs4 import BeautifulSoup

In [80]:
obj = """
{
    "name": "Kim",
    "places_lived": ["Seoul", "Korea"],
    "pet": null, 
    "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"}]
}
"""

In [81]:
type(obj)

str

In [82]:
r = json.loads(obj)   # decoding (json --> dict)
type(r)

dict

In [83]:
json.dumps(r)    # encoding (dict --> json)

'{"name": "Kim", "places_lived": ["Seoul", "Korea"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}]}'

In [84]:
# Exercise 2 (from https://rfriend.tistory.com/474)
py_data = {

    "1.FirstName": "Gildong",
    "2.LastName": "Hong",
    "3.Age": 20,
    "4.University": "Hangook University",
    "5.Courses": [
        {
            "Classes": [
                "Probability",
                "Generalized Linear Model",
                "Categorical Data Analysis"
            ],
            "Major": "Statistics"
        },
        {
            "Classes": [
                "Data Structure",
                "Programming",
                "Algorithms"
            ],
            "Minor": "ComputerScience"
        }
    ]
}

In [85]:
type(py_data)

dict

In [86]:
py_data.keys()
# py_data.items()

dict_keys(['1.FirstName', '2.LastName', '3.Age', '4.University', '5.Courses'])

In [87]:
py_data.values()

dict_values(['Gildong', 'Hong', 20, 'Hangook University', [{'Classes': ['Probability', 'Generalized Linear Model', 'Categorical Data Analysis'], 'Major': 'Statistics'}, {'Classes': ['Data Structure', 'Programming', 'Algorithms'], 'Minor': 'ComputerScience'}]])

In [88]:
py_data['5.Courses']

[{'Classes': ['Probability',
   'Generalized Linear Model',
   'Categorical Data Analysis'],
  'Major': 'Statistics'},
 {'Classes': ['Data Structure', 'Programming', 'Algorithms'],
  'Minor': 'ComputerScience'}]

In [89]:
import json
json_str = json.dumps(py_data)
type(json_str), json_str

(str,
 '{"1.FirstName": "Gildong", "2.LastName": "Hong", "3.Age": 20, "4.University": "Hangook University", "5.Courses": [{"Classes": ["Probability", "Generalized Linear Model", "Categorical Data Analysis"], "Major": "Statistics"}, {"Classes": ["Data Structure", "Programming", "Algorithms"], "Minor": "ComputerScience"}]}')

In [90]:
pd.Series(py_data)

1.FirstName                                               Gildong
2.LastName                                                   Hong
3.Age                                                          20
4.University                                   Hangook University
5.Courses       [{'Classes': ['Probability', 'Generalized Line...
dtype: object

In [91]:
pd.DataFrame(py_data)

Unnamed: 0,1.FirstName,2.LastName,3.Age,4.University,5.Courses
0,Gildong,Hong,20,Hangook University,"{'Classes': ['Probability', 'Generalized Linea..."
1,Gildong,Hong,20,Hangook University,"{'Classes': ['Data Structure', 'Programming', ..."


In [92]:
pd.DataFrame(py_data).iloc[:,-1]

0    {'Classes': ['Probability', 'Generalized Linea...
1    {'Classes': ['Data Structure', 'Programming', ...
Name: 5.Courses, dtype: object

In [93]:
pd.DataFrame.from_dict(py_data)

Unnamed: 0,1.FirstName,2.LastName,3.Age,4.University,5.Courses
0,Gildong,Hong,20,Hangook University,"{'Classes': ['Probability', 'Generalized Linea..."
1,Gildong,Hong,20,Hangook University,"{'Classes': ['Data Structure', 'Programming', ..."


In [94]:
pd.json_normalize(py_data)

Unnamed: 0,1.FirstName,2.LastName,3.Age,4.University,5.Courses
0,Gildong,Hong,20,Hangook University,"[{'Classes': ['Probability', 'Generalized Line..."


In [95]:
pd.json_normalize(py_data, "5.Courses")

Unnamed: 0,Classes,Major,Minor
0,"[Probability, Generalized Linear Model, Catego...",Statistics,
1,"[Data Structure, Programming, Algorithms]",,ComputerScience


In [101]:
pd.json_normalize(py_data, "5.Courses", ['3.Age'])

Unnamed: 0,Classes,Major,Minor,3.Age
0,"[Probability, Generalized Linear Model, Catego...",Statistics,,20
1,"[Data Structure, Programming, Algorithms]",,ComputerScience,20


# json_normalize (data, record_path, meta, ...)
- data: dict or list of dict
- record_path : decode 해줘야할 열 지정 [{}, {}, {} ....]
- meta : decode 하는 열과 동일 차원에 존재하는 열들 중 데이터 프레임에 포함시킬 열 선택

In [102]:
# JSON exercise 3
# from https://pandas.pydata.org/docs/reference/api/pandas.json_normalize.html
# (the original note pointed at the older pandas.io.json.json_normalize page;
#  the cells below call the function as pd.json_normalize)
# A list of two state records; each holds a nested 'info' dict and a
# 'counties' list of dicts, which the following cells flatten.
data = [{'state': 'Florida', 
         'shortname': 'FL', 
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345},
                      {'name': 'Broward', 'population': 40000},
                      {'name': 'Palm Beach', 'population': 60000}]},
        {'state': 'Ohio',
         'shortname': 'OH',
         'info': {'governor': 'John Kasich'},
         'counties': [{'name': 'Summit', 'population': 1234},
                      {'name': 'Cuyahoga', 'population': 1337}]}]

In [103]:
type(data), len(data)

(list, 2)

In [104]:
pd.json_normalize(data)

Unnamed: 0,state,shortname,counties,info.governor
0,Florida,FL,"[{'name': 'Dade', 'population': 12345}, {'name...",Rick Scott
1,Ohio,OH,"[{'name': 'Summit', 'population': 1234}, {'nam...",John Kasich


In [21]:
pd.json_normalize(data, 'counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [105]:
pd.json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])

Unnamed: 0,name,population,state,shortname,info.governor
0,Dade,12345,Florida,FL,Rick Scott
1,Broward,40000,Florida,FL,Rick Scott
2,Palm Beach,60000,Florida,FL,Rick Scott
3,Summit,1234,Ohio,OH,John Kasich
4,Cuyahoga,1337,Ohio,OH,John Kasich


# HTML Parsing
- Before working through this example, open and run some of the example HTML files in this directory.

In [107]:
from bs4 import BeautifulSoup

In [108]:
html_text = """
<html>
<body>
  <h1> reading web page with python </h1>
     <p> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>
"""

In [109]:
soup = BeautifulSoup(html_text, 'html.parser')
soup


<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [110]:
type(soup)

bs4.BeautifulSoup

In [111]:
soup.h1

<h1> reading web page with python </h1>

In [112]:
soup.h1.string.strip()

'reading web page with python'

In [113]:
soup.p

<p> page analysis </p>

In [114]:
soup.p.next_sibling.next_sibling

<p> page alignment </p>

In [115]:
soup.td.next_sibling.next_sibling

<td><p>more text</p></td>

In [116]:
print(soup.td.next_sibling, soup.td.next_sibling.string)

<td></td> None


In [117]:
html_text2 = """
<html>
<body>
  <h1 id="title"> reading web page with python </h1>
     <p id="body"> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
     <ul>
         <li><a href = "http://www.naver.com"> naver</a></li>
         <li><a href = "http://www.daum.net"> daum</a></li>
     </ul>
  <div id="xxx">
    <h1> Wiki-books store </h1>
    <ul class="item">
      <li> introduction to game design </li>
      <li> introduction to python </li>
      <li> introduction to web design </li>
    </ul>
  </div>
</body>
</html>
"""

In [118]:
soup = BeautifulSoup(html_text2, 'html.parser')

In [119]:
soup


<html>
<body>
<h1 id="title"> reading web page with python </h1>
<p id="body"> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
<ul>
<li><a href="http://www.naver.com"> naver</a></li>
<li><a href="http://www.daum.net"> daum</a></li>
</ul>
<div id="xxx">
<h1> Wiki-books store </h1>
<ul class="item">
<li> introduction to game design </li>
<li> introduction to python </li>
<li> introduction to web design </li>
</ul>
</div>
</body>
</html>

### access by tags

In [120]:
soup.find(id='title')

<h1 id="title"> reading web page with python </h1>

In [121]:
soup.find(id='body').string

' page analysis '

In [122]:
soup.find_all('p')

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [123]:
soup.find_all('li')

[<li><a href="http://www.naver.com"> naver</a></li>,
 <li><a href="http://www.daum.net"> daum</a></li>,
 <li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [124]:
soup.find_all('li')[0]

<li><a href="http://www.naver.com"> naver</a></li>

In [125]:
soup.find_all('li')[0].string, soup.find_all('li')[0].attrs

(' naver', {})

In [126]:
soup.find_all('a')[0]

<a href="http://www.naver.com"> naver</a>

In [127]:
soup.find_all('a')[0].string, soup.find_all('a')[0].attrs

(' naver', {'href': 'http://www.naver.com'})

In [128]:
# Print "<link text> --> <URL>" for every anchor that carries an href.
# href=True restricts the search to <a> tags that actually have the
# attribute, so aa['href'] cannot raise KeyError on an anchor without one.
for aa in soup.find_all('a', href=True):
    href = aa['href']
    text = aa.string
    print(text, "-->", href)

 naver --> http://www.naver.com
 daum --> http://www.daum.net


### access by regular expression

In [129]:
import re
soup.find_all(re.compile("^p"))   # tags starting with a character 'p'

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [130]:
soup.find_all(re.compile("div" ))

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [131]:
soup.find_all(href=re.compile("^http://"))

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

### access by css (Cascading Style Sheets) selector

In [132]:
soup.select('h1')    # by tags

[<h1 id="title"> reading web page with python </h1>,
 <h1> Wiki-books store </h1>]

In [133]:
soup.select('#xxx')  # by id

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [134]:
soup.select('.item') # by class name

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [135]:
soup.select('div .item')  # descendant combinator: elements with class "item" nested inside a div (not 'div.item')

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [136]:
soup.select_one("#xxx > ul > li")  # hierarchy (child)

<li> introduction to game design </li>

In [137]:
soup.select("div li")   # hierarchy (descendants): every li nested anywhere inside a div

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [138]:
text = '<p class="body strikeout"></p>'

css_soup = BeautifulSoup(text, 'html.parser')
css_soup.find_all("p", class_="strikeout")  # can have multiple values for a class

[<p class="body strikeout"></p>]

In [139]:
css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

In [140]:
# If you want to search for tags that match two or more CSS classes, 
# you should use a CSS selector:
css_soup.select("p.body.strikeout") 

[<p class="body strikeout"></p>]

# Example from JOBKOREA
- newly added in 2022.9.21
- www.jobkorea.co.kr (search for 'data science', 'seoul', 'python')

In [141]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Search for 'data science', 'seoul', and 'python' on JobKorea.
# NOTE(review): only 'data science' is visible in the query string (stext);
# the other filters are presumably encoded in dkwrd - verify on the site.
url = 'https://www.jobkorea.co.kr/Search/?stext=data%20science&dkwrd=10001012776'
# Without a timeout requests waits indefinitely, which would hang the cell
# if the server never answers.
link = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx answer instead of parsing an error page as results.
link.raise_for_status()
soup = BeautifulSoup(link.text, 'html.parser')

In [142]:
link.status_code

200

In [143]:
soup.select('.recruit-info')

[<div class="recruit-info" style="">
 <div class="list-filter-wrap">
 <p class="filter-text">총 <strong class="dev_tot">11</strong>건</p>
 <div class="btn-wrap">
 <button class="btn-list-filter pseudo-icn lazyBg blk-arw" type="button">정확도순</button>
 <div class="layer-comn layer-filter">
 <ul class="dev_list_filter" data-key="Ord">
 <li><button class="filter " data-val="ExactDesc" type="button">정확도순</button></li>
 <li><button class="filter " data-val="RegDtDesc" type="button">등록일순</button></li>
 <li><button class="filter " data-val="EditDtDesc" type="button">최신업데이트순</button></li>
 <li><button class="filter " data-val="ApplyCloseDtAsc" type="button">마감임박순</button></li>
 <li><button class="filter " data-val="ReadCntDesc" type="button">조회수순</button></li>
 <li><button class="filter " data-val="ApplicantDesc" type="button">지원자순</button></li>
 </ul>
 </div>
 </div>
 </div>
 <div class="lists">
 ﻿
 
 
 <div class="lists-cnt dev_list" total-count="11">
 <div class="list-default">
 <ul class="clea

In [144]:
len(soup.select('.recruit-info'))

1

In [145]:
ss = soup.select('.recruit-info')[0].select('.lists')
len(ss)

1

In [146]:
len(ss[0].select('.post'))

21

In [147]:
ss[0].select('.post')[10]

<div class="post">
<div class="post-list-corp">
<a class="name dev_view" href="/Recruit/GI_Read/39911204?Oem_Code=C1&amp;logpath=1&amp;stext=data science&amp;listno=11" nav-src="/Search/_ContentsGIRead?Gno=39911204&amp;Mem_Type_Code=C&amp;Mem_Sys_No=27127417" onclick="GA_Virtual_Dimension($(this).closest('.list-post[data-gno=39911204]').data('gainfo'));GA_Virtual('홈&gt;통합검색&gt;공고뷰', $(this).closest('.list-post[data-gno=39911204]').data('gavirturl'));GA_Event('통합검색_PC', '채용정보', '클릭');" title="지니너스㈜">지니너스㈜</a>
<button class="btn-fav pseudo-icn-old lazyBg heart dev_favor-27127417 " onclick="GA_Event('통합검색_PC', '채용정보','관심_' + ($(this).hasClass('on')?'해제':'등록'));" type="button">관심기업</button>
</div>
<div class="post-list-info">
<a class="title dev_view" href="/Recruit/GI_Read/39911204?Oem_Code=C1&amp;logpath=1&amp;stext=data science&amp;listno=11" nav-src="/Search/_ContentsGIRead?Gno=39911204&amp;Mem_Type_Code=C&amp;Mem_Sys_No=27127417" onclick="GA_Virtual_Dimension($(this).closest('.list-po

In [148]:
ss[0].select('.post')[0].select('.post-list-info')[0].a.text.strip()

'[모아소프트] AI/Data Science연구소 소프트웨어 개발인력 구인'

In [149]:
# Pull the five fields of the first posting: the company name sits in the
# '.post-list-corp' box, everything else in the '.post-list-info' box.
first_post = ss[0].select('.post')[0]
corp_box = first_post.select('.post-list-corp')[0]
info_box = first_post.select('.post-list-info')[0]

corp = corp_box.a.text
title = info_box.a.text.strip()   # the title link is padded with whitespace
loc = info_box.select('.loc.long')[0].text
date = info_box.select('.date')[0].text
etc = info_box.select('.etc')[0].text

In [150]:
corp, title, loc, date, etc

('㈜모아소프트',
 '[모아소프트] AI/Data Science연구소 소프트웨어 개발인력 구인',
 '서울 송파구',
 '상시채용',
 '소프트웨어개발, 솔루션, C, C++, Java, Python')

In [151]:
# Collect company, title, location, deadline and keyword text for every
# posting found under '.recruit-info' > '.lists'.
ss = soup.select('.recruit-info')[0].select('.lists')
post = ss[0].select('.post')
print(f"There are a total of {len(post)} job postings.")

corps, titles, locs, dates, etcs = [], [], [], [], []
for item in post:
    corp_box = item.select_one('.post-list-corp')
    info_box = item.select_one('.post-list-info')
    if corp_box is None or info_box is None:
        continue

    fields = (
        corp_box.a,
        info_box.a,
        info_box.select_one('.loc.long'),
        info_box.select_one('.date'),
        info_box.select_one('.etc'),
    )
    # Skip a posting that lacks any of the five elements rather than letting
    # one incomplete entry abort the whole scrape; appending only complete
    # rows also keeps the five lists the same length.
    if any(field is None for field in fields):
        continue

    corp_link, title_link, loc_tag, date_tag, etc_tag = fields
    corps.append(corp_link.text)
    titles.append(title_link.text.strip())
    locs.append(loc_tag.text)
    dates.append(date_tag.text)
    etcs.append(etc_tag.text)

There are a total of 21 job postings.


In [152]:
# Build the table straight from the five scraped lists, one per column.
# A dict of columns keeps each list as it is, whereas np.c_ first copies
# everything into a single fixed-width string array just to attach names.
pd.DataFrame({
    "corps": corps,
    "titles": titles,
    "locs": locs,
    "dates": dates,
    "etcs": etcs,
})

Unnamed: 0,corps,titles,locs,dates,etcs
0,㈜모아소프트,[모아소프트] AI/Data Science연구소 소프트웨어 개발인력 구인,서울 송파구,상시채용,"소프트웨어개발, 솔루션, C, C++, Java, Python"
1,㈜엔씨소프트,[엔씨소프트]빅데이터 플랫폼 관리도구(웹) 개발자 모집,경기 성남시,~11/25(금),"프로그램개발, RDBMS, 3D온라인게임, 게임개발, 게임기획, 게임디자인, 게임소..."
2,지식시스템㈜ KSTEC,[KSTEC] Optimization & AI 기술융합 신입 컨설턴트 모집,경기 성남시 외,~11/30(수),"ERP, SCM, 시스템분석설계, APS, DB, 소프트웨어개발, 솔루션, 데이터베..."
3,지식시스템㈜ KSTEC,[KSTEC] Prescriptive Analytics 컨설턴트 신입 채용,경기 성남시 외,~11/30(수),"ERP, SCM, 시스템분석설계, APS, DB, 데이터베이스, DBA, 응용프로그..."
4,에타일렉트로닉스,"인공지능(AI), 데이터 사이언스 (알고리즘 개발) 팀원 모집",서울 용산구,상시채용,"전자회로, Firmware, Hardware, PCB, SMPS, Python, 인..."
5,㈜넷타겟,(주)넷타겟 각 부문별 수시채용,대전 유성구 외,상시채용,"바이오, 기획, 전략, 마케팅, 마케팅기획, 바이럴마케팅, 국내영업, 해외영업, 웹..."
6,"㈜크레버스(Creverse, Inc.)",[경력/정규직] 코딩 콘텐츠 기획 및 개발자 모집,서울 강남구,상시채용,"어학원, 컨텐츠개발, C, Java, Python, 컨텐츠개발, 프로그램강의"
7,SAS Software Korea Ltd Co,Recruitment Analytical Consultant (SAS),서울 서초구,상시채용,"SAS, 솔루션, BI, BigData, DataMining, DW, Python,..."
8,㈜폼즈,[신입/경력] python을 활용한 Ai개발 및 프론트 백앤드개발자 모집,서울 구로구 외,상시채용,"소프트웨어개발, Python"
9,지니너스㈜,지니너스㈜ 신입 및 경력 수시채용,서울 송파구,~02/06(월),"세포 배양, DNA, GMP, NGS, RNA Prep, 면역 세포 분리, 단일세포..."


# Exercise
- old example
- They changed the webpage, so this code no longer works (as of 2022/9).
- Showing this code just for your reference.

In [153]:
'''
# put altogether
url = 'https://kr.indeed.com/jobs?q=data+science&l=%EC%84%9C%EC%9A%B8%ED%8A%B9%EB%B3%84%EC%8B%9C'
link = requests.get(url)
soup = BeautifulSoup(link.text, 'html.parser')

job_elems = soup.select('.resultContent') # class

for i in job_elems:
    title = i.find('h2')
    company = i.find('span', class_='companyName')
    location = i.find('div', class_='companyLocation')
    
    if None in (title, company, location):
        continue
        
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip())
'''

"\n# put altogether\nurl = 'https://kr.indeed.com/jobs?q=data+science&l=%EC%84%9C%EC%9A%B8%ED%8A%B9%EB%B3%84%EC%8B%9C'\nlink = requests.get(url)\nsoup = BeautifulSoup(link.text, 'html.parser')\n\njob_elems = soup.select('.resultContent') # class\n\nfor i in job_elems:\n    title = i.find('h2')\n    company = i.find('span', class_='companyName')\n    location = i.find('div', class_='companyLocation')\n    \n    if None in (title, company, location):\n        continue\n        \n    print(title.text.strip())\n    print(company.text.strip())\n    print(location.text.strip())\n"

In [154]:
# .text and .string
# Four <td> shapes to compare the two accessors: plain text, empty,
# a single nested <p>, and text mixed with a nested <p>.
# (A stray '"' after the last </td> was removed; it added an unintended
# text node to <body>.)
html_text = """
<html>
<body>
  <td>some text</td>
  <td></td>
  <td><p>more text</p></td>
  <td>even <p>more text</p></td>
</body>
</html>
"""
soup = BeautifulSoup(html_text, 'html.parser')

In [155]:
soup.find_all('td')

[<td>some text</td>,
 <td></td>,
 <td><p>more text</p></td>,
 <td>even <p>more text</p></td>]

In [156]:
# .string: per the output below, the empty <td> and the <td> that mixes
# text with a <p> both give None; the other two give their text.
for cell in soup.find_all('td'):
    print(cell.string)

some text
None
more text
None


In [157]:
# .text: per the output below, every <td> gives a string - empty for the
# empty cell, and the combined text where a <p> is nested.
for cell in soup.find_all('td'):
    print(cell.text)

some text

more text
even more text
