# JSON format

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import json

In [2]:
obj = """
{
    "name": "Kim",
    "places_lived": ["Seoul", "Korea"],
    "pet": null, 
    "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"}]
}
"""

In [3]:
type(obj)
obj

'\n{\n    "name": "Kim",\n    "places_lived": ["Seoul", "Korea"],\n    "pet": null, \n    "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"}]\n}\n'

In [4]:
r = json.loads(obj)   # decoding (json --> dict)
type(r)

dict

In [5]:
json.dumps(r)    # encoding (dict --> json)

'{"name": "Kim", "places_lived": ["Seoul", "Korea"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}]}'

## practical example
- naver real-time search ranking
- do not work any more since 2021.2

In [3]:
import requests
from pandas.io.json import json_normalize

r = requests.get('http://rank.search.naver.com/rank.js')

# pd.DataFrame(pd.DataFrame(pd.DataFrame(json.loads(r.text)).data).data)

jj = json_normalize(json.loads(r.text), record_path=['data', 'data'])

for i in range(len(jj)):
    ranks = (jj.iloc[i])['rank']
    title = (jj.iloc[i])['keyword']
    print(ranks, " : ", title)

# HTML Parsing
- before you do this example, try to see and run some example HTML files which are in this directory

In [None]:
# !pip install bs4

In [114]:
from bs4 import BeautifulSoup

In [115]:
html_text = """
<html>
<body>
  <h1> reading web page with python </h1>
     <p> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>
"""

In [116]:
soup = BeautifulSoup(html_text, 'html.parser')
soup


<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [117]:
soup.h1

<h1> reading web page with python </h1>

In [118]:
print(soup.p)
print(soup.p.next_sibling.next_sibling)
print(soup.td, soup.td.text, soup.td.string)
print(soup.td.next_sibling, soup.td.next_sibling.string)

<p> page analysis </p>
<p> page alignment </p>
<td>some text</td> some text some text
<td></td> None


In [119]:
html_text2 = """
<html>
<body>
  <h1 id="title"> reading web page with python </h1>
     <p id="body"> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
     <ul>
         <li><a href = "http://www.naver.com"> naver</a></li>
         <li><a href = "http://www.daum.net"> daum</a></li>
     </ul>
  <div id="xxx">
    <h1> Wiki-books store </h1>
    <ul class="item">
      <li> introduction to game design </li>
      <li> introduction to python </li>
      <li> introduction to web design </li>
    </ul>
  </div>
</body>
</html>
"""

In [120]:
soup = BeautifulSoup(html_text2, 'html.parser')

### access by tags

In [121]:
soup.find(id='title')

<h1 id="title"> reading web page with python </h1>

In [122]:
soup.find(id='body').string

' page analysis '

In [123]:
soup.find_all('p')

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [124]:
soup.find_all('li')[0]

<li><a href="http://www.naver.com"> naver</a></li>

In [125]:
soup.find_all('li')[0].string, soup.find_all('li')[0].attrs

(' naver', {})

In [126]:
soup.find_all('a')[0]

<a href="http://www.naver.com"> naver</a>

In [127]:
soup.find_all('a')[0].string, soup.find_all('a')[0].attrs

(' naver', {'href': 'http://www.naver.com'})

In [128]:
for aa in soup.find_all('a'):
    href = aa.attrs['href']
    text = aa.string
    print (text, "-->", href)

 naver --> http://www.naver.com
 daum --> http://www.daum.net


### access by regular expression

In [129]:
import re
soup.find_all(re.compile("^p"))   # tags starting with a character 'p'

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [130]:
soup.find_all(re.compile("div" ))

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [131]:
soup.find_all(href=re.compile("^http://"))

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

### access by css selector

In [132]:
soup.select('h1')    # by tags

[<h1 id="title"> reading web page with python </h1>,
 <h1> Wiki-books store </h1>]

In [133]:
soup.select('#xxx')  # by id

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [134]:
soup.select('.item') # by class name

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [135]:
soup.select('div .item')  # multi-components(tag=div, class=item)

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [35]:
soup.select_one("#xxx > ul > li")  # hierarchy (child)

<li> introduction to game design </li>

In [36]:
soup.select("div li")   # hierarchy (div tag >>> ul tag) (descendants)

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [37]:
text = '<p class="body strikeout"></p>'

css_soup = BeautifulSoup(text, 'html.parser')
css_soup.find_all("p", class_="strikeout")  # can have multiple values for a class

[<p class="body strikeout"></p>]

In [38]:
css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

In [39]:
# If you want to search for tags that match two or more CSS classes, 
# you should use a CSS selector:
css_soup.select("p.body.strikeout") 

[<p class="body strikeout"></p>]

# practical example
- extract job information from www.monster.com
- they changed the web page and now it seems be be dynamic style. So, it does not return HTML source as you see in the web site.
- does not work any more.

In [153]:
url = 'https://www.monster.com/jobs/search?q=Data+Scientist&where=California'
page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='SearchResults')
job_elems = results.find_all('section', class_='card-content')
len(job_elems)
job_elems[0]
print(job_elems[0].find('h2', class_='title').text.strip())
job_elems[0].find('div', class_='company').text.strip()
job_elems[0].find('div', class_='location').text.strip()

for i in job_elems:
    title = i.find('h2', class_='title')
    company = i.find('div', class_='company')
    location = i.find('div', class_='location')
    
    if None in (title, company, location):
        continue
        
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip())
    print()

## Another example from Indeed
- newly added in 2021.5.10
- kr.indeed.com (search for 'data science' in '서울특별시')

In [189]:
import requests
from bs4 import BeautifulSoup

# Create a soup object
url = 'https://kr.indeed.com/%EC%B7%A8%EC%97%85?q=data+science&l=%EC%84%9C%EC%9A%B8%ED%8A%B9%EB%B3%84%EC%8B%9C'
link = requests.get(url)
soup = BeautifulSoup(link.text, 'html.parser')

In [202]:
job_elems = soup.select('.jobsearch-SerpJobCard') # class
len(job_elems)

15

In [203]:
job_elems[0]

<div class="jobsearch-SerpJobCard unifiedRow row result" data-jk="4bd70fb8a220d90d" data-tn-component="organicJob" id="p_4bd70fb8a220d90d">
<h2 class="title">
<a class="jobtitle turnstileLink" data-tn-element="jobTitle" href="/rc/clk?jk=4bd70fb8a220d90d&amp;fccid=0bed8e17bc113980&amp;vjs=3" id="jl_4bd70fb8a220d90d" onclick="setRefineByCookie([]); return rclk(this,jobmap[0],true,0);" onmousedown="return rclk(this,jobmap[0],0);" rel="noopener nofollow" target="_blank" title="[Janssen] Data Scientist">
[Janssen] <b>Data</b> Scientist</a>
</h2>
<div class="sjcl">
<div>
<span class="company">
<a class="turnstileLink" data-tn-element="companyName" href="/cmp/Johnson-&amp;-Johnson" onmousedown="this.href = appendParamsOnce(this.href, 'from=SERP&amp;campaignid=serp-linkcompanyname&amp;fromjk=4bd70fb8a220d90d&amp;jcid=08849387e791ebc6')" rel="noopener" target="_blank">
Johnson &amp; Johnson Family of Companies</a></span>
<span class="ratingsDisplay">
<a class="ratingNumber" data-tn-variant="cmp

In [204]:
job_elems[0].find('h2', class_='title').text.strip()

'[Janssen] Data Scientist'

In [206]:
job_elems[0].find('span', class_='company').text.strip()

'Johnson & Johnson Family of Companies'

In [207]:
job_elems[0].find('span', class_='location').text.strip()

'서울'

In [208]:
for i in job_elems:
    title = i.find('h2', class_='title')
    company = i.find('span', class_='company')
    location = i.find('span', class_='location')
    
    if None in (title, company, location):
        continue
        
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip())
    print()

[Janssen] Data Scientist
Johnson & Johnson Family of Companies
서울

Data Science Intern
Ace I&M Consulting
서울 중구

[Data Science Div - Business Insights De...
PUBG
서울

[정규직] AI Data Scientist
SK 씨앤씨
서울

[Data Science Div - PUBG Studio Analytic...
PUBG
서울

2021 Market Data Analyst - Seoul
Bloomberg
서울

Data Scientist - IBM Garage
IBM
서울

Data Analyst
IQVIA
서울

[Data Science Div] Data Engineer
PUBG
서울

[Data Science Div - Business Insights] R...
PUBG
서울

[정규직] Data Scientist
SK 씨앤씨
서울

[Data Science Div] Data Platform 기획자
PUBG
서울

Data Analytics SA
Amazon Web Services Korea LLC
서울

Head of Analytics & Data Science (Coupan...
new
Coupang
서울

Market Research team Intern
IQVIA
서울

