<a href="https://colab.research.google.com/github/rtajeong/DSAC_Lab1_2/blob/master/code8_json_Web_scraping_rev2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# JSON format

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import json

In [8]:
obj = """
{
    "name": "Kim",
    "places_lived": ["Seoul", "Korea"],
    "pet": null, 
    "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"}]
}
"""

In [9]:
type(obj)
obj

'\n{\n    "name": "Kim",\n    "places_lived": ["Seoul", "Korea"],\n    "pet": null, \n    "siblings": [{"name": "Scott", "age":25, "pet":"Zuko"}]\n}\n'

In [10]:
r = json.loads(obj)   # decoding (json --> dict)
type(r)

dict

In [11]:
json.dumps(r)    # encoding (dict --> json)

'{"name": "Kim", "places_lived": ["Seoul", "Korea"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}]}'

## practical example
- naver real-time search ranking
- do not work any more since 2021.2

In [12]:
import requests
from pandas.io.json import json_normalize

r = requests.get('http://rank.search.naver.com/rank.js')

# pd.DataFrame(pd.DataFrame(pd.DataFrame(json.loads(r.text)).data).data)

jj = json_normalize(json.loads(r.text), record_path=['data', 'data'])

for i in range(len(jj)):
    ranks = (jj.iloc[i])['rank']
    title = (jj.iloc[i])['keyword']
    print(ranks, " : ", title)

ConnectionError: ignored

# HTML Parsing
- before you do this example, try to see and run some example HTML files which are in this directory

In [None]:
# !pip install bs4

In [13]:
from bs4 import BeautifulSoup

In [14]:
html_text = """
<html>
<body>
  <h1> reading web page with python </h1>
     <p> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>
"""

In [15]:
soup = BeautifulSoup(html_text, 'html.parser')
soup


<html>
<body>
<h1> reading web page with python </h1>
<p> page analysis </p>
<p> page alignment </p>
<td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
</body>
</html>

In [16]:
soup.h1

<h1> reading web page with python </h1>

In [17]:
print(soup.p)
print(soup.p.next_sibling.next_sibling)
print(soup.td, soup.td.text, soup.td.string)
print(soup.td.next_sibling, soup.td.next_sibling.string)

<p> page analysis </p>
<p> page alignment </p>
<td>some text</td> some text some text
<td></td> None


In [18]:
html_text2 = """
<html>
<body>
  <h1 id="title"> reading web page with python </h1>
     <p id="body"> page analysis </p>
     <p> page alignment </p>
     <td>some text</td><td></td><td><p>more text</p></td><td>even <p>more text</p></td>
     <ul>
         <li><a href = "http://www.naver.com"> naver</a></li>
         <li><a href = "http://www.daum.net"> daum</a></li>
     </ul>
  <div id="xxx">
    <h1> Wiki-books store </h1>
    <ul class="item">
      <li> introduction to game design </li>
      <li> introduction to python </li>
      <li> introduction to web design </li>
    </ul>
  </div>
</body>
</html>
"""

In [19]:
soup = BeautifulSoup(html_text2, 'html.parser')

### access by tags

In [20]:
soup.find(id='title')

<h1 id="title"> reading web page with python </h1>

In [21]:
soup.find(id='body').string

' page analysis '

In [22]:
soup.find_all('p')

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [23]:
soup.find_all('li')[0]

<li><a href="http://www.naver.com"> naver</a></li>

In [24]:
soup.find_all('li')[0].string, soup.find_all('li')[0].attrs

(' naver', {})

In [25]:
soup.find_all('a')[0]

<a href="http://www.naver.com"> naver</a>

In [26]:
soup.find_all('a')[0].string, soup.find_all('a')[0].attrs

(' naver', {'href': 'http://www.naver.com'})

In [27]:
for aa in soup.find_all('a'):
    href = aa.attrs['href']
    text = aa.string
    print (text, "-->", href)

 naver --> http://www.naver.com
 daum --> http://www.daum.net


### access by regular expression

In [28]:
import re
soup.find_all(re.compile("^p"))   # tags starting with a character 'p'

[<p id="body"> page analysis </p>,
 <p> page alignment </p>,
 <p>more text</p>,
 <p>more text</p>]

In [29]:
soup.find_all(re.compile("div" ))

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [30]:
soup.find_all(href=re.compile("^http://"))

[<a href="http://www.naver.com"> naver</a>,
 <a href="http://www.daum.net"> daum</a>]

### access by css selector

In [31]:
soup.select('h1')    # by tags

[<h1 id="title"> reading web page with python </h1>,
 <h1> Wiki-books store </h1>]

In [32]:
soup.select('#xxx')  # by id

[<div id="xxx">
 <h1> Wiki-books store </h1>
 <ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>
 </div>]

In [33]:
soup.select('.item') # by class name

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [34]:
soup.select('div .item')  # multi-components(tag=div, class=item)

[<ul class="item">
 <li> introduction to game design </li>
 <li> introduction to python </li>
 <li> introduction to web design </li>
 </ul>]

In [35]:
soup.select_one("#xxx > ul > li")  # hierarchy (child)

<li> introduction to game design </li>

In [36]:
soup.select("div li")   # hierarchy (div tag >>> ul tag) (descendants)

[<li> introduction to game design </li>,
 <li> introduction to python </li>,
 <li> introduction to web design </li>]

In [37]:
text = '<p class="body strikeout"></p>'

css_soup = BeautifulSoup(text, 'html.parser')
css_soup.find_all("p", class_="strikeout")  # can have multiple values for a class

[<p class="body strikeout"></p>]

In [38]:
css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

In [39]:
# If you want to search for tags that match two or more CSS classes, 
# you should use a CSS selector:
css_soup.select("p.body.strikeout") 

[<p class="body strikeout"></p>]

# practical example
- extract job information from www.monster.com
- they changed the web page and now it seems be be dynamic style. So, it does not return HTML source as you see in the web site.
- does not work any more.

In [40]:
url = 'https://www.monster.com/jobs/search?q=Data+Scientist&where=California'
page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id='SearchResults')
job_elems = results.find_all('section', class_='card-content')
len(job_elems)
job_elems[0]
print(job_elems[0].find('h2', class_='title').text.strip())
job_elems[0].find('div', class_='company').text.strip()
job_elems[0].find('div', class_='location').text.strip()

for i in job_elems:
    title = i.find('h2', class_='title')
    company = i.find('div', class_='company')
    location = i.find('div', class_='location')
    
    if None in (title, company, location):
        continue
        
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip())
    print()

AttributeError: ignored

## Another example from Indeed
- newly added in 2021.7.15
- kr.indeed.com (search for 'data science' in '서울특별시')

In [86]:
import requests
from bs4 import BeautifulSoup

# Create a soup object
url = 'https://kr.indeed.com/jobs?q=data%20science&l=%EC%84%9C%EC%9A%B8%ED%8A%B9%EB%B3%84%EC%8B%9C'
link = requests.get(url)
soup = BeautifulSoup(link.text, 'html.parser')

In [87]:
job_elems = soup.select('.resultContent') # class
len(job_elems)

15

In [95]:
print(job_elems[1])

<td class="resultContent"><div class="heading4 color-text-primary singleLineTitle tapItem-gutter"><h2 class="jobTitle jobTitle-color-purple"><span title="Analyst, Data Science">Analyst, Data Science</span></h2></div><div class="heading6 company_location tapItem-gutter"><pre><span class="companyName"><a class="turnstileLink companyOverviewLink" data-tn-element="companyName" href="/cmp/Nielsen" rel="noopener" target="_blank">Nielsen</a></span><span class="ratingsDisplay withRatingLink"><a class="ratingLink" data-tn-variant="cmplinktst2" href="/cmp/Nielsen/reviews" rel="noopener" target="_blank" title="Nielsen reviews"><span aria-label="3.8 of stars rating" class="ratingNumber" role="img"><span aria-hidden="true">3.8</span><svg aria-hidden="true" class="starIcon" fill="none" height="12" viewbox="0 0 16 16" width="12" xmlns="http://www.w3.org/2000/svg"><path d="M8 12.8709L12.4542 15.5593C12.7807 15.7563 13.1835 15.4636 13.0968 15.0922L11.9148 10.0254L15.8505 6.61581C16.1388 6.36608 15.9847

In [90]:
job_elems[0].find('h2', class_='jobTitle').text.strip()
job_elems[0].find('h2').text.strip()

'[Data Science Div - Business Insights Dept] Business Data An...'

In [91]:
job_elems[0].find('span', class_='companyName').text.strip()

'PUBG'

In [92]:
job_elems[0].find('div', class_='companyLocation').text.strip()

'서울'

In [94]:
for i in job_elems:
    # title = i.find('h2', class_='jobTitle')
    title = i.find('h2')
    company = i.find('span', class_='companyName')
    location = i.find('div', class_='companyLocation')
    
    if None in (title, company, location):
        continue
        
    print(title.text.strip())
    print(company.text.strip())
    print(location.text.strip())
    print()

[Data Science Div - Business Insights Dept] Business Data An...
PUBG
서울

Analyst, Data Science
Nielsen
서울

Data Science Engineer
글로컬라이즈
서울 강남구

[Data Science Div] Data Engineer
PUBG
서울

newTech Sales for Data & AI
IBM
서울

Data Scientist
Boston Consulting Group
서울

2021 Market Data Analyst - Seoul
Bloomberg
서울

Senior Analyst - Data Science, Marketing & Sales
McKinsey & Company
서울

Data Analyst
IQVIA
서울

newMedical Science Partner
Roche
서울

Data Center Industry Technical Specialist
Intel
서울

[Data Science Div - Business Insights] Researcher
PUBG
서울

[Data Science Div - Mobile Analytics Dept] Data Analyst
PUBG
서울

[Janssen] Sr. Medical Science Liaison - Multiple Myeloma
Johnson & Johnson Family of Companies
서울

Medical Science Liaison (1Y contract)
GSK
서울

