# Web scraping with Python

### You will learn the basics of extracting data from websites and visualizing it with Python

### Python version

In [None]:
# Shell escape: print the version of the `python` executable found on the shell PATH.
!python --version

### 1. Install packages

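#### Open a new terminal and install 'requests', 'Beautiful Soup', 'lxml' (the HTML parser used below), 'pandas' and 'matplotlib' by typing:
#### conda install -c anaconda requests
#### conda install -c anaconda beautifulsoup4
#### conda install -c anaconda lxml
#### conda install pandas
#### conda install matplotlib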

### 2. Import packages

In [None]:
%matplotlib notebook
import re
from io import StringIO
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np # LET'S START WITH NUMPY!
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4 import __version__ as bs4__version__
'''
The Beautiful Soup package is used to parse the html, that is, take the raw html text and break it into Python objects
'''

In [None]:
# Report the installed versions of the scraping stack so a run can be reproduced.
print('requests: ' + requests.__version__, 'bs4: ' + bs4__version__,'pandas: ' + pd.__version__)

In [None]:
# Page to scrape. The commented-out assignment is an alternative target left disabled.
# url = 'https://www.federalreserve.gov/apps/fof/DisplayTable.aspx?t=f.105'
url = 'https://coinmarketcap.com/all/views/all/'
# Echo the address so the notebook output records which page was fetched.
print(url)

In [None]:
# Download the page. The timeout keeps the cell from hanging on an unresponsive server,
# and raise_for_status() stops here on an HTTP error instead of parsing an error page.
res  = requests.get(url, timeout=30)
res.raise_for_status()
# Parse the raw bytes into a navigable tree; the second argument 'lxml' is the html parser.
soup = BeautifulSoup(res.content, 'lxml')

In [None]:
# Show only the beginning of the parsed document: printing the whole page floods the
# notebook output and bloats the saved file.
print(soup.prettify()[:2000])

#### You can use the find_all() method of soup to extract useful HTML tags within a webpage. Examples of useful tags include < a > for hyperlinks, < table > for tables, < tr > for table rows, < th > for table headers, and < td > for table cells. The code below shows how to extract the first table on the page.

In [None]:
# Display the markup of the first <table> element; limit=1 stops the search at that match.
print(soup.find_all('table', limit=1)[0])

In [None]:
# Take the first <table> on the page and let pandas turn it into a DataFrame.
table = soup.find_all('table')[0]
# read_html expects a path, URL or file-like object; passing literal HTML as a string is
# deprecated, so the serialized table is wrapped in StringIO.
df    = pd.read_html(StringIO(str(table)))[0]

In [None]:
# Preview the first rows of the scraped table (head() defaults to five rows).
df.head()

In [None]:
# Inspect the dtype of the '% 7d' column before cleaning it.
# NOTE(review): presumably the 7-day percent change as text - confirm against the table header.
print(df['% 7d'].dtype)

In [None]:
# Remove the '?' / '%' decoration and convert to numbers; unparseable entries become NaN.
# astype(str) lets the cell be re-run on already-converted values and tolerates missing
# cells, both of which made the per-element x.strip() raise AttributeError.
df['% 7d']   = pd.to_numeric(df['% 7d'].astype(str).str.strip('?%'), errors='coerce')

In [None]:
# Bar chart of the '% 7d' values for index labels 0 through 20 (label slicing is inclusive).
df.loc[:20, '% 7d'].plot(kind='bar')

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the '% 7d' column.
df['% 7d'].describe()

### Homework - find the highest, lowest and average price

In [None]:
# Download the search page; without a timeout requests may wait forever on a stalled server.
r    = requests.get("http://newyork.craigslist.org/search/aap", timeout=30)
# Fail loudly on an HTTP error rather than parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
price_spans = soup.select("span.result-price")

# Price labels look like "$1,200": drop the currency sign and thousands separators before
# converting, and skip labels that do not reduce to a plain integer.
prices = []
for span in price_spans:
    digits = span.text.strip().lstrip('$').replace(',', '')
    if digits.isdigit():
        prices.append(int(digits))

# max()/min() and the average are undefined for an empty list, so guard that case.
if prices:
    print('Highest price: ${}'.format(max(prices)))
    print('Lowest price: ${}'.format(min(prices)))
    print('Average price: ${}'.format(sum(prices)/len(prices)))
else:
    print('No prices found on the page')

In [None]:
# Plot the scraped prices in page order, labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(prices)
# The trailing semicolon suppresses the text repr of the return value in the cell output.
ax.set(title='Scraped listing prices', xlabel='Listing (page order)', ylabel='Price ($)');

In [None]:
base_url = 'https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&SearchText='
query    = 'microcontroller'

# URL-encode the search term so queries containing spaces, '&' or '#' still produce a valid
# URL; the timeout keeps the cell from hanging on an unresponsive server.
r    = requests.get(base_url + quote_plus(query), timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# NOTE(review): attrs is a set, not a dict - Beautiful Soup appears to treat a non-dict
# attrs value as a CSS class filter; confirm against the bs4 documentation.
price_spans = soup.find_all("div", attrs={"price"})
print(len(price_spans))

price_mins, price_maxs = [], []
for span in price_spans:
    # Raw string: '\$' and '\d' are invalid escapes in a normal string literal.
    min_max_price = [float(x) for x in re.findall(r'\$(\d+\.\d+)', span.text)]
    # Pad with NaN so a div with one price, or none at all, still contributes exactly one
    # entry to each list; otherwise indexing raises IndexError and the lists lose alignment.
    min_max_price += [np.nan] * (2 - len(min_max_price))
    price_mins.append( min_max_price[0] )
    price_maxs.append( min_max_price[1] )

In [None]:
# NOTE(review): attrs is a set, not a dict - Beautiful Soup appears to treat a non-dict
# attrs value as a CSS class filter; confirm against the bs4 documentation.
min_spans = soup.find_all("div", attrs={"min-order"})
print(len(min_spans))

min_order = []
for span in min_spans:
    # First run of digits in the text (raw string avoids the invalid '\d' escape).
    first_number = re.search(r'\d+', span.text)
    # Keep the digits as text, as before; NaN marks a div without any number so the list
    # keeps one entry per div instead of raising IndexError.
    min_order.append( first_number.group() if first_number else np.nan )

In [None]:
# Collect the <h2> headings selected by the "title" filter and report how many were found.
name_spans = soup.find_all("h2", attrs={"title"})
print(len(name_spans))

# One whitespace-trimmed text entry per heading.
company_names = [heading.text.strip() for heading in name_spans]


In [None]:
href_spans = soup.find_all("div", attrs={"stitle util-ellipsis"})
print(len(href_spans))

company_hrefs = []
for href in href_spans:
    # Read the attribute from the parsed tag instead of regex-matching serialized HTML:
    # the serialized form entity-escapes '&' as '&amp;', which corrupts the URL.
    link = href.find('a', href=True)
    # An empty string keeps one entry per div when no link is present (previously IndexError).
    company_hrefs.append( link['href'].strip() if link is not None else '' )
    

In [None]:
# Assemble the scraped lists column by column. Building from a dict keeps each column's own
# dtype (np.array coerced every value to a string) and makes pandas raise a clear error if
# the lists differ in length.
df = pd.DataFrame({'product_details': company_names,
                   'company_site':    company_hrefs,
                   'moq':             min_order,
                   'price_min':       price_mins,
                   'price_max':       price_maxs},
             columns=['product_details','company_site','moq','price_min','price_max'])

# DataFrame.iteritems() was removed in pandas 2.0; loop over the numeric column names directly.
for k in ['moq','price_min','price_max']:
    # Entries that cannot be parsed as numbers become NaN.
    df[k] = pd.to_numeric(df[k], errors='coerce')
        
def make_clickable(val):
    """Return an HTML anchor that links to ``val`` and shows ``val`` as its text.

    The anchor uses target="_blank", so the link opens in a new window or tab.
    """
    # NOTE(review): val is interpolated without HTML-escaping; values containing
    # quotes or angle brackets would break or inject markup.
    anchor_template = '<a target="_blank" href="{}">{}</a>'
    return anchor_template.format(val, val)

# Display the table with the 'company_site' column formatted through make_clickable.
df.style.format(formatter={'company_site': make_clickable})

In [None]:
# Series.plot() draws on the current axes whenever a figure is already open, and the
# interactive notebook backend keeps earlier figures open, so use a dedicated figure
# instead of drawing over a previous cell's plot.
price_fig, price_ax = plt.subplots()
# legend=True labels each line with its column name so the two curves can be told apart.
df['price_max'].plot(ax=price_ax, legend=True)
df['price_min'].plot(ax=price_ax, legend=True)
# The trailing semicolon suppresses the text repr of the return value in the cell output.
price_ax.set(xlabel='Listing (page order)', ylabel='Price ($)');