# Using Python to access web data (Exercises)

### 1.- Retrieve data using requests

In [79]:
import requests
import re

def count_numbers(text):
    """Return the sum of every run of ASCII digits found in ``text``.

    Each maximal run of the characters 0-9 is read as one non-negative
    integer; signs and decimal points are not interpreted. The result is 0
    when ``text`` contains no digits.
    """
    digit_runs = re.findall('[0-9]+', text)
    return sum(map(int, digit_runs))

# Download each exercise file (sample data first, then the assignment data)
# and print the sum of the numbers embedded in it.
for data_url in (
    "https://py4e-data.dr-chuck.net/regex_sum_42.txt",
    "https://py4e-data.dr-chuck.net/regex_sum_383060.txt",
):
    # requests never times out by default, so bound the wait explicitly.
    text = requests.get(data_url, timeout=30)
    # Fail loudly on a 4xx/5xx reply instead of summing digits from an error page.
    text.raise_for_status()
    print(count_numbers(text.text))

445833
414217


### 2.- Using a socket to request data

In [80]:
import codecs
import socket

# Decode incrementally: a multi-byte UTF-8 character may be split across two
# recv() chunks, which would make a per-chunk bytes.decode() raise.
decoder = codecs.getincrementaldecoder('utf-8')()

# The with-block closes the socket even if the request fails part-way through.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode()
    # sendall() keeps writing until the whole request is out; send() may stop early.
    mysock.sendall(cmd)

    while True:
        data = mysock.recv(512)
        if len(data) < 1:
            # An empty read means the server has closed the connection.
            break
        print(decoder.decode(data), end='')

# Flush anything the decoder is still holding back.
print(decoder.decode(b'', final=True), end='')


HTTP/1.1 200 OK
Date: Mon, 20 Apr 2020 18:40:44 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "1d3-54f6609240717"
Accept-Ranges: bytes
Content-Length: 467
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

Why should you learn to write programs?

Writing programs (or programming) is a very creative 
and rewarding activity.  You can write programs for 
many reasons, ranging from making your living to solving
a difficult data analysis problem to having fun to helping
someone else solve a problem.  This book assumes that 
everyone needs to know how to program, and that once 
you know how to program you will figure out what you want 
to do with your newfound skills.  


In [81]:
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file

from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors.
# NOTE: this switches off certificate and hostname verification, which is only
# acceptable for a throwaway exercise. check_hostname has to be cleared before
# verify_mode is set to CERT_NONE.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'http://py4e-data.dr-chuck.net/comments_42.html'
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the span tags, search them for numbers and print the total
def count_numbers(text):
    """Add up all the unsigned integers that appear in ``text``.

    A number is any unbroken run of the ASCII digits 0-9. Returns 0 if
    ``text`` has no digits at all.
    """
    total = 0
    for digits in re.findall('[0-9]+', text):
        total += int(digits)
    return total

# Total the numbers shown inside every <span> element of the parsed page.
tags = soup('span')
count = 0
for tag in tags:
    # get_text() yields only the text inside the element. tag.decode() would
    # serialise the whole tag, so digits in attribute values would be summed too.
    count += count_numbers(tag.get_text())
print(count)

2553


### 3.- Using urllib to retrieve data and Beautiful Soup to parse HTML data

In [84]:
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file

from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors.
# NOTE: this switches off certificate and hostname verification, which is only
# acceptable for a throwaway exercise. check_hostname has to be cleared before
# verify_mode is set to CERT_NONE.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'http://py4e-data.dr-chuck.net/comments_383062.html'
# The with-block closes the HTTP response as soon as the body has been read.
with urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the span tags, search them for numbers and print the total
def count_numbers(text):
    """Sum the integers embedded in ``text``.

    Every unbroken run of ASCII digits (0-9) counts as one integer; the
    result is 0 when there are none.
    """
    return sum(int(match) for match in re.findall('[0-9]+', text))

# Total the numbers shown inside every <span> element of the parsed page.
tags = soup('span')
count = 0
for tag in tags:
    # get_text() yields only the text inside the element. tag.decode() would
    # serialise the whole tag, so digits in attribute values would be summed too.
    count += count_numbers(tag.get_text())
print(count)

2441


In [85]:
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re

# Ignore SSL certificate errors.
# NOTE(review): this disables certificate and hostname verification for every
# request made with ``ctx`` -- fine for a course exercise, unsafe elsewhere.
# Statement order matters: check_hostname is switched off before verify_mode
# is set to CERT_NONE, because the ssl module rejects CERT_NONE while hostname
# checking is still enabled.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def follow_url(url, position=2):
    """Download ``url`` and return the link held by one anchor tag on the page.

    Args:
        url: Address of the HTML page to fetch.
        position: Zero-based index of the anchor tag to follow. Defaults to
            2, i.e. the third link on the page.

    Returns:
        The ``href`` attribute of the selected anchor, or ``None`` when that
        anchor has no ``href``.

    Raises:
        IndexError: If the page has no anchor at ``position``.
    """
    # Uses the module-level ``ctx``. The with-block closes the response once
    # the body has been read.
    with urllib.request.urlopen(url, context=ctx) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Retrieve all of the anchor tags
    tags = soup('a')
    return tags[position].get('href', None)

url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
# Hop from page to page four times, each time moving to the link follow_url picks.
for hop in range(4):
    url = follow_url(url)

# The name is the part of the final URL between "by_" and ".html". The dot is
# escaped so that it matches only a literal "." rather than any character.
print(re.findall(r'by_(.+)\.html', url)[0])


Anayah


In [86]:
# To run this, download the BeautifulSoup zip file
# http://www.py4e.com/code3/bs4.zip
# and unzip it in the same directory as this file

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import re

# Ignore SSL certificate errors.
# NOTE(review): this disables certificate and hostname verification for every
# request made with ``ctx`` -- fine for a course exercise, unsafe elsewhere.
# Statement order matters: check_hostname is switched off before verify_mode
# is set to CERT_NONE, because the ssl module rejects CERT_NONE while hostname
# checking is still enabled.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

def follow_url(url, position=17):
    """Download ``url`` and return the link held by one anchor tag on the page.

    Args:
        url: Address of the HTML page to fetch.
        position: Zero-based index of the anchor tag to follow. Defaults to
            17, i.e. the eighteenth link on the page.

    Returns:
        The ``href`` attribute of the selected anchor, or ``None`` when that
        anchor has no ``href``.

    Raises:
        IndexError: If the page has no anchor at ``position``.
    """
    # Uses the module-level ``ctx``. The with-block closes the response once
    # the body has been read.
    with urllib.request.urlopen(url, context=ctx) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Retrieve all of the anchor tags
    tags = soup('a')
    return tags[position].get('href', None)

url = 'http://py4e-data.dr-chuck.net/known_by_Charleigh.html'
# Hop from page to page seven times, each time moving to the link follow_url picks.
for hop in range(7):
    url = follow_url(url)

# The name is the part of the final URL between "by_" and ".html". The dot is
# escaped so that it matches only a literal "." rather than any character.
print(re.findall(r'by_(.+)\.html', url)[0])

Maaz


### 4.- Use urllib to request XML and parse it

In [87]:
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl



# Ignore SSL certificate errors.
# NOTE: this switches off certificate and hostname verification, which is only
# acceptable for a throwaway exercise. check_hostname has to be cleared before
# verify_mode is set to CERT_NONE.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# A single request is all that is needed, so there is no loop around it.
url =  'http://py4e-data.dr-chuck.net/comments_42.xml'
print('Retrieving', url)
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url, context=ctx) as uh:
    data = uh.read()
# data is still bytes here, so this length is a byte count.
print('Retrieved', len(data), 'characters')
tree = ET.fromstring(data)

# Collect each <comment> under <comments>; every one is expected to have a
# <count> child whose text parses as an integer.
lst = tree.findall('comments/comment')
print("Count: ",len(lst))

print(sum(int(item.find('count').text) for item in lst))
    
    


Retrieving http://py4e-data.dr-chuck.net/comments_42.xml
Retrieved 4189 characters
Count:  50
2553


In [88]:
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl



# Ignore SSL certificate errors.
# NOTE: this switches off certificate and hostname verification, which is only
# acceptable for a throwaway exercise. check_hostname has to be cleared before
# verify_mode is set to CERT_NONE.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# A single request is all that is needed, so there is no loop around it.
url =  'http://py4e-data.dr-chuck.net/comments_383064.xml'
print('Retrieving', url)
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url, context=ctx) as uh:
    data = uh.read()
# data is still bytes here, so this length is a byte count.
print('Retrieved', len(data), 'characters')
tree = ET.fromstring(data)

# Collect each <comment> under <comments>; every one is expected to have a
# <count> child whose text parses as an integer.
lst = tree.findall('comments/comment')
print("Count: ",len(lst))

print("Sum: {}".format( sum(int(item.find('count').text) for item in lst)) )
    
    

Retrieving http://py4e-data.dr-chuck.net/comments_383064.xml
Retrieved 4242 characters
Count:  50
Sum: 2392


### 5.- Retrieving JSON with urllib and extracting data

In [92]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl

# Ignore SSL certificate errors.
# NOTE: this switches off certificate and hostname verification, which is only
# acceptable for a throwaway exercise. check_hostname has to be cleared before
# verify_mode is set to CERT_NONE.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# A single request is all that is needed, so there is no loop around it.
url =  'http://py4e-data.dr-chuck.net/comments_42.json'
print('Retrieving', url)
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url, context=ctx) as uh:
    data = uh.read().decode()

info = json.loads(data)

# Add up the 'count' field of every entry in the 'comments' list.
count = sum(comment['count'] for comment in info['comments'])
print(count)
    
    

Retrieving http://py4e-data.dr-chuck.net/comments_42.json
2553


In [90]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl

# Ignore SSL certificate errors.
# NOTE: this switches off certificate and hostname verification, which is only
# acceptable for a throwaway exercise. check_hostname has to be cleared before
# verify_mode is set to CERT_NONE.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# A single request is all that is needed, so there is no loop around it.
url =  'http://py4e-data.dr-chuck.net/comments_383065.json'
print('Retrieving', url)
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url, context=ctx) as uh:
    data = uh.read().decode()

info = json.loads(data)

# Add up the 'count' field of every entry in the 'comments' list.
count = sum(comment['count'] for comment in info['comments'])
print(count)

Retrieving http://py4e-data.dr-chuck.net/comments_383065.json
2369


### 6.- Retrieving data from an API

In [93]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl

# Leave api_key as False to use the py4e-data.dr-chuck.net endpoint, or
# replace it with a Google Places API key:
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro
api_key = False

if api_key is not False:
    # A key was supplied: query Google's geocoding endpoint directly.
    serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'
else:
    # No key: use the py4e endpoint together with the stand-in key 42.
    serviceurl = 'http://py4e-data.dr-chuck.net/json?'
    api_key = 42

# Ignore SSL certificate errors.
# NOTE(review): this disables certificate and hostname verification for every
# request made with ``ctx`` -- fine for a course exercise, unsafe elsewhere.
# Statement order matters: check_hostname is switched off before verify_mode
# is set to CERT_NONE, because the ssl module rejects CERT_NONE while hostname
# checking is still enabled.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Geocode one address and print its coordinates and place ids.
while True:
    address = 'University of Florida'#input('Enter location: ')
    if len(address) < 1: break

    # Build the query string; the key is included whenever one is configured.
    parms = dict()
    parms['address'] = address
    if api_key is not False: parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)

    print('Retrieving', url)
    # The with-block closes the HTTP response once the body has been read.
    with urllib.request.urlopen(url, context=ctx) as uh:
        data = uh.read().decode()
    print('Retrieved', len(data), 'characters')

    # Only a malformed body is treated as "no data"; other errors propagate.
    try:
        js = json.loads(data)
    except json.JSONDecodeError:
        js = None

    # Reject anything that is not an OK reply carrying at least one result.
    if not isinstance(js, dict) or js.get('status') != 'OK' or not js.get('results'):
        print('==== Failure To Retrieve ====')
        print(data)
        # The address is fixed, so asking again would repeat the same failing
        # request forever. Change this back to `continue` if input() is restored.
        break

    # The first result supplies the coordinates and the formatted address.
    lat = js['results'][0]['geometry']['location']['lat']
    lng = js['results'][0]['geometry']['location']['lng']
    print('lat', lat, 'lng', lng)
    location = js['results'][0]['formatted_address']
    for result in js['results']:
        print('->>>', result['place_id'])
    break

Retrieving http://py4e-data.dr-chuck.net/json?address=University+of+Florida&key=42
Retrieved 1820 characters
lat 29.6436325 lng -82.3549302
->>> ChIJdZLk-wyj6IgRhpg9FN7gbyA
