In [1]:
# Note: you may have to install requests first, e.g. `pip3 install requests`.

import requests
# `display` and `HTML` live in the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule so the notebook container uses 95% of the browser width,
# which makes wide outputs easier to read.
display(HTML("<style>.container { width:95% !important; }</style>"))

## Simple API Call with Requests Library

It is worth looking at the reference documentation for the [requests library](https://requests.readthedocs.io/en/latest/user/quickstart/).

First, let's have a look at the [GitHub API](https://developer.github.com/v3/).

In [2]:
# Ask the GitHub API for one public user profile.
# timeout=10 makes the call give up instead of waiting forever on a silent server.
github_user_url = 'https://api.github.com/users/nmattei'
r = requests.get(url=github_user_url, timeout=10)
# The HTTP status code of the response is the cell's output.
r.status_code

200

In [3]:
# Look up the Content-Type header of the response.
r.headers['content-type']

'application/json; charset=utf-8'

In [4]:
# The URL this response was fetched from.
r.url

'https://api.github.com/users/nmattei'

In [5]:
# The raw response body as bytes (note the b'...' prefix in the output).
r.content

b'{"login":"nmattei","id":1206578,"node_id":"MDQ6VXNlcjEyMDY1Nzg=","avatar_url":"https://avatars.githubusercontent.com/u/1206578?v=4","gravatar_id":"","url":"https://api.github.com/users/nmattei","html_url":"https://github.com/nmattei","followers_url":"https://api.github.com/users/nmattei/followers","following_url":"https://api.github.com/users/nmattei/following{/other_user}","gists_url":"https://api.github.com/users/nmattei/gists{/gist_id}","starred_url":"https://api.github.com/users/nmattei/starred{/owner}{/repo}","subscriptions_url":"https://api.github.com/users/nmattei/subscriptions","organizations_url":"https://api.github.com/users/nmattei/orgs","repos_url":"https://api.github.com/users/nmattei/repos","events_url":"https://api.github.com/users/nmattei/events{/privacy}","received_events_url":"https://api.github.com/users/nmattei/received_events","type":"User","site_admin":false,"name":"Nicholas Mattei","company":"Tulane University","blog":"http://www.nickmattei.net","location":null

In [6]:
# Parse the JSON body into Python objects (a dict for this endpoint).
r.json()

{'login': 'nmattei',
 'id': 1206578,
 'node_id': 'MDQ6VXNlcjEyMDY1Nzg=',
 'avatar_url': 'https://avatars.githubusercontent.com/u/1206578?v=4',
 'gravatar_id': '',
 'url': 'https://api.github.com/users/nmattei',
 'html_url': 'https://github.com/nmattei',
 'followers_url': 'https://api.github.com/users/nmattei/followers',
 'following_url': 'https://api.github.com/users/nmattei/following{/other_user}',
 'gists_url': 'https://api.github.com/users/nmattei/gists{/gist_id}',
 'starred_url': 'https://api.github.com/users/nmattei/starred{/owner}{/repo}',
 'subscriptions_url': 'https://api.github.com/users/nmattei/subscriptions',
 'organizations_url': 'https://api.github.com/users/nmattei/orgs',
 'repos_url': 'https://api.github.com/users/nmattei/repos',
 'events_url': 'https://api.github.com/users/nmattei/events{/privacy}',
 'received_events_url': 'https://api.github.com/users/nmattei/received_events',
 'type': 'User',
 'site_admin': False,
 'name': 'Nicholas Mattei',
 'company': 'Tulane Univer

## Looking at HTTP Requests

We'll try to get some data from Google.  Note that this is kind of against the TOS and we **should not do it this way in general -- Google has very [specific rules on their site](https://developers.google.com/custom-search/v1/).**

In [7]:
# Hand the query to requests as a dict and let it build the query string.
params = dict(q='Tulane University')
search_url = 'http://www.google.com/search'
r = requests.get(search_url, params=params, timeout=10)
# The HTTP status code of the response is the cell's output.
r.status_code

200

In [8]:
# The final URL: requests appended the params, encoding the space as '+'.
r.url

'http://www.google.com/search?q=Tulane+University'

In [9]:
# This time the server answered with HTML rather than JSON.
r.headers['content-type']

'text/html; charset=ISO-8859-1'

In [10]:
# The response body as a str (compare with r.content, which gives bytes).
r.text

'<!doctype html><html lang="en"><head><meta charset="UTF-8"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Tulane University - Google Search</title><script nonce="pAHkKtwpFpazOGYlOWm/Zw==">(function(){\ndocument.documentElement.addEventListener("submit",function(b){var a;if(a=b.target){var c=a.getAttribute("data-submitfalse");a="1"===c||"q"===c&&!a.elements.q.value?!0:!1}else a=!1;a&&(b.preventDefault(),b.stopPropagation())},!0);document.documentElement.addEventListener("click",function(b){var a;a:{for(a=b.target;a&&a!==document.documentElement;a=a.parentElement)if("A"===a.tagName){a="1"===a.getAttribute("data-nohref");break a}a=!1}a&&b.preventDefault()},!0);}).call(this);(function(){\nvar a=window.performance;window.start=Date.now();a:{var b=window;if(a){var c=a.timing;if(c){var d=c.navigationStart,f=c.responseStart;if(f>d&&f<=window.start){window.start=f;b.wsrt=f-d;break a}}a.now&&(b.wsrt=Math.floor(a.now()))}}window.google=window

## More Complicated with Parameters

We'll look for some information from the [Apple iTunes Search API](https://affiliate.itunes.apple.com/resources/documentation/itunes-store-web-service-search-api/).

In [26]:
# Search the iTunes API for a term, passing it as a query parameter.
# NOTE(review): requests URL-encodes parameter values itself, so the literal '+'
# is sent as %2B (visible in the r.url output below) rather than as a space.
# "the meters" may be what was intended -- confirm before changing.
params = {'term' : "the+meters"}
r = requests.get('https://itunes.apple.com/search', params=params, timeout=10)
r.status_code

200

In [27]:
# Inspect the URL requests built; the '+' in the term shows up as %2B.
r.url

'https://itunes.apple.com/search?term=the%2Bmeters'

In [28]:
# Parse the JSON body: a dict holding a result count and a list of results.
r.json()

{'resultCount': 50,
 'results': [{'wrapperType': 'track',
   'kind': 'song',
   'artistId': 7314214,
   'collectionId': 59401239,
   'trackId': 59401193,
   'artistName': 'The Meters',
   'collectionName': 'The Meters',
   'trackName': 'Cissy Strut',
   'collectionCensoredName': 'The Meters',
   'trackCensoredName': 'Cissy Strut',
   'artistViewUrl': 'https://music.apple.com/us/artist/the-meters/7314214?uo=4',
   'collectionViewUrl': 'https://music.apple.com/us/album/cissy-strut/59401239?i=59401193&uo=4',
   'trackViewUrl': 'https://music.apple.com/us/album/cissy-strut/59401239?i=59401193&uo=4',
   'previewUrl': 'https://audio-ssl.itunes.apple.com/itunes-assets/AudioPreview125/v4/2e/9b/85/2e9b8549-cea1-f64d-48b6-05770ec0a692/mzaf_5938744141518605358.plus.aac.p.m4a',
   'artworkUrl30': 'https://is1-ssl.mzstatic.com/image/thumb/Music124/v4/9f/be/8a/9fbe8a82-b711-2594-c77d-c09c36f31598/source/30x30bb.jpg',
   'artworkUrl60': 'https://is1-ssl.mzstatic.com/image/thumb/Music124/v4/9f/be/8a/9

In [29]:
# Same response as above, so the URL is unchanged.
r.url

'https://itunes.apple.com/search?term=the%2Bmeters'

We can pass several parameters in the payload, as described in the [requests quickstart](https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls).

In [30]:
# Same search, with a second parameter restricting the results to albums.
# NOTE(review): as in the previous search, the literal '+' is sent as %2B
# rather than as a space -- confirm whether "the meters" was intended.
params = {'term' : "the+meters", 'entity' : 'album'}
r = requests.get('https://itunes.apple.com/search', params=params, timeout=10)
r.status_code


200

In [31]:
# Both parameters appear in the query string, joined with '&'.
r.url

'https://itunes.apple.com/search?term=the%2Bmeters&entity=album'

In [32]:
# Keep the parsed JSON in a variable so we can poke at it.
x = r.json()

In [33]:
# Each entry in the 'results' list is a plain dict.
type(x['results'][0])

dict

## Converting the returned JSON to an object!

In [34]:
import json

In [35]:
# Parse the raw response bytes with the standard-library json module
# (the manual counterpart of calling r.json()).
data = json.loads(r.content)

In [36]:
# The top-level object is a dict; list its keys.
data.keys()

dict_keys(['resultCount', 'results'])

In [37]:
# 'results' holds a list...
type(data['results'])

list

In [38]:
# ...and each item in that list is a dict.
type(data['results'][1])

dict

In [39]:
# Look at one result record in full.
data['results'][1]

{'wrapperType': 'collection',
 'collectionType': 'Album',
 'artistId': 7314214,
 'collectionId': 213532006,
 'amgArtistId': 4907,
 'artistName': 'The Meters',
 'collectionName': 'The Very Best of The Meters',
 'collectionCensoredName': 'The Very Best of The Meters',
 'artistViewUrl': 'https://music.apple.com/us/artist/the-meters/7314214?uo=4',
 'collectionViewUrl': 'https://music.apple.com/us/album/the-very-best-of-the-meters/213532006?uo=4',
 'artworkUrl60': 'https://is1-ssl.mzstatic.com/image/thumb/Music/v4/ec/27/68/ec2768aa-ca30-a3d1-b3a5-f485d6e0ec35/source/60x60bb.jpg',
 'artworkUrl100': 'https://is1-ssl.mzstatic.com/image/thumb/Music/v4/ec/27/68/ec2768aa-ca30-a3d1-b3a5-f485d6e0ec35/source/100x100bb.jpg',
 'collectionPrice': 9.99,
 'collectionExplicitness': 'notExplicit',
 'trackCount': 16,
 'copyright': '℗ 2005 Warner Strategic Marketing',
 'country': 'USA',
 'currency': 'USD',
 'releaseDate': '2005-03-29T08:00:00Z',
 'primaryGenreName': 'R&B/Soul'}

In [40]:
# The field names available on that record.
data['results'][1].keys()

dict_keys(['wrapperType', 'collectionType', 'artistId', 'collectionId', 'amgArtistId', 'artistName', 'collectionName', 'collectionCensoredName', 'artistViewUrl', 'collectionViewUrl', 'artworkUrl60', 'artworkUrl100', 'collectionPrice', 'collectionExplicitness', 'trackCount', 'copyright', 'country', 'currency', 'releaseDate', 'primaryGenreName'])

## Using Beautiful Soup to Parse a Webpage.

The [beautifulsoup4 documentation](https://www.crummy.com/software/BeautifulSoup/).

In [41]:
# Grab the course webpage.
import requests
from bs4 import BeautifulSoup

# A timeout keeps the cell from hanging forever if the site is unreachable
# (the other requests in this notebook use one too).
r = requests.get('https://nmattei.github.io/cmps3160/schedule/', timeout=10)

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
root = BeautifulSoup(r.content, 'html.parser')

In [42]:
# The raw HTML of the page, as bytes.
r.content

b'<!DOCTYPE html>\n<html lang="en">\n  <!-- Beautiful Jekyll | MIT license | Copyright Dean Attali 2016 -->\n  <head>\n  <meta charset="utf-8" />\n  <meta http-equiv="X-UA-Compatible" content="IE=edge">\n  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, viewport-fit=cover">\n\n  <title>Fall 2021 Schedule</title>\n\n  <meta name="author" content="Nicholas Mattei" />\n\n  \n\n  <link rel="alternate" type="application/rss+xml" title="CMPS 3160 Intro. to Data Science - Intro to Data Science - Fall 2020" href="https://nmattei.github.io/cmps3160/feed.xml" />\n\n  \n\n  \n\n  \n\n\n  \n    \n      \n  <link rel="stylesheet" href="//maxcdn.bootstrapcdn.com/font-awesome/4.6.0/css/font-awesome.min.css" />\n\n\n    \n  \n\n  \n    \n      <link rel="stylesheet" href="/cmps3160/css/bootstrap.min.css" />\n    \n      <link rel="stylesheet" href="/cmps3160/css/bootstrap-social.css" />\n    \n      <link rel="stylesheet" href="/cmps3160/css/main.css" />\n    \

In [43]:
# Look for the first <table> element. find() returns None when nothing matches,
# which is why this cell shows no output.
root.find("table")

In [44]:
# Chaining directly on find() is fragile: find() returned None above, so this
# call raises the AttributeError shown below. Check for None before using the result.
root.find("table").findAll("a")

AttributeError: 'NoneType' object has no attribute 'findAll'

## Trying out some Regular Expressions.

In [45]:
import re
# Find the index in the raw HTML where we first mention CMPS 3160.

# Note: the r'...' prefix makes the pattern a raw string, so backslashes in a
# regex reach the re module unchanged instead of being read as Python escapes.
# (It has nothing to do with the response variable `r` below.)

# A timeout keeps the cell from hanging if the site does not respond.
r = requests.get('https://nmattei.github.io/cmps3160/syllabus/', timeout=10)
match = re.search(r'CMPS 3160', r.text)
# re.search returns None when nothing matches, so guard before calling .start().
print(match.start() if match else "No match found")

460


In [46]:
# Show the text around the first match (index 460) to see it in context.
r.text[390:500]

'ei" />\n\n  \n\n  <link rel="alternate" type="application/rss+xml" title="CMPS 3160 Intro. to Data Science - Intro'

In [None]:
# Does the start match?
# re.match only tries the pattern at the very beginning of the string (unlike
# re.search). The first occurrence found above is at index 460, so this is
# expected to print None.
match = re.match(r'CMPS 3160', r.text)
print(match)

In [None]:
# Iterate over all occurrences and print a few characters of context around each.
for m in re.finditer(r'CMPS 3160', r.text):
    # Clamp the left edge at 0: a negative start index would count from the end
    # of the string and give the wrong (usually empty) slice for early matches.
    print(r.text[max(0, m.start() - 50):m.start() + 50])


In [None]:
# Find them all.
# findall returns a list of the matched strings (no positions), one per occurrence.
match = re.findall(r'CMPS 3160', r.text)
print(match)

In [None]:
# Capturing groups: every parenthesised part of the pattern is recorded separately.
text = ''' The university of kentucky is the best
            basketball team and an ok university. and University of North CC
            The University Of Kentucky can be put in 
            some weird capitalization and University of Ken spelled wrong'''

# Three groups: "university", "of", then a word of at least three characters.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(\w{3,})'

# search() stops at the first match; groups() gives its captures as a tuple.
m = re.compile(regex).search(text)
captured = m.groups()
print(captured)

In [None]:
# Find all
# With several groups in the pattern, findall returns one tuple of captures per match.
print(re.findall(regex, text))

In [None]:
# Named groups: (?P<name>...) lets us fetch a capture by name instead of position.
text = ''' The university of kentucky is the best University of Lousiana
            basketball team and an ok university.
            The University Of Kentucky can be put in 
            some weird capitalization'''

# Only the school name is named; the first two groups stay positional.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(?P<school>\w{3,})'

# groupdict() holds just the named captures of the first match.
m = re.compile(regex).search(text)
named = m.groupdict()
print(named)


In [None]:
# Walk over every match and print its named groups.
text = ''' The university of kentucky is the best
            basketball team and an ok university.
            The University Of Kentucky can be put in 
            some weird capitalization.  And Kentucky is much better than
            the University of Mississippi.'''

# Same pattern as in the named-group example: only the school name is named.
regex = r'\s*([Uu]niversity)\s([Oo]f)\s(?P<school>\w{3,})'

# finditer yields one match object per occurrence, in order.
for m in re.compile(regex).finditer(text):
    named = m.groupdict()
    print(named)


In [None]:
# Plain string replacement (no regex needed): every 'a' becomes 'X'.
'abcabcabc'.replace('a', 'X')

In [None]:
# re.sub swaps every match of the pattern for the replacement text.
text = 'I love Introduction to Data Science'
re.sub(pattern=r'Data Science', repl=r'Schmada Schmience', string=text)

In [None]:
# \1 and \2 in the replacement refer to the first and second captured groups,
# so the two words are swapped and "hmience" is appended to the first one.
re.sub(r'(\w+)\s([Ss]cience)', r'\2 \1hmience', text) 


## Downloading All the ... PDFs from the course website.

Using beautiful soup and some regular expressions.

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
import os
import pathlib

In [None]:
# HTTP GET request sent to the URL url
# We're going to use last year's website as it's got direct links...
# NOTE(review): this is the same URL fetched earlier in the notebook -- confirm
# that it really is the page with direct links.
# A timeout keeps the cell from hanging if the site does not respond.
r = requests.get("https://nmattei.github.io/cmps3160/schedule/", timeout=10)

# Use BeautifulSoup to parse the GET response. The parser is named explicitly so
# the result does not depend on which optional parsers are installed.
root = BeautifulSoup(r.content, "html.parser")

# find() returns None when the page has no <table>; fall back to an empty list
# instead of raising AttributeError. find_all is the current name for findAll.
table = root.find("table")
lnks = table.find_all("a") if table is not None else []
lnks

Let's do the easier one first and download all the `.ipynb` from the webpage.  We'll get into why this is easier in a second...

In [None]:
# Cycle through the href for each anchor, checking
# to see if it's an ipynb link or not
notebooks = []
for lnk in lnks:
    # Tag.get returns None for anchors that have no href attribute
    # (lnk['href'] would raise KeyError), so those are skipped.
    href = lnk.get('href')
    if not href:
        continue
    # If it's a notebook link, queue a download. The dot is included so only
    # a real ".ipynb" extension matches.
    if href.lower().endswith('.ipynb'):
        notebooks.append(href)
        print("{} is a Link to {}".format(lnk.contents, href))
print(notebooks)

In [None]:
# Download all the files into a "downloaded" folder under wherever you're
# running the notebook from.

# Be careful with href: links on a page may be relative, so resolve each one
# against the page URL before requesting it (absolute links pass through unchanged).
base_url = "https://nmattei.github.io/cmps3160/schedule/"

# The output directory is the same for every file, so create it once up front.
outputdir = os.path.join(os.getcwd(), "downloaded")
os.makedirs(outputdir, exist_ok=True)

for href in notebooks:
    url = urljoin(base_url, href)
    print("Downloading... {}".format(url))
    # The whole body is read via rd.content below, so no streaming is needed;
    # the timeout keeps one dead link from stalling the loop.
    rd = requests.get(url, timeout=30)
    if not rd.ok:
        # Don't save an error page under a notebook's file name.
        print("Skipping {} (HTTP {})".format(url, rd.status_code))
        continue

    # Note because the href is a path we have to just get the filename!
    outfile = os.path.join(outputdir, href.split("/")[-1])
    print("Writing: ",outfile)
    with open(outfile, 'wb') as f:
        f.write(rd.content)


Now let's try something more complicated and grab all the PDFs...

The first thing to note is that the PDF links have "pdf" in their link text but not in their target URL, and the files are hosted on Google -- so the approach above doesn't really work :-(


In [None]:
# We can go check, we get a google drive directory...

r = requests.get("https://drive.google.com/drive/u/1/folders/1uGrhWzhXbiqoChTK0fQXg340X319REks", timeout=10)

# Use BeautifulSoup to parse the GET response (parser named explicitly so the
# result does not depend on which optional parsers are installed).
root = BeautifulSoup(r.content, "html.parser")

# `lnks` is deliberately not recomputed here: the next cell still needs the
# anchors collected from the schedule page.
# `root.content` would search for a <content> tag (and give None), so show the
# beginning of the parsed page instead of dumping the whole document.
print(root.prettify()[:1000])



In [None]:
# We have all google links so we need to check the link text to see if it mentions PDF!
pdfs = []
for lnk in lnks:
    # Tag.get returns None for anchors that have no href attribute.
    href = lnk.get('href')
    # get_text() gathers all the text inside the anchor, so this also works when
    # the anchor is empty or its first child is a nested tag -- cases where
    # lnk.contents[0].lower() would fail.
    if href and 'pdf' in lnk.get_text().lower():
        print("{} is a PDF Link to {}".format(lnk.contents, href))
        pdfs.append(href)
print(pdfs)

In [None]:
# Note that Google doesn't make this easy... sorry, you have to do a little kung fu...
# Format is: https://drive.google.com/u/1/uc?id=ID&export=download
download_links = []
for c in pdfs:
    # Presumably the share links look like .../d/<ID>/view or ...?id=<ID>
    # (TODO confirm against the real links); try those shapes first.
    found = re.search(r'/d/([\w-]+)', c) or re.search(r'[?&]id=([\w-]+)', c)
    parts = c.split("/")
    if found:
        fid = found.group(1)
    elif len(parts) >= 2:
        # Original behaviour: take the second-to-last path segment.
        fid = parts[-2]
    else:
        # Nothing that looks like an ID; skip rather than raise IndexError.
        print("Skipping unrecognized link: {}".format(c))
        continue
    download_links.append("https://drive.google.com/u/1/uc?id={}&export=download".format(fid))
print(download_links)
