## API

Today, we'll continue with the dictionaries lecture from last week.
One of the most popular forms of Python dictionaries you'll run across as you work is __JSON__ ([JavaScript Object Notation](https://www.json.org/json-en.html)). JSON is the primary format in which data is passed back and forth to APIs ([Application Programming Interface](https://medium.com/@perrysetgo/what-exactly-is-an-api-69f36968a41f)), and most API servers will send their responses in JSON format. Python has great JSON support, with the `json` package. We'll play around with a snippet of a call result from the [MET Museum API](https://metmuseum.github.io/).

In [None]:
# import requests and json - needed to make an API call
import requests
import json

In [None]:
# Met collection API endpoint that answers with a JSON listing of the departments
dept_api_url = (
    "https://collectionapi.metmuseum.org"
    "/public/collection/v1/departments"
)

In [None]:
# calling the API with requests
# the timeout keeps this cell from hanging forever if the server never answers
response = requests.get(dept_api_url, timeout=30)
# stop here with a clear HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
# department_data holds the parsed JSON result
department_data = response.json()

In [None]:
# print the Python type that response.json() produced for us
print(type(department_data))
# a bare expression on the last line of a cell makes the notebook display its value
department_data

In [None]:
# from the JSON above we only need the value stored under the 'departments' key:
# the list with all department "dictionaries" in it.
# we'll store this in a variable called departments
departments = department_data['departments']
# display the list so we can inspect the individual department entries
departments

In [None]:
# With the department information in hand we can ask the API about the objects
# of one particular department and see how many there are in total.
# The same endpoint also hands back a list of object ids; we use those later on
# to get images.
# Department id 1 is the first department, American Decorative Arts.
obj_by_dept_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects?departmentIds={}&q=cat".format(1)
response = requests.get(obj_by_dept_url)
obj_data = response.json()
print(obj_data)

In [None]:
# looping over the department list lets us run the same query for every department
for department in departments:
    # drop this department's id into the query string
    department_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects?departmentIds={}&q=cat".format(department['departmentId'])
    response = requests.get(department_url)
    data = response.json()
    print(data)

In [None]:
# we can create a list with one dictionary per department so that the name of the
# department and the total number of objects are grouped together
dept_obj_total = []

for department in departments:
    department_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects?departmentIds=" + str(department['departmentId']) + '&q=cat'
    # the timeout keeps the loop from hanging forever on one unanswered request
    response = requests.get(department_url, timeout=30)
    # stop with a clear HTTP error instead of failing later on a missing key
    response.raise_for_status()
    data = response.json()

    # build the entry only once both values are known, then append it:
    # a failed request can no longer leave a half-filled dictionary
    # (one without 'total_objects') behind in the list
    obj = {
        'title': department['displayName'],
        'total_objects': data['total'],
    }
    dept_obj_total.append(obj)

print(dept_obj_total)

### Let's visualize this data:

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# plt.bar takes sequences for the x and y values:
# a list of total # of objects per department for our y values
# and a list of all the department names for our x labels
total_num = [dept_obj['total_objects'] for dept_obj in dept_obj_total]
dept_names = [dept_obj['title'] for dept_obj in dept_obj_total]

# using numpy to calculate the color so it changes by bar height
y = np.array(total_num)
my_cmap = plt.get_cmap("Pastel2")


def rescale(y):
    """Scale the values of y linearly so the smallest is 0 and the largest is 1.

    Returns an array of zeros when every value is the same (the plain formula
    would divide by zero there) and an empty array for empty input.
    """
    y = np.asarray(y, dtype=float)
    if y.size == 0:
        return y
    lowest = np.min(y)
    span = np.max(y) - lowest
    if span == 0:
        return np.zeros_like(y)
    return (y - lowest) / span


# one bar position per department
positions = range(len(dept_obj_total))

# setting the figure size
plt.figure(figsize=(20,10))
plt.bar(positions, total_num, color=my_cmap(rescale(y)), align="center")
# department names as tick labels, rotated so long names do not overlap
plt.xticks(positions, dept_names, rotation=45, ha="right", fontsize=12)
plt.show()

### Inspecting images

In [None]:
# The images will come from {'departmentId': 11, 'displayName': 'European Paintings'}.
# Step one: call the objects endpoint for that department
# and keep the parsed JSON response in a variable.
ep_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects?departmentIds={}&q=cat".format(11)
response = requests.get(ep_url)
ep_data = response.json()

In [None]:
# inspect our JSON data: the bare name on the last line of the cell makes the
# notebook display the whole parsed response
ep_data

In [None]:
# We only look at the first 10 images.
# Slicing with [:10] keeps positions 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 of the id list,
# and we hold on to those 10 object ids in a variable.
ep_objects = ep_data['objectIDs'][:10]
ep_objects

In [None]:
# fetch the record of one single object to see what its JSON looks like
obj_url = (
    "https://collectionapi.metmuseum.org/public/collection/v1/objects/"
    "436199"
)
response = requests.get(obj_url)
obj_data = response.json()
obj_data

In [None]:
# now that we know that the images are in the primaryImage key, we are going to
# iterate through our list of 10 object ids and store the image urls in a new list
img_urls = []

for obj in ep_objects:
    # making an api call for each objectID
    obj_url = "https://collectionapi.metmuseum.org/public/collection/v1/objects/" + str(obj)
    # the timeout keeps the loop from hanging forever on one unanswered request
    response = requests.get(obj_url, timeout=30)
    obj_data = response.json()
    # .get() gives None instead of raising KeyError when a record has no
    # primaryImage key at all, so one odd record cannot stop the whole loop
    primaryimage_url = obj_data.get('primaryImage')
    # sometimes an object might not have a primaryImage URL. we don't want
    # empty strings (or None) so let's filter those out
    if primaryimage_url:
        img_urls.append(primaryimage_url)

img_urls

In [None]:
# we are importing the io module from scikit-image library
# https://scikit-image.org/
# scikit-image is a collection of algorithms for image processing.
# the io module gives us utilities to read and write images in various formats.
# we are also importing opencv. you'll get to learn more about this library this weekend
from skimage import io
import cv2

In [None]:
# display every image right here in the notebook
# enumerate(list) hands us the index number together with each item!
for index, url in enumerate(img_urls):
    # io.imread loads the image behind the url, imshow draws it on the current figure
    plt.imshow(io.imread(url))
    # plt.show renders the figure so each image gets its own output
    plt.show()

In [None]:
# display the images in the notebook again, this time with color correction
for index, url in enumerate(img_urls):
    # every image is loaded the same way
    image = io.imread(url)
    # the image at index 3 has a BGR color layer, so cv2's COLOR_BGR2RGB
    # conversion is used to change it back to RGB before drawing
    if index == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image)
    plt.show()

In [None]:
# PIL, short for Python Imaging Library, is a free and open-source additional library
# that adds support for opening, manipulating, and saving many different image file formats.
# from Python's built in io library, we are importing BytesIO
# read more here: https://docs.python.org/3/library/io.html
from PIL import Image
from io import BytesIO

In [None]:
# open each image in a preview window:
for url in img_urls:
    # download the raw bytes and wrap them in a file-like object PIL can read
    image_bytes = BytesIO(requests.get(url).content)
    Image.open(image_bytes).show()

In [None]:
# download our images into a folder named images
# os lets us create that folder from code
import os
# we can use urllib to download the image urls: for opening and reading URLs
import urllib.request
# we want to add a sleeper to not get blocked
import time
import random

# create the images folder if it is not there yet; without it urlretrieve
# has nowhere to write and fails on the first image
os.makedirs('images', exist_ok=True)

# go through every image url we collected
for url in img_urls:
    # Sets the file name as everything after the last / in the link
    file_name = url.split('/')[-1]
    print(file_name)
    # download our images
    urllib.request.urlretrieve(url, 'images/' + file_name)
    # adding random x second break in between each image so that the Met doesn't think we're hacking into their system
    time.sleep(random.randint(1, 5))

## Automation

In [None]:
# we need to import two libraries for our automation task:
# 1. os -> This module provides a portable way of using operating system dependent functionality.
# 2. time -> This module provides various time-related functions.
import os
import time

### creating 100 files

We are using Python to programmatically create 100 txt files in a folder called pages. This will need the os and time libraries, so make sure to run the cell above. Have the pages folder open so you can see the files being created in real time!

In [None]:
# our txt files are populated in a folder called pages, inside the directory
# you're working in
path = "./pages"
# create the folder if it is missing, so open() below always has a place to write
os.makedirs(path, exist_ok=True)

# this creates the pages that we want to work with.
for i in range(100): # 100 files get created
    page = i + 1 # i starts at 0 so we want to add 1 so our files are from 1 - 100 and not 0 - 99.
    # ./pages/page.txt
    file_path = path + "/" + str(page) + ".txt"
    # open() creates the file, write() creates the writing
    # the with block saves and closes the file for us, even if a write fails
    with open(file_path, "w+") as f:
        f.write("This is page %s." % page)
        # NOTE(review): "\r" is a carriage return, not "\n"; kept as-is so the
        # file contents stay exactly the same - confirm this is intended
        f.write("\r\rThis is where your amazing writing will go.\r\r")
        f.write("This is the end of page %s." % page)
    print(str(page) +'.txt has been created')
    # this pauses execution for a specified time so you can watch it happen
    time.sleep(0.25) # delay execution for .25 seconds

### creating the folders

Here we are creating folders to hold all of the 100 files. We are creating 11 folders in total, named in steps of ten (0, 10, 20, ... 100). We will then put the files in their respective folders by 10s.

In [None]:
path = "./pages/"
textfiles = os.listdir(path)

#uncomment the following line to see
#print(textfiles)

#This creates folders named 0 to 100 in increments of ten
for i in range(0,11):
    # i goes from 0 to 10 and we multiply it by ten to make a consistent jump
    # we then use that value to create the directory we want to have
    directory = path + str(i * 10) # e.g. ./pages/10
    # check if the path exists, and report what actually happened
    if os.path.exists(directory):
        # nothing to do: the folder is left over from an earlier run
        print(directory + " already exists!")
    else:
        # if not, then make the folder
        os.makedirs(directory)
        print(directory + " has been created!")
    # delay for .3 seconds so you can watch it happen
    time.sleep(.3)

### moving the files into the folders

Finally, we are going to move all of our txt files into their folders.

In [None]:
# this is a method to take a path and split it into name and extension
# this is a method to take a file name and split it into name and extension
def GetFilenameData(fullName):
    """Split a file name into its name and its extension.

    The split is made at the LAST dot, so a name with several dots keeps
    everything before the final dot as its name ('a.b.txt' -> 'a.b' + 'txt').

    Args:
        fullName: the file name as a string, e.g. '12.txt'.

    Returns:
        A dictionary with two keys:
        'name' - the text before the last dot (the whole string if there is no dot)
        'extension' - the text after the last dot, or None if there is no dot
    """
    # rpartition cuts the string at the last '.'; when there is no dot at all
    # the separator part comes back empty
    name, dot, extension = fullName.rpartition('.')
    if not dot:
        return {'name': fullName, 'extension': None}
    return {'name': name, 'extension': extension}


# get all the paths within the folder
path = "./pages/"
textfiles = os.listdir(path)


# check for all files within a certain range and move it into an appropriate folder
for file in textfiles:
    filenameData = GetFilenameData(file)
    # Make sure we aren't working with a folder (no extension) or with a
    # file whose name part is empty; == compares the text itself, which is
    # what we want here (is would compare object identity)
    if filenameData['extension'] is None or filenameData['name'] == '':
        continue
    # only files named after a page number belong in a folder;
    # anything else is left where it is instead of crashing the loop
    try:
        page_number = int(filenameData['name'])
    except ValueError:
        continue
    #this 0,11 range is the same range we used to create the folders
    for i in range(0,11):
        if page_number in range(i*10, (i+1)*10):
            # ./pages/filename.extension  ->  ./pages/<folder>/filename.extension
            os.rename(path+file, path + str(i*10) +"/" + file)
            print("%s.%s has moved to /%s" % (filenameData['name'], filenameData['extension'], i * 10))
            # a number fits exactly one folder, so we can stop looking
            break
    # short pause so you can watch the files move
    time.sleep(.3)
    
''' This is the basic functionality of the for loop above
for file in textfiles:
    filenameData = GetFilenameData(file)
    if filenameData['extension'] is not None:
        if int(filenameData['name']) in range(0,10):
            #path + file is the current file path
            #path + "0/" + file is the new path we created
            os.rename(path+file, path + "0/" + file)
'''