# ArtsyCollector

## Objective 
Collect artwork data from Artsy.net.

### 1. Scrape links to all artworks currently on sale

In [None]:
# import libraries
import requests
import pprint
import lxml
from selenium import webdriver
from time import sleep

# specify the base URL (first page of the search results)
BASE_URL = "https://www.artsy.net/collect?page=1&acquireable=true&offerable=false&at_auction=false"

# launch the Selenium Chrome driver once and reuse it for every page;
# starting a new browser per page leaves the earlier ones running
driver = webdriver.Chrome('chrome_driver/chromedriver')

# specify the links data object
links = []

try:
    driver.get(BASE_URL)

    # find the total number of pages in this search
    page_nav = driver.find_elements_by_xpath('//div[@class="Box-sc-15se88d-0 iMqpar"]')
    if not page_nav:
        raise RuntimeError("pagination element not found; cannot determine the number of result pages")
    page_total = int(page_nav[-1].find_element_by_tag_name("a").text)

    # loop over each page (page numbers in the URL start at 1)
    for page_counter in range(1, page_total + 1):

        # open the page with Selenium
        URL = "https://www.artsy.net/collect?page=" + str(page_counter) + "&acquireable=true&offerable=false&at_auction=false"
        driver.get(URL)
        sleep(5)

        # find all the artworks on the page
        artworksOnPage = driver.find_elements_by_xpath('//div[@data-test="artworkGridItem"]')

        for entry in artworksOnPage:
            # find all the links to those artworks on the page
            links.append(entry.find_element_by_tag_name("a").get_attribute("href"))

        # pause between pages, as the original loop did
        sleep(5)
finally:
    # always shut the browser down, even if a lookup above failed
    driver.quit()


# print the total number of links we have
print(len(links))


In [None]:
# back up the collected links to a plain-text file, one link per line
with open('links_list.txt', 'w') as backup_file:
    backup_file.writelines('%s\n' % link for link in links)

### 2. Scrape information from each of those artwork links

In [None]:
# read the list of links back from the backup file (one link per line)
links_list = []
with open('links_list.txt', 'r') as filehandle:
    for line in filehandle:
        # strip only the line terminator: slicing off the last character
        # would truncate a final link that has no trailing newline
        currentLink = line.rstrip("\n")
        # skip blank lines so no empty URL is handed to the browser later
        if currentLink:
            links_list.append(currentLink)

In [None]:
# import libraries
import requests
import pprint
import lxml
from selenium import webdriver
from time import sleep

# column-oriented store: one list per field; every list gets exactly one
# value per scraped page so the columns stay aligned
data = {"page_URL": [], "artist": [], "artist_nationality": [], "artist_birthdate": [], "title": [], "image_URL": [], "year": [], "gallery": [], "gallery_location": [], "medium": [], "medium_details": [], "size_inches": [], "size_cm": [], "condition": [], "classification": [], "signed": [], "authenticated": [], "framed": [], "currency": [], "price": []}

# for diagnostics ONLY
links_dummy = ["https://www.artsy.net/artwork/karl-hartman-uncommon-crow"]


def _caption_value(caption_lines, label):
    """Return the line that follows `label` in `caption_lines`, or "NA".

    "NA" is returned when the label is absent or is the last line.
    """
    if label in caption_lines:
        position = caption_lines.index(label)
        if position + 1 < len(caption_lines):
            return caption_lines[position + 1]
    return "NA"


# a single browser is reused for every page; launching one per link leaves
# the earlier browsers running
driver = webdriver.Chrome('chrome_driver/chromedriver')

try:
    # loop over all entries in the links list
    for position, entry in enumerate(links_list, start=1):
        URL = entry
        driver.get(URL)
        sleep(2)

        # collect the whole page into one record first, so a failed lookup
        # cannot leave some columns longer than others
        record = {"page_URL": URL}

        # extract information from image area
        image_URL = driver.find_element_by_xpath('//img[@data-type="artwork-image"]').get_attribute("src")
        record["image_URL"] = image_URL

        # extract information from sidebar area (fixed line positions)
        sidebar = driver.find_element_by_xpath('//div[@data-test="artworkSidebar"]').text.split("\n")
        record["artist"] = sidebar[0]

        # split on the LAST ", " so titles that contain commas stay intact
        title, separator, year = sidebar[2].rpartition(", ")
        if separator:
            record["title"] = title
            record["year"] = year
        else:
            record["title"] = sidebar[2]
            record["year"] = "NA"

        record["medium_details"] = sidebar[3]
        record["size_inches"] = sidebar[4]
        record["size_cm"] = sidebar[5]
        record["classification"] = sidebar[6]
        # first character is taken as the currency symbol, the rest as the amount
        record["currency"] = sidebar[7][0:1]
        record["price"] = sidebar[7][1:]
        record["gallery"] = sidebar[12]
        record["gallery_location"] = sidebar[13]

        # extract information from caption area: each value is the line
        # right after its label
        caption = driver.find_element_by_xpath('//div[@data-test="artworkDetails"]').text.split("\n")
        record["medium"] = _caption_value(caption, "Medium")
        record["condition"] = _caption_value(caption, "Condition")
        record["signed"] = _caption_value(caption, "Signature")
        record["authenticated"] = _caption_value(caption, "Certificate of authenticity")
        record["framed"] = _caption_value(caption, "Frame")

        # extract information from artist bio area
        bio = driver.find_element_by_xpath('//div[@data-test="artistInfo"]').text.split("\n")

        # look for ", b. " inside the bio line itself; testing `"b." in bio`
        # only matches a line that is exactly "b."
        nationality, separator, born = bio[1].partition(", b. ")
        record["artist_nationality"] = nationality.translate({ord('•'): None})
        if separator:
            record["artist_birthdate"] = born[0:4].translate({ord('•'): None})
        else:
            record["artist_birthdate"] = "NA"

        # commit the finished record to every column
        for column, values in data.items():
            values.append(record[column])

        # report progress, then pause before the next page
        print(f"{position}/{len(links_list)}: {URL}")
        sleep(2)
finally:
    # quit driver even when a page could not be parsed
    driver.quit()

# print the number of records collected (len(data) would count the columns)
print(len(data["page_URL"]))
# print(data)

In [None]:
# number of scraped records; len(data) alone would count the dictionary's
# keys (the columns), which does not change as pages are scraped
print(len(data["page_URL"]))

### 3. Attempt a quicker solution with Beautiful Soup

In [47]:
import requests
import urllib3
from bs4 import BeautifulSoup

URL = "https://www.artsy.net/artwork/karl-hartman-uncommon-crow"
http = urllib3.PoolManager()

response = http.request('GET', URL)
soup = BeautifulSoup(response.data, "lxml")

# `soup.img` is the first <img> tag in the parsed document
image_URL = soup.img['src']
print(image_URL)

# read the sidebar from the parsed HTML rather than from the Selenium driver,
# which this cell does not create
sidebar_node = soup.find("div", attrs={"data-test": "artworkSidebar"})

if sidebar_node is None:
    print("artworkSidebar not found in the HTML returned for " + URL)
else:
    # NOTE(review): the fixed line positions below were written against
    # Selenium's rendered text; confirm they match get_text() output
    sidebar = sidebar_node.get_text("\n", strip=True).split("\n")

    # start from "NA" for every column so all lists in `data` grow together
    record = dict.fromkeys(data, "NA")
    record.update({
        "page_URL": URL,
        "image_URL": image_URL,
        "artist": sidebar[0],
        "title": sidebar[2].split(", ")[0],
        "year": sidebar[2].split(", ")[1],
        "medium_details": sidebar[3],
        "size_inches": sidebar[4],
        "size_cm": sidebar[5],
        "classification": sidebar[6],
        "currency": sidebar[7][0:1],
        "price": sidebar[7][1:],
        "gallery": sidebar[12],
        "gallery_location": sidebar[13],
    })

    for column, values in data.items():
        values.append(record[column])


https://d32dm0rphc51dk.cloudfront.net/Av-Aaqu_JwskxttJT4z_Vg/large.jpg


In [None]:
import scrapy

class ImageSpider(scrapy.Spider):
    """Spider that requests artwork pages and reports the first image on each."""

    # assigned, not annotated: `name: 'images'` creates no class attribute,
    # so the spider would have no name
    name = 'images'

    def start_requests(self):
        """Yield one request per artwork URL, each handled by `parse`."""
        urls = [
            'https://www.artsy.net/artwork/karl-hartman-uncommon-crow'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield the page URL together with the `src` of its first <img> tag."""
        # NOTE(review): the original body stopped at `page = response.` (a
        # syntax error); extracting the first image is an assumed completion
        # based on the spider's name -- confirm the intended output
        yield {
            "page_URL": response.url,
            "image_URL": response.css("img::attr(src)").get(),
        }

In [None]:
## API solution

import os

import requests

# read the API token from the environment so the credential is not stored
# in the notebook itself
token = os.environ.get("ARTSY_API_TOKEN")
if not token:
    raise RuntimeError("Set the ARTSY_API_TOKEN environment variable to an Artsy API token")

headers = {'X-Auth-Token': token}

# a timeout keeps the cell from hanging indefinitely on a stalled connection
r = requests.get("https://api.artsy.net/api/sales", headers=headers, timeout=30)
r.json()