# ArtsyCollector

## Objective 
To collect artwork data from Artsy.net

### 1. Scrape links for all the artworks on sale currently

In [29]:
# import libraries
import requests
import pprint
import lxml
from selenium import webdriver
from time import sleep

# specify the base URL
BASE_URL = "https://www.artsy.net/collect?page=1&acquireable=true&offerable=false&at_auction=false"

# launch the Selenium Chrome driver
driver = webdriver.Chrome('chrome_driver/chromedriver')
driver.get(BASE_URL)

# find the total number of pages in this search
page_nav = driver.find_elements_by_xpath('//div[@class="Box-sc-15se88d-0 iMqpar"]')
page_total = int(page_nav[-1].find_element_by_tag_name("a").text)

# specify the links data object
links = []

# start the Chrome webdriver
driver = webdriver.Chrome('chrome_driver/chromedriver')

# loop over each page
for i in range(page_total):

    # open the page with Selenium
    page_counter = i + 1
    URL = "https://www.artsy.net/collect?page=" + str(page_counter) + "&acquireable=true&offerable=false&at_auction=false"
    driver.get(URL)
    sleep(2)

    # find all the artworks on the page
    artworksOnPage = driver.find_elements_by_xpath('//div[@data-test="artworkGridItem"]')

    for entry in artworksOnPage:
        # find all the links to those artworks on the page
        links.append(entry.find_element_by_tag_name("a").get_attribute("href"))

# print the total number of links we have
print(len(links))

# write the links file to a separate .txt file for backup
with open('links_list.txt', 'w') as filehandle:
    for listitem in links:
        filehandle.write('%s\n' % listitem)

2970


### 2. Scrape information from each of those artwork links with Chrome Selenium

In [30]:
# import libraries
import requests
import pprint
import lxml
import pandas as pd
from selenium import webdriver
from time import sleep


# define the data structure

data = {"page_URL": [], "artist": [], "artist_nationality": [], "artist_birthdate": [], "title": [], "image_URL": [], "year": [], "gallery": [], "gallery_location": [], "medium": [], "medium_details": [], "size_inches": [], "size_cm": [], "condition": [], "classification": [], "signed": [], "authenticated": [], "framed": [], "currency": [], "price": []}


# set up the Selenium driver

driver = webdriver.Chrome('chrome_driver/chromedriver')


# read the list of links

## actual
links_list = []
with open('links_list.txt', 'r') as filehandle:
    for line in filehandle:
        currentLink = line[:-1]
        links_list.append(currentLink)

## for diagnostics ONLY
links_dummy = ["https://www.artsy.net/artwork/karl-hartman-uncommon-crow", "https://www.artsy.net/artwork/hollis-dunlap-inner-dialogues", "https://www.artsy.net/artwork/jaena-kwon-red-petals"]



# loop over all entries in the links list

for index, entry in enumerate(links_list):
    print(index+1, entry)
    URL = entry
    data["page_URL"].append(URL)
    driver.get(URL)
    sleep(1)

    # extract information from image area

    image_URL = driver.find_element_by_xpath('//img[@data-type="artwork-image"]').get_attribute("src")
    data["image_URL"].append(image_URL)

    # extract information from sidebar area

    sidebar = driver.find_element_by_xpath('//div[@data-test="artworkSidebar"]').text.split("\n")

    artist = sidebar[0]
    title = sidebar[2].split(", ")[0]
    year = sidebar[2].split(", ")[1]
    medium_details = sidebar[3]
    size_inches = sidebar[4]
    size_cm = sidebar[5]
    classification = sidebar[6]
    currency = sidebar[7][0:1]
    price = sidebar[7][1:]
    gallery = sidebar[12]
    gallery_location = sidebar[13]

    data["artist"].append(sidebar[0])
    data["title"].append(sidebar[2].split(", ")[0])
    data["year"].append(sidebar[2].split(", ")[1])
    data["medium_details"].append(sidebar[3])
    data["size_inches"].append(sidebar[4])
    data["size_cm"].append(sidebar[5])
    data["classification"].append(sidebar[6])
    data["currency"].append(sidebar[7][0:1])
    data["price"].append(sidebar[7][1:])
    data["gallery"].append(sidebar[12])
    data["gallery_location"].append(sidebar[13])

    # extract information from caption area

    caption = driver.find_element_by_xpath('//div[@data-test="artworkDetails"]').text.split("\n")

    medium = ""
    if "Medium" in caption:
        index = caption.index("Medium")
        medium = caption[index+1]
    else:
        medium = "NA"
    data["medium"].append(medium)

    condition = ""
    if "Condition" in caption:
        index = caption.index("Condition")
        condition = caption[index+1]
    else:
        condition = "NA"
    data["condition"].append(condition)

    signed = ""
    if "Signature" in caption:
        index = caption.index("Signature")
        signed = caption[index+1]
    else:
        signed = "NA"
    data["signed"].append(signed)

    authenticated = ""
    if "Certificate of authenticity" in caption:
        index = caption.index("Certificate of authenticity")
        authenticated = caption[index+1]
    else:
        authenticated = "NA"
    data["authenticated"].append(authenticated)

    framed = ""
    if "Frame" in caption:
        index = caption.index("Frame")
        framed = caption[index+1]
    else:
        framed = "NA"
    data["framed"].append(framed)

    # extract information from artist bio area

    bio = driver.find_element_by_xpath('//div[@data-test="artistInfo"]').text.split("\n")
    
    artist_nationality = ""
    artist_birthdate = ""

    # if there is a birthdate
    if "b." in bio[1]:
        # if there is only a birthdate
        if bio[1][0] == "b":
            artist_nationality = "NA"
            artist_birthdate = bio[1].split("b. ")[1][0:4].translate({ord('•'): None})
        # if there is both a birthdate and a nationality
        else:
            artist_nationality = bio[1].split(", b. ")[0].translate({ord('•'): None})
            artist_birthdate = bio[1].split(", b. ")[1][0:4].translate({ord('•'): None})
    
    # if there is no birthdate
    else:
        # if there is only a nationality
        if len(bio[1]) > 0:
            artist_nationality = bio[1].translate({ord('•'): None})
            artist_birthdate = "NA"
        # if there is no birthdate or nationality
        else:
            artist_nationality = "NA"
            artist_birthdate = "NA"
    
    data["artist_nationality"].append(artist_nationality)
    data["artist_birthdate"].append(artist_birthdate)
    
    # print updated data
    print(title, artist, price, currency, artist_nationality, artist_birthdate, framed, authenticated, signed)


# quit driver and print data
driver.quit()
print(len(data["title"]))

1 https://www.artsy.net/artwork/yigal-ozeri-untitled-pricilla-3
Untitled "Pricilla" Yigal Ozeri 55,000 $ Israeli 1958 Included Included Hand-signed by artist
2 https://www.artsy.net/artwork/josh-reames-columbo-ii
Columbo Josh Reames 11,000 $ American 1985 Not included Included Hand-signed by artist, sticker label
3 https://www.artsy.net/artwork/ilana-manolson-fostering-the-next
Fostering the Next Ilana Manolson 16,000 $ Canadian 1956 Not included NA NA
4 https://www.artsy.net/artwork/samuel-john-lamorna-birch-the-shady-pool
The Shady Pool Samuel John Lamorna Birch 22,500 £ Follow NA Included NA Hand-signed by artist, Signed and dated '1955' lower left
5 https://www.artsy.net/artwork/ilana-manolson-navigating-the-narrative
Navigating the Narrative Ilana Manolson 20,000 $ Canadian 1956 Not included NA NA
6 https://www.artsy.net/artwork/ilana-manolson-charting-the-fleeting-1
Charting the Fleeting Ilana Manolson 18,000 $ Canadian 1956 Not included NA NA
7 https://www.artsy.net/artwork/seen

KeyboardInterrupt: 

In [None]:
from pprint import pprint
pprint(data)

### 3A. Attempt a quicker solution with Beautiful Soup

In [None]:
import requests
import urllib3
from bs4 import BeautifulSoup

URL = "https://www.artsy.net/artwork/karl-hartman-uncommon-crow"
http = urllib3.PoolManager()

response = http.request('GET', URL)
soup = BeautifulSoup(response.data, "lxml")

image_URL = soup.img['src']
print(image_URL)

sidebar = driver.find_element_by_xpath('//div[@data-test="artworkSidebar"]').text.split("\n")
    data["image_URL"].append(image_URL)
    data["artist"].append(sidebar[0])
    data["title"].append(sidebar[2].split(", ")[0])
    data["year"].append(sidebar[2].split(", ")[1])
    data["medium_details"].append(sidebar[3])
    data["size_inches"].append(sidebar[4])
    data["size_cm"].append(sidebar[5])
    data["classification"].append(sidebar[6])
    data["currency"].append(sidebar[7][0:1])
    data["price"].append(sidebar[7][1:])
    data["gallery"].append(sidebar[12])
    data["gallery_location"].append(sidebar[13])


### 3B. Attempt a quicker solution with Scrapy

In [None]:
import scrapy

class ImageSpider(scrapy.Spider):

    name: 'images'

    def start_requests(self):
        urls = [
            'https://www.artsy.net/artwork/karl-hartman-uncommon-crow'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
    
    def parse(self, response):
        page = response.

### 3C. Attempt a quicker solution with the Artsy API

In [None]:
## API solution

import requests

headers = {'X-Auth-Token': 'eyJhbGciOiJIUzI1NiJ9.eyJyb2xlcyI6IiIsInN1YmplY3RfYXBwbGljYXRpb24iOiI2MDE0NDk3MjNlZjg1MjAwMTE3YTBiNGUiLCJleHAiOjE2MTI1NDcwNTksImlhdCI6MTYxMTk0MjI1OSwiYXVkIjoiNjAxNDQ5NzIzZWY4NTIwMDExN2EwYjRlIiwiaXNzIjoiR3Jhdml0eSIsImp0aSI6IjYwMTQ0OTczODNkMmEyMDAxMTFjZDBiNiJ9.Jb7HeMLSEsdEO69PxqM80B4pEqGqxPzFg2cYUE9TfyI'}

r = requests.get("https://api.artsy.net/api/sales", headers=headers)
r.json()

### 4. Clean the data in Pandas

In [None]:
import pandas as pd

