# ArtsyCollector

## Objective 
Collect artwork data from Artsy.net.

### 1. Scrape links for all artworks currently on sale

In [None]:
# import libraries
import requests
import pprint
import lxml
from selenium import webdriver
from time import sleep

# specify the base URL
BASE_URL = "https://www.artsy.net/collect?page=1&acquireable=true&offerable=false&at_auction=false"

# launch the Selenium Chrome driver once and reuse it for every page
# (launching a second browser for the loop left the first one running)
driver = webdriver.Chrome('chrome_driver/chromedriver')

# specify the links data object
links = []

try:
    driver.get(BASE_URL)

    # find the total number of pages in this search
    # NOTE(review): presumably the last pagination box holds the highest page
    # number; the class name looks auto-generated, so verify it still matches
    page_nav = driver.find_elements_by_xpath('//div[@class="Box-sc-15se88d-0 iMqpar"]')
    if not page_nav:
        raise RuntimeError("pagination element not found on " + BASE_URL)
    page_total = int(page_nav[-1].find_element_by_tag_name("a").text)

    # loop over each page (pages are numbered from 1)
    for page_counter in range(1, page_total + 1):

        # open the page with Selenium
        URL = "https://www.artsy.net/collect?page=" + str(page_counter) + "&acquireable=true&offerable=false&at_auction=false"
        driver.get(URL)
        sleep(2)

        # find all the artworks on the page
        artworksOnPage = driver.find_elements_by_xpath('//div[@data-test="artworkGridItem"]')

        # keep the link of the first anchor inside each artwork tile
        links.extend(
            entry.find_element_by_tag_name("a").get_attribute("href")
            for entry in artworksOnPage
        )
finally:
    # always close the browser, even when a page fails to load or parse
    driver.quit()

# print the total number of links we have
print(len(links))

# write the links file to a separate .txt file for backup
with open('links_list.txt', 'w') as filehandle:
    filehandle.writelines('%s\n' % listitem for listitem in links)

### 2. Scrape information from each of those artwork links with Selenium (Chrome)

In [68]:
# import libraries
import requests
import pprint
import lxml
import pandas as pd
from selenium import webdriver
from time import sleep


# define the data structure: one list per output column, every list grows by
# one entry per scraped artwork

data = {
    "page_url": [],
    "artist": [],
    "artist_nationality": [],
    "artist_birthdate": [],
    "title": [],
    "image_url": [],
    "year": [],
    "gallery": [],
    "gallery_location": [],
    "medium": [],
    "medium_details": [],
    "size_inches": [],
    "size_cm": [],
    "condition": [],
    "classification": [],
    "signed": [],
    "authenticated": [],
    "framed": [],
    "currency": [],
    "price": [],
}
currency_symbols = ["$", "€", "£"]
digits_str = list("0123456789")


# set up the Selenium driver in Chrome

driver = webdriver.Chrome('chrome_driver/chromedriver')
verbose = False

# read the list of links

## actual
links_list = []
with open('links_list.txt', 'r') as filehandle:
    for line in filehandle:
        # strip only the line terminator: slicing off the last character
        # would truncate a final URL that has no trailing newline
        currentLink = line.rstrip("\n")
        # skip blank lines so no empty URL is ever requested
        if currentLink:
            links_list.append(currentLink)

## for diagnostics ONLY
links_dummy = [
    "https://www.artsy.net/artwork/karl-hartman-dried-grass",
    "https://www.artsy.net/artwork/karl-hartman-uncommon-crow",
    "https://www.artsy.net/artwork/steven-alexander-nomad-3",
    "https://www.artsy.net/artwork/hollis-dunlap-inner-dialogues",
    "https://www.artsy.net/artwork/jaena-kwon-red-petals",
]



# loop over all entries in the links list

# XPath of each page region the scraper reads
IMAGE_XPATH = '//img[@data-type="artwork-image"]'
SIDEBAR_XPATH = '//div[@data-test="artworkSidebar"]'
CAPTION_XPATH = '//div[@data-test="artworkDetails"]'
BIO_XPATH = '//div[@data-test="artistInfo"]'

# data column -> label line that precedes its value in the caption area
CAPTION_LABELS = {
    "medium": "Medium",
    "condition": "Condition",
    "signed": "Signature",
    "authenticated": "Certificate of authenticity",
    "framed": "Frame",
}

# order in which the fields are echoed when `verbose` is on
VERBOSE_FIELDS = (
    "image_url",
    "artist", "title", "year", "medium_details", "size_inches", "size_cm",
    "classification", "currency", "price", "gallery", "gallery_location",
    "medium", "condition", "signed", "authenticated", "framed",
    "artist_nationality", "artist_birthdate",
)

# fields echoed on one line per artwork when `verbose` is off
SUMMARY_FIELDS = (
    "title", "artist", "price", "currency", "artist_nationality",
    "artist_birthdate", "framed", "authenticated", "signed",
)

# translation table that drops the bullet separator from bio text
BULLET_STRIPPER = {ord('•'): None}


def _is_error_page(driver):
    """Return True when the loaded page matches one of the known error pages."""
    ## modern error page - variations 1 and 2
    if driver.find_elements_by_class_name("error-handler-body"):
        return True
    if driver.find_elements_by_class_name("error-message"):
        return True
    ## old-style error page
    page_title = driver.title
    return "Error" in page_title or "error" in page_title


def _element_lines(driver, xpath):
    """Return the text lines of the first element matching xpath, or None.

    Best-effort: any WebDriver failure is treated like a missing element so
    one broken page cannot abort the whole run. `Exception` (rather than a
    bare except) still lets KeyboardInterrupt stop the scrape.
    """
    try:
        matches = driver.find_elements_by_xpath(xpath)
        if not matches:
            return None
        # read .text once; every access is a round trip to the browser
        return matches[0].text.split("\n")
    except Exception:
        return None


def _image_url(driver):
    """Return the src of the first artwork image, or "NA" when unavailable."""
    try:
        images = driver.find_elements_by_xpath(IMAGE_XPATH)
        return images[0].get_attribute("src") if images else "NA"
    except Exception:
        return "NA"


def _line_at(lines, position):
    """Return lines[position], or "NA" when there are not enough lines."""
    return lines[position] if position < len(lines) else "NA"


def _parse_sidebar(lines):
    """Map the sidebar's text lines onto their data columns by position.

    Every column defaults to "NA", so a short sidebar can neither raise nor
    leave a value behind from the previously scraped artwork.
    """
    fields = {
        "artist": _line_at(lines, 0),
        "title": "NA",
        "year": "NA",
        "medium_details": _line_at(lines, 3),
        "size_inches": _line_at(lines, 4),
        "size_cm": _line_at(lines, 5),
        "classification": _line_at(lines, 6),
        "currency": "NA",
        "price": "NA",
        "gallery": "NA",
        "gallery_location": "NA",
    }

    ## TITLE & YEAR share line 2 as "title, year"
    if len(lines) > 2:
        heading = lines[2]
        parts = heading.split(", ")
        if len(parts) > 1:
            fields["title"] = parts[0]
            fields["year"] = parts[1]
        # NOTE(review): a line without ", " counts as a year only when it
        # contains the digit 1, so e.g. "2005" is stored as a title - confirm
        elif "1" in heading:
            fields["year"] = heading
        else:
            fields["title"] = heading

    ## PRICE & CURRENCY: first character is the currency, the rest the amount
    if len(lines) > 7:
        price_line = lines[7]
        if any(digit in price_line for digit in digits_str):
            fields["currency"] = price_line[0]
            fields["price"] = price_line[1:]

        ## GALLERY & GALLERY LOCATION (altered position):
        ## lines 8/9 when the price line reads "Sold", lines 12/13 otherwise
        gallery_position = 8 if price_line == "Sold" else 12
        fields["gallery"] = _line_at(lines, gallery_position)
        fields["gallery_location"] = _line_at(lines, gallery_position + 1)

    return fields


def _parse_caption(lines):
    """Return the caption columns; each value is the line after its label."""
    fields = {}
    for column, label in CAPTION_LABELS.items():
        value = "NA"
        if label in lines:
            value_position = lines.index(label) + 1
            # a label on the very last line has no value line after it
            if value_position < len(lines):
                value = lines[value_position]
        fields[column] = value
    return fields


def _parse_artist_bio(lines):
    """Return (nationality, birthdate) read from line 1 of the artist info.

    Handles "b. 1977", "American, b. 1977" and "American"; anything missing
    or unparseable comes back as "NA".
    """
    if len(lines) < 2:
        return "NA", "NA"
    summary = lines[1]

    # if there is no birthdate, the whole line (if any) is the nationality
    if "b." not in summary:
        if len(summary) > 0:
            return summary.translate(BULLET_STRIPPER), "NA"
        return "NA", "NA"

    if summary[0] == "b":
        # if there is only a birthdate
        nationality = "NA"
        born = summary.partition("b. ")[2]
    else:
        # if there is both a birthdate and a nationality
        nationality, _, born = summary.partition(", b. ")
        nationality = nationality.translate(BULLET_STRIPPER)

    # the birth year is the first four characters after "b. "
    birthdate = born[0:4].translate(BULLET_STRIPPER) or "NA"
    return nationality, birthdate


try:
    for position, url in enumerate(links_dummy, start=1):
        print(position, url)
        driver.get(url)
        sleep(1)

        if _is_error_page(driver):
            # append "error" for everything except the page URL
            record = dict.fromkeys(data, "error")
            record["page_url"] = url

        # if it's not an error page, proceed with data extraction
        else:
            record = dict.fromkeys(data, "NA")
            record["page_url"] = url

            # IMAGE AREA
            record["image_url"] = _image_url(driver)

            # SIDEBAR AREA
            sidebar_lines = _element_lines(driver, SIDEBAR_XPATH)
            if sidebar_lines is not None:
                record.update(_parse_sidebar(sidebar_lines))

            # CAPTION AREA
            caption_lines = _element_lines(driver, CAPTION_XPATH)
            if caption_lines is not None:
                record.update(_parse_caption(caption_lines))

            # ARTIST BIO AREA
            bio_lines = _element_lines(driver, BIO_XPATH)
            if bio_lines is not None:
                nationality, birthdate = _parse_artist_bio(bio_lines)
                record["artist_nationality"] = nationality
                record["artist_birthdate"] = birthdate

            if verbose:
                for field in VERBOSE_FIELDS:
                    print(record[field])

        # append all collected information to the data object; exactly one
        # value per column keeps every column the same length
        for column in data:
            data[column].append(record[column])

        # print updated data
        if not verbose:
            print(*(record[field] for field in SUMMARY_FIELDS))
finally:
    # quit driver even when a page raises, so no browser is left running
    driver.quit()

# print how many artworks were recorded
print(len(data["title"]))

1 https://www.artsy.net/artwork/karl-hartman-dried-grass
Dried Grass Karl Hartman NA NA American NA Not included Included Hand-signed by artist, sticker label, Signed lower right corner, "Karl Hartman"
2 https://www.artsy.net/artwork/karl-hartman-uncommon-crow
Uncommon Crow Karl Hartman 2,300 $ American NA Not included Included Sticker label
3 https://www.artsy.net/artwork/steven-alexander-nomad-3
error error error error error error error error error
4 https://www.artsy.net/artwork/hollis-dunlap-inner-dialogues
Inner Dialogues Hollis Dunlap 4,500 $ American 1977 Not included NA Hand-signed by artist
5 https://www.artsy.net/artwork/jaena-kwon-red-petals
Red Petals Jaena Kwon 5,200 $ NA 1986 Not included Included Signed on verso
5


In [69]:
# pretty-print the current `data` object for inspection
# (this import rebinds the name `pprint` from the module to the function)
from pprint import pprint
pprint(data)

{'artist': ['Karl Hartman',
            'Karl Hartman',
            'error',
            'Hollis Dunlap',
            'Jaena Kwon'],
 'artist_birthdate': ['NA', 'NA', 'error', '1977', '1986'],
 'artist_nationality': ['American', 'American', 'error', 'American', 'NA'],
 'authenticated': ['Included', 'Included', 'error', 'NA', 'Included'],
 'classification': ['This is a unique work.',
                    'This is a unique work.',
                    'error',
                    'This is a unique work.',
                    'This is a unique work.'],
 'condition': ['Excellent', 'Excellent', 'error', 'NA', 'Mint condition'],
 'currency': ['NA', '$', 'error', '$', '$'],
 'framed': ['Not included',
            'Not included',
            'error',
            'Not included',
            'Not included'],
 'gallery': ['Garvey | Simon',
             'Garvey | Simon',
             'error',
             'Gallery 1261',
             'Amy Simon Fine Art'],
 'gallery_location': ['New York', 'New York

In [None]:
import json
import pandas as pd


# load the data object back from its JSON backup file

with open('artworks_list.json') as backup_file:
    data_copy = json.load(backup_file)



# calculate the lengths for all columns in the data object

def calculateLengths(df, verbose=True):
    """Report the length of every column in a column-oriented data object.

    df: mapping from column name to a sized value (e.g. a list).
    verbose: when true, print each column name next to its length.

    Returns a dict with two parallel lists, "column_name" and "length",
    in the iteration order of df.
    """
    names = []
    sizes = []
    for name in df:
        size = len(df[name])
        names.append(name)
        sizes.append(size)
        if verbose:
            print(name, size)
    return {"column_name": names, "length": sizes}

column_dim = calculateLengths(data_copy, verbose=False)



# calculate the minimum column length
# (default=0 keeps an empty data object from raising ValueError)

min_length = min(column_dim["length"], default=0)


# bevel the columns to the minimum length so they all line up row by row

for key in data_copy:
    data_copy[key] = data_copy[key][:min_length]

# print the lengths again to confirm every column now matches
calculateLengths(data_copy)



# save as a JSON
# (the context manager closes the file so the output is fully flushed)
with open("artworks_list_evened.json", 'w') as evened_file:
    json.dump(data_copy, evened_file)



# save as a CSV from pandas
df = pd.DataFrame(data_copy)
df.to_csv('artworks_list_evened.csv')

In [None]:
# pretty-print the current `data` object for inspection
from pprint import pprint
pprint(data)

### 3A. Attempt a quicker solution with Beautiful Soup

In [None]:
import requests
import urllib3
from bs4 import BeautifulSoup

URL = "https://www.artsy.net/artwork/karl-hartman-uncommon-crow"
http = urllib3.PoolManager()

# fetch the raw HTML of the artwork page and parse it
response = http.request('GET', URL)
soup = BeautifulSoup(response.data, "lxml")

# src of the first <img> tag in the fetched HTML
image_URL = soup.img['src']
print(image_URL)

# NOTE(review): the sidebar is still read through the Selenium `driver`, which
# must be running and already showing URL for this to work - confirm, or port
# this lookup to `soup`
sidebar = driver.find_element_by_xpath('//div[@data-test="artworkSidebar"]').text.split("\n")

# append the fields to the data object; the column is named "image_url"
# (lowercase) in `data`, so "image_URL" would raise KeyError
data["image_url"].append(image_URL)
data["artist"].append(sidebar[0])
data["title"].append(sidebar[2].split(", ")[0])
data["year"].append(sidebar[2].split(", ")[1])
data["medium_details"].append(sidebar[3])
data["size_inches"].append(sidebar[4])
data["size_cm"].append(sidebar[5])
data["classification"].append(sidebar[6])
data["currency"].append(sidebar[7][0:1])
data["price"].append(sidebar[7][1:])
data["gallery"].append(sidebar[12])
data["gallery_location"].append(sidebar[13])


### 3B. Attempt a quicker solution with Scrapy

In [None]:
import scrapy

class ImageSpider(scrapy.Spider):
    """Spider that requests artwork pages and yields one image URL per page."""

    # must be an assignment: `name: 'images'` is only an annotation and
    # leaves the spider without a name
    name = 'images'

    def start_requests(self):
        """Yield one request per artwork URL, handled by parse()."""
        urls = [
            'https://www.artsy.net/artwork/karl-hartman-uncommon-crow'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield the page URL together with the src of its first <img>."""
        # NOTE(review): the original body stopped mid-statement
        # (`page = response.`); this minimal completion mirrors the
        # `soup.img['src']` lookup of the Beautiful Soup attempt - confirm
        # which fields the spider is meant to collect
        yield {
            'page_url': response.url,
            'image_url': response.css('img::attr(src)').get(),
        }

### 3C. Attempt a quicker solution with the Artsy API

In [None]:
# get authorization token

import os

import requests

# NOTE(review): these credentials are committed in the notebook - rotate them
# and supply ARTSY_CLIENT_ID / ARTSY_CLIENT_SECRET through the environment.
# The literals remain only as a fallback so the cell keeps working unchanged.
client_id = os.environ.get("ARTSY_CLIENT_ID", "e7f6428cf01e0bd5a170")
client_secret = os.environ.get("ARTSY_CLIENT_SECRET", "33fc3469753cc4bf54833b9c2dc21c85")
api_url = "https://api.artsy.net/api/tokens/xapp_token"

# named `credentials` rather than `data` so the scraped `data` object built in
# section 2 is not overwritten by this cell
credentials = {
    "client_id": client_id,
    "client_secret": client_secret,
}

auth_response = requests.post(api_url, data=credentials)
# fail with a clear HTTP error instead of a KeyError on the missing "token"
auth_response.raise_for_status()

# parse the response body once and read both fields from it
token_payload = auth_response.json()
auth_token = token_payload["token"]
auth_token_type = token_payload["type"]

In [None]:
import json

# candidate API endpoints; only DERIVED_URL is requested below
TEST_URL = "https://api.artsy.net/api/artists/hollis-dunlap"
DERIVED_URL = "https://api.artsy.net/api/artworks?artist_id=519812e68b3b81124c000193"
URL = "https://api.artsy.net/api/sales?live=true&total_count=10"

# send the token obtained above in the X-Xapp-Token header
request_headers = {'X-Xapp-Token': auth_token}
r = requests.get(DERIVED_URL, headers=request_headers)
sales_json = r.json()

# show the decoded response, then its number of top-level entries
pretty_json = json.dumps(sales_json, sort_keys=True, indent=4)
print(pretty_json)
len(sales_json)

In [None]:
# display the "_embedded" entry of the API response fetched above
sales_json["_embedded"]

### 4. Clean the data in Pandas

In [None]:
import pandas as pd

