In [1]:
import loginkey

from selenium import webdriver
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys

from urllib.parse import unquote

import re

import time as time

In [2]:
class element_not_have_css_class(object):
    """Wait condition that succeeds once an element does NOT carry a CSS class.

    Designed to be passed to ``WebDriverWait.until``: the instance is called
    with the driver and returns a truthy value as soon as the condition holds.

    locator   - ``(by, value)`` tuple used to find the element
    css_class - text that must be absent from the element's ``class`` attribute
                (substring test, so ``"loading"`` also matches ``"is-loading"``)

    Returns the WebElement once the class is absent, otherwise ``False``.
    """
    def __init__(self, locator, css_class):
        self.locator = locator
        self.css_class = css_class

    def __call__(self, driver):
        element = driver.find_element(*self.locator)   # Finding the referenced element
        # get_attribute() yields None when the element has no class attribute at
        # all; treat that as "no classes" instead of raising TypeError on `in`.
        class_attribute = element.get_attribute("class") or ""
        if self.css_class not in class_attribute:
            return element
        return False

In [3]:
def initialize_driver():
    """Start a headless Chrome session and log in to artsy.net.

    Credentials are read from the local ``loginkey`` module (``loginkey.user``
    and ``loginkey.password``) so that no secret is stored in the notebook.

    Returns:
        The ``webdriver.Chrome`` instance after the login form was submitted.
        The caller owns the driver and is responsible for shutting it down.

    Raises:
        AttributeError: if ``loginkey`` does not define ``user``/``password``.
        Exception: any Selenium error raised while loading the page or filling
            in the form; the browser is shut down before the error propagates.
    """

    from selenium.webdriver.chrome.options import Options

    # Read the credentials first so a misconfigured loginkey fails before a
    # browser process is started.
    user = loginkey.user
    password = loginkey.password

    chrome_options = Options()
    chrome_options.add_argument("--headless")

    print('initializing driver...')

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get('https://www.artsy.net/')

        print('loaded artsy.net, waiting to log in...')

        wait = WebDriverWait(driver, 10)
        time.sleep(2)

        # The header link is clicked through JavaScript rather than
        # WebElement.click().
        button = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/div[1]/div/header/div/nav/div[1]/a[2]')
        driver.execute_script("arguments[0].click();", button)

        time.sleep(2)

        print('logging in...')

        # NOTE(review): 'hgOWHr' and 'jGXstL' look like generated CSS class
        # names and will presumably change when the site is redeployed.
        wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'hgOWHr')))

        driver.find_element(By.CLASS_NAME, 'hgOWHr').click()
        driver.find_element(By.XPATH, "//input[@name='email']").send_keys(user)
        driver.find_element(By.CLASS_NAME, 'jGXstL').click()
        driver.find_element(By.XPATH, "//input[@name='password']").send_keys(password)
        driver.find_element(By.XPATH, "//button[@type='submit']").click()

        # The form was submitted; the outcome of the login is not verified here.
        print('log in sucessful...')
    except BaseException:
        # Do not leave an orphaned headless Chrome behind when login fails.
        driver.quit()
        raise

    return driver

In [4]:
def _extract_image_url(src):
    """Return the image URL embedded in a thumbnail ``src`` attribute.

    NOTE(review): assumes the thumbnail URL carries the original image as a
    URL-encoded ``src`` query parameter - confirm against live markup. When no
    such parameter exists the ``src`` value is returned unchanged.
    """
    from urllib.parse import parse_qs, urlsplit

    embedded = parse_qs(urlsplit(src).query).get('src')
    return embedded[0] if embedded else src


def _parse_result_row(index, title_divs, image_divs, info_divs, price_divs):
    """Build the field dict for the ``index``-th auction result on a page.

    Text fields that cannot be found are ``float("nan")``; ``price`` is ``None``
    when it is missing or is not a plain ``US$`` integer amount.
    """
    missing = float("nan")

    # Two title elements are matched per result, hence the stride of 2.
    try:
        title = title_divs[index * 2].find_all(class_='ldlHGe')[0].text
    except IndexError:
        title = missing

    image_url = missing
    if index < len(image_divs):
        image = image_divs[index].find('img')
        src = image.get('src') if image is not None else None
        if src:
            image_url = _extract_image_url(src)

    # Look the info cells up once instead of once per field.
    fields = info_divs[index].find_all(class_='buodgj') if index < len(info_divs) else []

    def field(position, content_index):
        try:
            return fields[position].contents[content_index]
        except IndexError:
            return missing

    price = None
    if index < len(price_divs):
        price_node = price_divs[index].find(class_='eHKyyH')
        if price_node is None:
            price_node = price_divs[index].find(class_='ldlHGe')
        if price_node is not None:
            try:
                price = int(price_node.text.replace('US$', '').replace(',', ''))
            except ValueError:
                price = None

    return {
        'title': title,
        'image_url': image_url,
        'medium': field(1, 0),
        'dimensions': field(1, -1),
        'estimate': field(3, 0),
        'realized_price': field(7, 0),
        'price': price,
    }


def _sort_by_price_and_date(driver):
    """Pick 'PRICE_AND_DATE_DESC' in the first <select> that offers it."""
    for select_element in driver.find_elements(By.XPATH, '//select'):
        try:
            Select(select_element).select_by_value('PRICE_AND_DATE_DESC')
            return True
        except Exception:
            continue
    return False


def auction_scrape(driver, list_of_artists, max_pages=100, results_per_page=9):
    """Scrape auction results from artsy.net for every artist slug given.

    Args:
        driver: Selenium WebDriver, e.g. the one from ``initialize_driver()``.
            It is shut down before this function returns or raises.
        list_of_artists: sized iterable of artist slugs used in the URL
            ``https://www.artsy.net/artist/<slug>/auction-results``.
        max_pages: maximum number of result pages visited per artist.
        results_per_page: number of rows recorded for every visited page; rows
            that are not present on the page are recorded with missing values.

    Returns:
        list of dicts with the keys ``artist_slug``, ``title``, ``image_url``,
        ``medium``, ``dimensions``, ``estimate``, ``realized_price``, ``price``.

    An artist whose page cannot be processed is reported and skipped.
    ``KeyboardInterrupt`` is not swallowed, so the scrape can be interrupted.
    """
    wait = WebDriverWait(driver, 10)
    auction_results = []
    total_artists = len(list_of_artists)

    try:
        for artist_number, artist in enumerate(list_of_artists, start=1):
            try:
                url = f'https://www.artsy.net/artist/{artist}/auction-results?metric=in'
                driver.get(url)
                time.sleep(2)

                wait.until(EC.element_to_be_clickable((By.XPATH, '//select')))
                _sort_by_price_and_date(driver)

                for _ in range(max_pages):
                    driver.switch_to.window(driver.window_handles[0])

                    # Name the parser explicitly so parsing does not depend on
                    # which optional parser libraries happen to be installed.
                    soup = BeautifulSoup(driver.page_source, 'html.parser')

                    loading_container = soup.find(class_='LoadingArea__Container-sc-1qecphp-2')
                    if loading_container is None:
                        # No results container on this page: done with the artist.
                        break

                    title_divs = loading_container.find_all(class_='jvPMUs')
                    image_divs = loading_container.find_all(class_='fvrqBk')
                    info_divs = loading_container.find_all(class_='icNtnl')
                    price_divs = loading_container.find_all(class_='iZqizk')

                    for row in range(results_per_page):
                        result = {'artist_slug': artist}
                        result.update(
                            _parse_result_row(row, title_divs, image_divs, info_divs, price_divs)
                        )
                        auction_results.append(result)
                        print(f'{len(auction_results)} total objects scraped, {artist_number} / {total_artists} artists searched', end='\r')

                    try:
                        driver.find_element(By.LINK_TEXT, 'Next').click()
                        wait.until(element_not_have_css_class((By.CLASS_NAME, 'beISLe'), "loading"))
                    except Exception:
                        # No 'Next' link, or the next page never finished loading.
                        break
            except Exception as error:
                # Best effort: report the failure and carry on with the next artist.
                print(f'\nskipping {artist}: {error!r}')
    finally:
        # Always end the browser session, even when the scrape is interrupted.
        driver.quit()

    return auction_results

In [5]:
# pandas is first needed here, for loading the artist list.
import pandas as pd

# Read the existing Artsy dataset; its artist_slug column drives the scrape below.
df = pd.read_csv('artsy_data_full.csv')

In [6]:
# Distinct artist slugs; each one becomes an auction-results URL in auction_scrape().
artists = df.artist_slug.unique()

In [7]:
# Number of distinct artist slugs that will be scraped.
len(artists)

4750

In [None]:
# Log in, pause for 10 seconds, then scrape every artist slug.
# auction_scrape() closes the driver itself, so `driver` must not be reused afterwards.
driver = initialize_driver()
print('waiting to scrape...')
time.sleep(10)
print('scraping...')
results = auction_scrape(driver, artists)
print('scraping complete.')

initializing driver...
loaded artsy.net, waiting to log in...
logging in...
log in sucessful...
waiting to scrape...
scraping...
28871 total objects scraped, 94 / 4750 artists searched

In [None]:
# pandas was already imported in an earlier cell; this import is redundant on a top-to-bottom run.
import pandas as pd

# One row per scraped auction result (list of dicts -> DataFrame).
results_df = pd.DataFrame(results)

In [None]:
# Column dtypes and non-null counts of the scraped results.
results_df.info()

In [None]:
# Summary statistics, transposed so that each column is shown as a row.
results_df.describe().T

In [None]:
# Same summary, restricted to rows whose medium is exactly "Painting".
results_df.query('medium == "Painting"').describe().T

In [None]:
# Per-artist summary statistics for rows whose medium is exactly "Painting".
results_df.query('medium == "Painting"').groupby('artist_slug').describe()

In [None]:
# Missing values per column among titled "Painting" rows.
results_df.query('medium == "Painting"').dropna(subset=['title']).isna().sum()

In [None]:
# Results for a single artist slug, ordered by parsed price (ascending).
results_df.query('artist_slug == "leonardo-da-vinci"').sort_values('price')

In [None]:
# Rows that have a title; auction_scrape() stores NaN as the title when none was found.
results_df.dropna(subset=['title'])

In [None]:
# numpy is only used in this cell, for the log transform.
import numpy as np

# Histogram of the natural log of the parsed price.
# NOTE(review): a price of 0 would map to -inf here - confirm no such rows exist.
np.log(results_df.price).hist()

In [None]:
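# TODO: keep login credentials out of this notebook (load them from the untracked loginkey module) and rotate any password that was ever committed here.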