In [1]:
import api_keys
import requests
import time as time
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Read the Artsy API credentials from the local api_keys module so that
# they are never written into the notebook itself.
client_id, client_secret = api_keys.client_id, api_keys.client_secret

In [3]:
class ArtsyScraper:
    """Collect artwork metadata from the Artsy API (https://api.artsy.net/api).

    Creating an instance exchanges the client id/secret for an xapp token,
    which is then sent in the ``X-Xapp-Token`` header of every request.

    Attributes set by the methods below:
        api_url: Base URL of the API.
        token: The xapp token obtained in ``__init__``.
        results: List of per-artwork dicts filled by ``scrape_api``.
        artist_slugs: Unique artist slugs, filled by ``make_slugs``.
        auction_results: Filled by ``scrape_auctions`` (not implemented yet).
    """

    # Top-level artwork fields that are copied unchanged into each record.
    _KEYS_TO_KEEP = ('id', 'slug', 'title', 'category', 'medium', 'date',
                     'iconicity', 'sold', 'image_versions')

    # Seconds to wait for a single HTTP response; without a timeout a stalled
    # connection would hang the whole crawl indefinitely.
    request_timeout = 30

    def __init__(self, client_id, client_secret):
        """Request an xapp token for the given credentials.

        Raises:
            requests.HTTPError: if the token endpoint answers with an error
                status (e.g. wrong credentials).
            KeyError: if the response body carries no ``'token'`` field.
        """
        auth_token_uri = 'https://api.artsy.net/api/tokens/xapp_token'

        self.api_url = 'https://api.artsy.net/api'

        token = requests.post(
            auth_token_uri,
            params={'client_id': client_id, 'client_secret': client_secret},
            timeout=self.request_timeout,
        )
        # Fail with the HTTP status instead of an opaque KeyError on 'token'.
        token.raise_for_status()
        self.token = token.json()['token']

    def scrape_api(self, max_works=10000, time_split=.5):
        """Crawl the artworks endpoint and store the records in ``self.results``.

        Args:
            max_works: Maximum number of artworks to collect.
            time_split: Seconds to pause between requests.
        """
        self._api_scraper(max_works=max_works, time_split=time_split)

    def scrape_auctions(self):
        """Store the auction scrape in ``self.auction_results``.

        Raises:
            NotImplementedError: until ``_auction_scraper`` is implemented.
        """
        self.auction_results = self._auction_scraper()

    def make_slugs(self):
        """Store the unique artist slugs in ``self.artist_slugs``.

        Uses ``self.results_df`` when a caller has attached one; otherwise a
        DataFrame is built from ``self.results``. Missing artists (NaN) are
        kept as one of the unique values.

        Raises:
            AttributeError: if nothing has been scraped yet.
        """
        frame = getattr(self, 'results_df', None)
        if frame is None:
            records = getattr(self, 'results', None)
            if not records:
                raise AttributeError(
                    'no scraped results available; run scrape_api() first '
                    'or set results_df')
            frame = pd.DataFrame(records)
        self.artist_slugs = frame['artist_slug'].unique()

    def _auction_scraper(self):
        """Placeholder for the auction scraper that scrape_auctions() expects."""
        raise NotImplementedError('auction scraping is not implemented yet')

    def _get_json(self, url, params=None):
        """GET ``url`` with the xapp token header and return the decoded JSON."""
        response = requests.get(
            url,
            headers={'X-Xapp-Token': self.token},
            params=params,
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _dig(data, *path, default=float('nan')):
        """Follow ``path`` through nested containers; return ``default`` if any step is missing."""
        for key in path:
            try:
                data = data[key]
            except (KeyError, IndexError, TypeError):
                return default
        return data

    def _fetch_genes(self, artwork, time_split):
        """Return the names of all genes linked from ``artwork``, or NaN on any failure."""
        url = self._dig(artwork, '_links', 'genes', 'href', default=None)
        if url is None:
            return float('nan')
        genes = []
        try:
            while url is not None:
                page = self._get_json(url)  # parsed once per page
                genes.extend(gene['name'] for gene in page['_embedded']['genes'])
                # A page without a 'next' link is the last one.
                url = self._dig(page, '_links', 'next', 'href', default=None)
                if url is not None:
                    time.sleep(time_split)
        except Exception:
            # Best effort: Exception (not a bare except) so Ctrl-C still works.
            return float('nan')
        return genes

    def _fetch_artist(self, artwork):
        """Return name/slug/id of the first artist linked from ``artwork`` (NaN on failure)."""
        try:
            href = artwork['_links']['artists']['href']
            payload = self._get_json(href, params={'total_count': 1})
            artist = payload['_embedded']['artists'][0]
            return {
                'artist': artist['name'],
                'artist_slug': artist['slug'],
                'artist_id': artist['id'],
            }
        except Exception:
            # Best effort: a missing or unreachable artist yields NaN fields.
            missing = float('nan')
            return {'artist': missing, 'artist_slug': missing, 'artist_id': missing}

    def _build_record(self, artwork, time_split):
        """Flatten one artwork JSON object into a flat result dict."""
        record = {key: value for key, value in artwork.items()
                  if key in self._KEYS_TO_KEEP}
        for side in ('height', 'width', 'depth'):
            record[f'dimensions_{side}_in'] = self._dig(artwork, 'dimensions', 'in', side)
        record['permalink'] = self._dig(artwork, '_links', 'permalink', 'href')
        record['api_link'] = self._dig(artwork, '_links', 'self', 'href')
        record['thumb_link'] = self._dig(artwork, '_links', 'thumbnail', 'href')
        record['image_link'] = self._dig(artwork, '_links', 'image', 'href')
        record['genes'] = self._fetch_genes(artwork, time_split)
        record.update(self._fetch_artist(artwork))
        return record

    def _api_scraper(self, max_works=10000, time_split=.5, max_retries=3):
        """Walk the paginated ``/artworks/`` endpoint and collect records.

        Args:
            max_works: Stop once this many artworks have been collected.
            time_split: Seconds to sleep after each artwork, between gene
                pages, and before retrying a failed page.
            max_retries: How many times in a row a failing page is retried
                before the crawl stops.

        Returns:
            ``self.results``, a list with one dict per artwork.
        """
        self.results = []

        endpoint = self.api_url + '/artworks/'
        count = 0
        bad = 0
        failures_in_a_row = 0

        while endpoint is not None and count < max_works:
            try:
                page = self._get_json(endpoint)
                artworks = list(page['_embedded']['artworks'])
            except Exception:
                # Retry the same page a bounded number of times; a permanent
                # failure must not spin forever or silently skip a page.
                bad += 1
                failures_in_a_row += 1
                if failures_in_a_row > max_retries:
                    print(f'\nGiving up on {endpoint} after {failures_in_a_row} failed attempts')
                    break
                time.sleep(time_split)
                continue
            failures_in_a_row = 0

            for artwork in artworks:
                if count >= max_works:
                    break
                try:
                    record = self._build_record(artwork, time_split)
                except Exception:
                    # One malformed artwork should not discard the rest of the page.
                    bad += 1
                    continue
                self.results.append(record)
                count += 1
                print(f'{count} works collected. {bad} bad scrapes', end='\r')

                time.sleep(time_split)

            # No 'next' link means this was the last page.
            endpoint = self._dig(page, '_links', 'next', 'href', default=None)

        return self.results
        
    


In [4]:
# Creating the scraper performs a network request: the constructor posts the
# credentials to the Artsy token endpoint and stores the returned xapp token.
scraper = ArtsyScraper(client_id, client_secret)

In [5]:
# Long-running cell: crawls the artworks endpoint and accumulates one dict per
# artwork in scraper.results. The scraper sleeps 0.5 s after every artwork, so
# collecting 10,000 works takes well over an hour.
scraper.scrape_api()

10000 works collected. 0 bad scrapes

In [6]:
# Build the table in one step from the list of per-artwork dicts.
results_df = pd.DataFrame(scraper.results)

In [7]:
# Distinct artist slugs in the scrape; works whose artist lookup failed carry
# NaN in this column.
results_df.artist_slug.unique()

array(['gustav-klimt', 'francisco-de-goya', 'rembrandt-van-rijn', ...,
       'johsua-johnson', 'john-martin-1789-1854', 'gertrude-kasebier'],
      dtype=object)

In [8]:
# Display the scraped table for a visual sanity check.
results_df

Unnamed: 0,id,slug,title,category,medium,date,iconicity,sold,image_versions,dimensions_height_in,dimensions_width_in,dimensions_depth_in,permalink,api_link,thumb_link,image_link,genes,artist,artist_slug,artist_id
0,4d8b92eb4eb68a1b2c000968,gustav-klimt-der-kuss-the-kiss,Der Kuss (The Kiss),Painting,Oil and gold leaf on canvas,1907-1908,319.933485,False,"[large, larger, medium, medium_rectangle, norm...",70.9000,70.9000,,https://www.artsy.net/artwork/gustav-klimt-der...,https://api.artsy.net/api/artworks/4d8b92eb4eb...,https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa...,https://d32dm0rphc51dk.cloudfront.net/NOpIAwQa...,"[1860–1969, 1900–1917, Abstract versus Figurat...",Gustav Klimt,gustav-klimt,4d8b92b64eb68a1b2c000414
1,4d8b92ee4eb68a1b2c0009ab,francisco-de-goya-the-third-of-may,The Third of May,Painting,Oil on canvas,1814,226.498892,False,"[square, small, large_rectangle, large, tall, ...",104.8000,135.9000,,https://www.artsy.net/artwork/francisco-de-goy...,https://api.artsy.net/api/artworks/4d8b92ee4eb...,https://d32dm0rphc51dk.cloudfront.net/m4X41Fun...,https://d32dm0rphc51dk.cloudfront.net/m4X41Fun...,"[19th Century, Chiaroscuro, Collective History...",Francisco de Goya,francisco-de-goya,4d8b92b44eb68a1b2c0003fe
2,4d8b93394eb68a1b2c0010fa,rembrandt-van-rijn-the-company-of-frans-bannin...,The Company of Frans Banning Cocq and Willem v...,Painting,Oil on canvas,1642,99.584929,False,"[large, large_rectangle, larger, medium, mediu...",143.0000,172.2000,,https://www.artsy.net/artwork/rembrandt-van-ri...,https://api.artsy.net/api/artworks/4d8b93394eb...,https://d32dm0rphc51dk.cloudfront.net/IG8ZLvVm...,https://d32dm0rphc51dk.cloudfront.net/IG8ZLvVm...,"[17th Century, Baroque, Chiaroscuro, Collectiv...",Rembrandt van Rijn,rembrandt-van-rijn,4d8b929c4eb68a1b2c0002e2
3,4d8b937c4eb68a1b2c001722,leonardo-da-vinci-mona-lisa,Mona Lisa,Painting,Oil on poplar,ca. 1503-1506,214.762778,False,"[large, large_rectangle, larger, medium, mediu...",30.3000,20.9000,,https://www.artsy.net/artwork/leonardo-da-vinc...,https://api.artsy.net/api/artworks/4d8b937c4eb...,https://d32dm0rphc51dk.cloudfront.net/5L1xjKC_...,https://d32dm0rphc51dk.cloudfront.net/5L1xjKC_...,"[Eye Contact, Figurative Art, Figurative Paint...",Leonardo da Vinci,leonardo-da-vinci,4d8b92684eb68a1b2c00009e
4,4d8b93b04eb68a1b2c001b9d,edouard-manet-luncheon-on-the-grass-le-dejeune...,Luncheon on the Grass (Le Déjeuner sur l'herbe),Painting,Oil on canvas,1863,241.372358,False,"[large, large_rectangle, larger, medium_rectan...",82.0000,104.0000,,https://www.artsy.net/artwork/edouard-manet-lu...,https://api.artsy.net/api/artworks/4d8b93b04eb...,https://d32dm0rphc51dk.cloudfront.net/zFA7cwdk...,https://d32dm0rphc51dk.cloudfront.net/zFA7cwdk...,"[1860–1969, 19th Century, Cultural Commentary,...",Édouard Manet,edouard-manet,4d8b927e4eb68a1b2c000168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,515ccaaab5907b33b1000526,john-martin-1789-1854-view-on-the-river-wye-lo...,"View on the River Wye, Looking towards Chepstow","Drawing, Collage or other Work on Paper","Watercolor, gouache, and touches of oil paint ...",1844,13.941991,False,"[large_rectangle, larger, medium, medium_recta...",12.0625,25.1875,,https://www.artsy.net/artwork/john-martin-1789...,https://api.artsy.net/api/artworks/515ccaaab59...,https://d32dm0rphc51dk.cloudfront.net/jw283TQr...,https://d32dm0rphc51dk.cloudfront.net/jw283TQr...,[19th Century],John Martin (1789-1854),john-martin-1789-1854,4f0648328501fa10ca000266
9996,515ccab77b7057eb4c0005e9,martin-johann-schmidt-saint-nicholas-rescuing-...,Saint Nicholas Rescuing Sailors,Print,Etching on laid paper,1771,13.152477,False,"[large, large_rectangle, larger, medium, norma...",9.4375,4.9375,,https://www.artsy.net/artwork/martin-johann-sc...,https://api.artsy.net/api/artworks/515ccab77b7...,https://d32dm0rphc51dk.cloudfront.net/5R6wpFU5...,https://d32dm0rphc51dk.cloudfront.net/5R6wpFU5...,[18th Century],Martin Johann Schmidt,martin-johann-schmidt,515b929d1b12b0244a000f65
9997,515ccaba7b7057eb4c0005f0,thomas-cole-sunrise-in-the-catskills,Sunrise in the Catskills,Painting,Oil on canvas,1826,31.297918,False,"[large, larger, medium, medium_rectangle, norm...",25.5000,35.5000,,https://www.artsy.net/artwork/thomas-cole-sunr...,https://api.artsy.net/api/artworks/515ccaba7b7...,https://d32dm0rphc51dk.cloudfront.net/5vzQP-dE...,https://d32dm0rphc51dk.cloudfront.net/5vzQP-dE...,"[19th Century, Americana, Antiquity as Subject...",Thomas Cole,thomas-cole,4eaef729bddaf700010098c1
9998,515ccabd7b70570a130005b6,antoine-jacquard-time,Time,Print,Engraving,,17.783457,False,"[large, large_rectangle, medium, normalized, s...",,,,https://www.artsy.net/artwork/antoine-jacquard...,https://api.artsy.net/api/artworks/515ccabd7b7...,https://d32dm0rphc51dk.cloudfront.net/n6rFOv5I...,https://d32dm0rphc51dk.cloudfront.net/n6rFOv5I...,[],Antoine Jacquard,antoine-jacquard,515b41cd9562c8fee200212d


In [18]:
# Count missing values per column to see which fields the scrape could not fill.
results_df.isna().sum()

id                         0
slug                       0
title                      0
category                   1
medium                     2
date                       0
iconicity                  0
sold                       0
image_versions            44
dimensions_height_in    4904
dimensions_width_in     4991
dimensions_depth_in     9816
permalink                  0
api_link                   0
thumb_link               149
image_link                44
genes                      0
artist                  1006
artist_slug             1006
artist_id               1006
dtype: int64

In [9]:
import pyarrow.feather as feather

In [10]:
# Save the full table as CSV. With to_csv's default index=True the row index is
# written as an extra, unnamed first column.
# NOTE(review): list-valued columns (image_versions, genes) are presumably
# stored as their string representation in the CSV — confirm before re-reading.
results_df.to_csv('artsy_data_csv.csv')

In [11]:
# Write only the id, slug and genes columns to a Feather file.
# NOTE(review): presumably done so the list-valued genes column keeps its list
# type, which a CSV would not preserve — confirm. A NaN in genes (failed gene
# lookup) may not convert cleanly alongside lists; verify if the scrape had any.
feather.write_feather(results_df[['id','slug','genes']],'artsy_genes.feather')