In [1]:
import api_keys
import requests
import time as time
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Read the Artsy API credentials from the local `api_keys` module.
client_id, client_secret = api_keys.client_id, api_keys.client_secret

In [3]:
class ArtsyScraper:
    """Collect artwork metadata from the Artsy API into a list of dicts.

    On construction the scraper exchanges the client credentials for an
    application token. `scrape_api()` then walks the paginated `/artworks/`
    endpoint and stores one flat dict per artwork in `self.results`.

    Attributes:
        api_url: Base URL of the API.
        token: Application token sent as the `X-Xapp-Token` header.
        results: Rows collected by the most recent crawl (list of dicts).
        artist_slugs: Unique artist slugs, populated by `make_slugs()`.
    """

    # Seconds to wait for a single HTTP response before the request fails.
    REQUEST_TIMEOUT = 30
    # Consecutive failed page fetches tolerated before the crawl gives up.
    MAX_PAGE_RETRIES = 3
    # Top-level artwork fields copied unchanged into every result row.
    KEYS_TO_KEEP = ('id', 'slug', 'title', 'category', 'medium', 'date',
                    'iconicity', 'sold', 'image_versions')

    def __init__(self, client_id, client_secret):
        """Authenticate against the API and remember the application token.

        Args:
            client_id: API client id.
            client_secret: API client secret.

        Raises:
            requests.HTTPError: if the token endpoint answers with an error status.
            KeyError: if the token response carries no 'token' field.
        """
        auth_token_uri = 'https://api.artsy.net/api/tokens/xapp_token'

        self.api_url = 'https://api.artsy.net/api'
        self.results = []

        response = requests.post(
            auth_token_uri,
            params={'client_id': client_id, 'client_secret': client_secret},
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on rejected credentials instead of with a bare KeyError below.
        response.raise_for_status()
        self.token = response.json()['token']

    def scrape_api(self, max_works=10000, time_split=.5):
        """Run the artwork crawl; rows end up in `self.results`.

        Args:
            max_works: Upper bound on the number of artworks collected.
            time_split: Seconds to sleep between requests.
        """
        self._api_scraper(max_works=max_works, time_split=time_split)

    def scrape_auctions(self):
        """Store the output of `_auction_scraper()` in `self.auction_results`."""
        # NOTE(review): `_auction_scraper` is not defined in this class as shown,
        # so this raises AttributeError unless it is supplied elsewhere - confirm.
        self.auction_results = self._auction_scraper()

    def make_slugs(self):
        """Store the unique artist slugs of the scraped works in `self.artist_slugs`.

        Uses `self.results_df` when a caller has attached one; otherwise the
        frame is built from `self.results`.
        """
        results_df = getattr(self, 'results_df', None)
        if results_df is None:
            results_df = pd.DataFrame(self.results)
        self.artist_slugs = results_df['artist_slug'].unique()

    def _get_json(self, url, params=None):
        """GET `url` with the application token and return the decoded JSON body."""
        response = requests.get(
            url,
            headers={'X-Xapp-Token': self.token},
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _dig(mapping, *path):
        """Follow `path` through nested containers; return NaN when any step is missing."""
        node = mapping
        try:
            for key in path:
                node = node[key]
        except (KeyError, IndexError, TypeError):
            return float('nan')
        return node

    def _fetch_genes(self, url, time_split):
        """Return the gene names from every page of the gene listing starting at `url`."""
        names = []
        while url:
            payload = self._get_json(url)
            names.extend(gene['name'] for gene in payload['_embedded']['genes'])
            try:
                url = payload['_links']['next']['href']
            except (KeyError, TypeError):
                break  # no further page
            time.sleep(time_split)
        return names

    def _build_record(self, artwork, time_split):
        """Flatten one artwork payload (plus its genes and first artist) into a dict."""
        nan = float('nan')
        record = {key: value for key, value in artwork.items() if key in self.KEYS_TO_KEEP}

        for axis in ('height', 'width', 'depth'):
            record[f'dimensions_{axis}_in'] = self._dig(artwork, 'dimensions', 'in', axis)

        link_columns = (
            ('permalink', 'permalink'),
            ('api_link', 'self'),
            ('thumb_link', 'thumbnail'),
            ('image_link', 'image'),
        )
        for column, link_name in link_columns:
            record[column] = self._dig(artwork, '_links', link_name, 'href')

        # Genes and artist need follow-up requests; both are best effort, so any
        # failure degrades to NaN. `Exception` (not a bare except) keeps Ctrl-C working.
        try:
            record['genes'] = self._fetch_genes(artwork['_links']['genes']['href'], time_split)
        except Exception:
            record['genes'] = nan

        try:
            artist = self._get_json(
                artwork['_links']['artists']['href'], params={'total_count': 1}
            )['_embedded']['artists'][0]
            artist_fields = (artist['name'], artist['slug'], artist['id'])
        except Exception:
            artist_fields = (nan, nan, nan)
        record['artist'], record['artist_slug'], record['artist_id'] = artist_fields

        return record

    def _api_scraper(self, max_works=10000, time_split=.5):
        """Walk the paginated artworks endpoint and collect one row per artwork.

        `self.results` is reset at the start and appended to as the crawl goes,
        so partial results survive an interruption.

        Args:
            max_works: Stop once this many artworks have been collected.
            time_split: Seconds to sleep between requests.

        Returns:
            The list stored in `self.results`.
        """
        self.results = []
        endpoint = self.api_url + '/artworks/'
        count = 0
        bad = 0
        failures_in_a_row = 0

        while endpoint and count < max_works:
            try:
                page = self._get_json(endpoint)
            except Exception:
                # Without a response there is no "next" link to follow, so retry
                # the same page a bounded number of times rather than forever.
                bad += 1
                failures_in_a_row += 1
                if failures_in_a_row >= self.MAX_PAGE_RETRIES:
                    break
                time.sleep(time_split)
                continue
            failures_in_a_row = 0

            artworks = self._dig(page, '_embedded', 'artworks')
            if not isinstance(artworks, list):
                bad += 1
                artworks = []

            for artwork in artworks:
                if count >= max_works:
                    break
                try:
                    record = self._build_record(artwork, time_split)
                except Exception:
                    bad += 1
                    continue

                self.results.append(record)
                print(f'{count+1} works collected. {bad} bad scrapes', end='\r')
                count += 1

                time.sleep(time_split)

            next_link = self._dig(page, '_links', 'next', 'href')
            endpoint = next_link if isinstance(next_link, str) else None

        return self.results
        
    


In [4]:
# Build the scraper; the constructor requests an API token right away.
scraper = ArtsyScraper(client_id=client_id, client_secret=client_secret)

In [None]:
# Run the artwork crawl; collected rows are stored on `scraper.results`.
# Every artwork triggers follow-up requests and sleeps between them, so this
# cell can take a long time to finish.
scraper.scrape_api()

In [None]:
# One row per scraped artwork, one column per collected field.
results_df = pd.DataFrame(data=scraper.results)

In [None]:
# Distinct artist slugs found among the scraped artworks.
results_df['artist_slug'].unique()

In [None]:
# Display the scraped table as this cell's output.
results_df

In [None]:
import pyarrow.feather as feather

In [None]:
# Save the complete scraped table as CSV in the working directory.
results_df.to_csv(path_or_buf='artsy_data_csv.csv')

In [None]:
# Save only the identifier columns and the gene lists in Feather format.
feather.write_feather(
    results_df.loc[:, ['id', 'slug', 'genes']],
    'artsy_genes.feather',
)