In [1]:
#Import packages
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint
import os
from selenium.common.exceptions import TimeoutException

In [2]:
#Reference the current working directory
cwd = os.getcwd()

## Create functions

In [3]:
def page_scroller(browser_driver, num_scrolls, wait_time):
    for i in range(num_scrolls):
        browser_driver.execute_script("window.scrollBy(0, 2000);")
        sleep(wait_time)

In [4]:
def get_product_details(browser_driver, product_url_list):
    #Create a list for dictionaries
    prods = []
    sales = []
    error_prods = []
    
    print(str(len(product_url_list))+' products')
    for url in product_url_list:
        #Random sleep time
        sleep(randint(3,6))
        
        #Create an empty dictionary of results
        prod_details = {}
        
        
        #Add the url to the dictionary
        prod_details['url'] = url
        
        #Go to the item page
        print(url)
        
        try:
            driver.get(url = url)
        except TimeoutException:
            prod_details['status'] = 'Could not retrieve product information'
            error_prods.append(url)
        else:
            sleep(3)
            #Get the html from the link
            soup = BeautifulSoup(driver.page_source, 'html.parser')


            #Get product details and add to dictionary
            #Get the name
            product_name = soup.find('h1', class_ = 'name').text
            prod_details['name'] = product_name
            #print(product_name)

            #Get the other details
            #Some products have a product details column, others have a product details row
            if soup.find('div', class_ = 'product-details detail-column') != None:
                prod_info_section = soup.find('div', class_ = 'product-details detail-column').find_all('div', class_ = 'detail')
            elif soup.find('div', class_ = 'product-details detail-row') != None:    
                prod_info_section = soup.find('div', class_ = 'product-details detail-row').find_all('div', class_ = 'detail')
            else:
                prod_details['status'] = 'Could not retrieve product information'

            #Loop thorugh all of the product details in that section of the page
            for info_item in prod_info_section:
                info_label = info_item.find('span', class_ = 'title').text
                info_value = info_item.find('span', class_ = False).text

                #Add to dictionary
                prod_details[info_label] = info_value

            prods.append(prod_details)





            ######## GET HISTORIC SALES #######
            #Find the view all sales button using the link text
            driver.find_element_by_link_text("View All Sales").click()
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            historic_sales_head = soup.find('div', class_ = "latest-sales-container").thead
            historic_sales_body = soup.find('div', class_ = "latest-sales-container").tbody

            #Get the headings for the sales section
            if historic_sales_body is not None:
                all_header = historic_sales_head.find_all('th')
                all_sales = historic_sales_body.find_all('tr')

                #Make sure atleast one sale has occured

                #Do this for every sale in the sales
                for sale in all_sales:
                    #Get a sales record
                    prod_sales = {}
                    prod_sales['url'] = item_link
                    all_variables = sale.find_all('td')

                    for header, value in zip(all_header, all_variables):
                        prod_sales[header.text] = value.text
                    #Append to list    
                    sales.append(prod_sales)    


    
    #Return the list of results
    return prods, sales, error_prods    

## Login
Logging in is required to get unrestricted access to historic sales

In [5]:
#Instantiate a firefix session - Requires geckodriver in the current working directory
driver = webdriver.Safari()
sleep(2)

In [6]:
#Load credentials from .py file
# from credentials import username, password

In [7]:
pw = open('passwords.txt').read().splitlines()

In [8]:
username = pw[0]
password = pw[1]

In [9]:
#Navigate to login page
driver.get(url = "https://stockx.com/login")

In [10]:
from selenium.webdriver.common.keys import Keys
email = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "email-login")))
email.send_keys(username)
key = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "password-login")))
key.send_keys(password)
key.send_keys(Keys.RETURN)

## Get products from a search
This section can be bypassed if you have list of product urls you want to retrieve information from


In [11]:
#Replace search_term with whatever you're looking for
#Spaces need to be replaced with %20
search_term = "yeezy 350"
search_term_converted = search_term.replace(' ', '%20')

In [12]:
#Search
driver.get(url = "https://stockx.com/search?s="+search_term_converted)

In [13]:
#Scroll down to load all items - Use a large number if you expect many results
page_scroller(driver, 10, 3)

In [14]:
#Get the html from the page
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [15]:
#Get the results grid
search_result_grid=  soup.find('div', class_ = "browse-grid")

In [16]:
#Get the results tiles
search_result_tiles = search_result_grid.find_all('div', class_ = "tile css-1bonzt1 e1yt6rrx0")

In [17]:
#Get urls for each tile
search_urls = []

for tile in search_result_tiles:
    link = tile.find('a')
    item_link_href = link['href']
    item_link = 'https://stockx.com/'+ item_link_href
    
    search_urls.append(item_link)

## Get the details from the products

If you already have a list of urls you want to get information for, just create a list and use it in the function below

In [18]:
prod_info_list, sales_info_list, not_found = get_product_details(driver, search_urls)

40 products
https://stockx.com//adidas-yeezy-boost-350-v2-asriel
https://stockx.com//adidas-yeezy-boost-350-v2-zyon
https://stockx.com//adidas-yeezy-boost-350-v2-cinder
https://stockx.com//adidas-yeezy-boost-350-v2-desert-sage
https://stockx.com//adidas-yeezy-boost-350-v2-white-core-black-red
https://stockx.com//adidas-yeezy-boost-350-v2-yecheil
https://stockx.com//adidas-yeezy-boost-350-v2-earth
https://stockx.com//adidas-yeezy-boost-350-v2-yeezreel
https://stockx.com//adidas-yeezy-boost-350-v2-tail-light
https://stockx.com//adidas-yeezy-boost-350-v2-cream-white
https://stockx.com//adidas-yeezy-boost-350-v2-citrin
https://stockx.com//adidas-yeezy-boost-350-v2-cloud-white
https://stockx.com//adidas-yeezy-boost-350-v2-linen
https://stockx.com//adidas-yeezy-boost-350-v2-sesame
https://stockx.com//adidas-yeezy-boost-350-v2-semi-frozen-yellow
https://stockx.com//adidas-yeezy-boost-350-v2-sulfur
https://stockx.com//adidas-yeezy-boost-350-v2-clay
https://stockx.com//adidas-yeezy-boost-350-v2

In [19]:
#Create dataframes
prods_df =  pd.DataFrame.from_records(prod_info_list)
sales_df =  pd.DataFrame.from_records(sales_info_list)

In [20]:
prods_df.head()

Unnamed: 0,url,name,Style,Colorway,Retail Price,Release Date
0,https://stockx.com//adidas-yeezy-boost-350-v2-...,adidas Yeezy Boost 350 V2 Carbon,FZ5000,Carbon/Carbon/Carbon,$220,10/02/2020
1,https://stockx.com//adidas-yeezy-boost-350-v2-...,adidas Yeezy Boost 350 V2 Zyon,FZ1267,Zyon/Zyon/Zyon,$220,07/18/2020
2,https://stockx.com//adidas-yeezy-boost-350-v2-...,adidas Yeezy Boost 350 V2 Cinder,FY2903,Cinder/Cinder/Cinder,$220,03/21/2020
3,https://stockx.com//adidas-yeezy-boost-350-v2-...,adidas Yeezy Boost 350 V2 Desert Sage,FX9035,Desert Sage/Desert Sage/Desert Sage,$220,03/14/2020
4,https://stockx.com//adidas-yeezy-boost-350-v2-...,adidas Yeezy Boost 350 V2 Zebra,CP9654,,$220,02/25/2017


In [21]:
sales_df.head()

Unnamed: 0,url,Size,Sale Price,Date,Time
0,https://stockx.com//adidas-yeezy-boost-350-v2-...,6.0,$293,"Monday, October 26, 2020",3:29 PM EST
1,https://stockx.com//adidas-yeezy-boost-350-v2-...,12.0,$251,"Monday, October 26, 2020",3:29 PM EST
2,https://stockx.com//adidas-yeezy-boost-350-v2-...,8.5,$286,"Monday, October 26, 2020",3:29 PM EST
3,https://stockx.com//adidas-yeezy-boost-350-v2-...,8.0,$262,"Monday, October 26, 2020",3:21 PM EST
4,https://stockx.com//adidas-yeezy-boost-350-v2-...,6.5,$292,"Monday, October 26, 2020",3:21 PM EST


In [22]:
prods_df.to_excel("prods.xlsx")
sales_df.to_excel(os.getcwd() + '/sales.xlsx')

In [23]:
os.getcwd()

'/Users/fernandocueva/Desktop/sneakerTimeSeries/stockx_scraper-master'

In [24]:
fernando = prods_df

In [25]:
fernando.to_csv('fernando.csv', index=False)