In [1]:
import sqlite3
from requests import get
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import re
import math
from json import loads
from IPython.core.display import clear_output
from PCPartPicker_API import pcpartpicker

In [2]:
#Interface with SQL
def run_query(DB, q):
    with sqlite3.connect(DB) as conn:
        return pd.read_sql(q,conn)

def run_command(DB, c):
    with sqlite3.connect(DB) as conn:
        conn.execute('PRAGMA foreign_keys = ON;')
        conn.isolation_level = None
        conn.execute(c)
        
def run_inserts(DB, c, values):
    with sqlite3.connect(DB) as conn:
        conn.execute('PRAGMA foreign_keys = ON;')
        conn.isolation_level = None
        conn.execute(c, values) 

In [3]:
def scrape_gpu_specs(DB = 'gpudata.db', sleep_min = 5, sleep_max = 15):
    start_time = time.time()
    pages = pcpartpicker.lists.total_pages("video-card")
    
    insert_query = '''
    INSERT OR IGNORE INTO gpu_specs(
        item_id,
        name,
        series,
        chipset,
        memory,
        core_clock,
        ratings
    ) 
    VALUES (?, ?, ?, ?, ?, ?, ?)
    '''
    
    for i in range(1, pages+1):
        gpu_info = pcpartpicker.lists.get_list("video-card", i)

        for gpu in gpu_info:
            item_id = gpu['id']
            name = gpu['name']
            series = gpu['series']
            chipset = gpu['chipset']
            memory = gpu['memory']
            core_clock = gpu['core-clock']
            ratings = gpu['ratings']
            if gpu['price'] == '':
                print('No price found for item_id: {0}'.format(item_id))
                continue
                
            try:
                run_inserts(DB, insert_query,(
                    item_id, name, series, chipset, memory, core_clock, \
                    int(ratings), 
                    )
                )
            except Exception as e:
                print('Failed to add into DB for item_id: {0}, {1}'.format(item_id, e))
                pass
            
        #Provide stats for monitoring
        current_time = time.time()
        elapsed_time = current_time - start_time
        requests = i
        print('-------------------')
        print('Requests Completed: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        print('Elapased Time: {} minutes'.format(elapsed_time/60))
        if requests == pages:
            print('Scrape Complete')
            break
        print('Pausing...')    
        time.sleep(random.uniform(sleep_min, sleep_max))   
        clear_output(wait = True)

In [10]:
def scrape_gpu_prices(DB = 'gpudata.db', prefix = '', history_days = 730, sleep_min = 5, sleep_max = 15):
    start_time = time.time()
    
    insert_query_prices = '''
    INSERT OR IGNORE INTO gpu_prices(
        item_id,
        merchant_id,
        datetime,
        price
    ) 
    VALUES (?, ?, ?, ?)
    '''

    insert_query_merchants = '''
    INSERT OR IGNORE INTO merchants(
        merchant_name
    ) 
    VALUES (?)
    '''
    
    #Pulls merchants table from the database
    select_query_merchants = 'SELECT * FROM merchants'
    temp_merchants_table = run_query(DB, select_query_merchants)
    
    #Pulls item_id table from the table gpu_specs
    pull_ids = 'SELECT item_id FROM gpu_specs'
    item_ids = run_query(DB, pull_ids)['item_id']
    
    for counter, item_id in enumerate(item_ids):
        
        #Makes a connection to the item webpage
        url = 'http://{0}pcpartpicker.com/product/{1}?history_days={2}'.format(prefix, item_id, history_days)
        headers ={"User-Agent": "gpudata web scraper for research, contact me at https://codingdisciple.com"}
        successful_connection = False
        connection_attempts = 0
        while not successful_connection:
            try:
                response = get(url=url, headers=headers)
                print('Connection successful.')
                successful_connection = True
            except:
                print('Connection unsuccessful, reconnecting...')
                connection_attempts += 1
                time.sleep(random.uniform(sleep_min, sleep_max))
                if connection_attempts == 10:
                    raise
                    
        html_soup = BeautifulSoup(response.text, 'html.parser')
        
        #Search for the raw data
        scripts = html_soup.findAll('script')
        for script in scripts:
            if 'phistmulti' in script.text:
                data = script.prettify().split('\n')
                for line in data:
                    if 'phistmulti' in line:
                        idx = line.index('[')
                        price_history = line[idx:-1]
                        price_data = loads(price_history)
                        break
        
        #Extracts price/merchant data
        for merchant in price_data:
            
            #Creates a row in the table 'merchants' if merchant_name doesn't exist
            merchant_name = merchant['label']
            if merchant_name != 'No price history is available for this time period.':
                if merchant_name not in temp_merchants_table['merchant_name'].values:
                    try:
                        run_inserts(DB, insert_query_merchants,[(merchant_name)])
                    except Exception as e:
                        print('Failed to add into DB for {0}, {1}'.format(merchant_name, e))
                        pass
                    
                    #Updates the temp_table
                    temp_merchants_table = run_query(DB, select_query_merchants)
                
            for date_points in merchant['data']:
                datetime = date_points[0]
                price = date_points[1]
                merchant_id = temp_merchants_table[temp_merchants_table['merchant_name'] == merchant_name]['merchant_id'].values[0]

                try:
                    run_inserts(DB, insert_query_prices,(
                        item_id, int(merchant_id), float(datetime/1000.00), float(price/100.00) 
                        )
                    )
                except Exception as e:
                    print('Failed to add into DB for item_id: {0}, datetime: {1}, merchant: {2}, {3}'.format(item_id, datetime, merchant_id, e))
                    pass
                
                
        #Provide stats for monitoring
        current_time = time.time()
        elapsed_time = current_time - start_time
        requests = counter + 1
        print('-------------------')
        print('Requests Completed: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        print('Elapased Time: {} minutes'.format(elapsed_time/60))
        if requests == len(item_ids):
            print('Scrape Complete')
            break
        print('Pausing...')    
        time.sleep(random.uniform(sleep_min, sleep_max))   
        clear_output(wait = True)

In [5]:
def scrape_all():
    scrape_gpu_specs()
    scrape_gpu_prices()

In [11]:
scrape_gpu_prices()

Connection successful.
Failed to add into DB for item_id: zwbkcf, datetime: 1465954772000, merchant: 1, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1492889081000, merchant: 1, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1493665982000, merchant: 1, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1495307839000, merchant: 1, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1510184977000, merchant: 1, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1513297033000, merchant: 1, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515800363000, merchant: 1, unsupported operand type(s) for /: 'NoneType' and 'float'
Fail

Failed to add into DB for item_id: zwbkcf, datetime: 1515206409000, merchant: 3, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515220917000, merchant: 3, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515228115000, merchant: 3, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515238914000, merchant: 3, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515455632000, merchant: 3, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515470030000, merchant: 3, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515484435000, merchant: 3, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for i

Failed to add into DB for item_id: zwbkcf, datetime: 1513262010000, merchant: 9, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1515681120000, merchant: 9, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1520087354000, merchant: 9, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1526190574000, merchant: 9, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1529026772000, merchant: 9, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1465954772000, merchant: 7, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for item_id: zwbkcf, datetime: 1496741575000, merchant: 7, unsupported operand type(s) for /: 'NoneType' and 'float'
Failed to add into DB for i