In [1]:
import math
import random
import re
import sqlite3
import time
from contextlib import closing
from json import loads

import pandas as pd
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from requests import get

In [2]:
#Interface with SQL
def run_query(DB, q):
    """Run a SQL query against a SQLite database and return it as a DataFrame.

    Parameters
    ----------
    DB : str
        Path of the SQLite database file.
    q : str
        SQL text handed to ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The result set of ``q``.
    """
    # sqlite3's own context manager only commits/rolls back the transaction;
    # closing() is what actually releases the connection once we are done.
    with closing(sqlite3.connect(DB)) as conn, conn:
        return pd.read_sql(q, conn)

def run_command(DB, c):
    """Execute a single SQL statement (e.g. DDL) with foreign keys enforced.

    Parameters
    ----------
    DB : str
        Path of the SQLite database file.
    c : str
        SQL statement to execute.
    """
    # isolation_level=None puts the connection in autocommit mode, so the
    # statement takes effect immediately. closing() guarantees the connection
    # is released afterwards; the bare sqlite3 context manager never closes it.
    with closing(sqlite3.connect(DB, isolation_level=None)) as conn, conn:
        conn.execute('PRAGMA foreign_keys = ON;')
        conn.execute(c)
        
def run_inserts(DB, c, values):
    """Execute one parameterised SQL statement with foreign keys enforced.

    Parameters
    ----------
    DB : str
        Path of the SQLite database file.
    c : str
        SQL statement containing ``?`` placeholders.
    values : sequence
        Values bound to the placeholders, in order.
    """
    # This helper is called once per row by the scrapers, so the connection
    # must be closed explicitly; sqlite3's context manager alone would leave
    # one open connection behind per call.
    with closing(sqlite3.connect(DB, isolation_level=None)) as conn, conn:
        conn.execute('PRAGMA foreign_keys = ON;')
        conn.execute(c, values)

In [3]:
def scrape_gpu_specs(DB='gpudata.db', min_pause=5, max_pause=10):
    """Scrape every page of the PCPartPicker video-card list into ``gpu_specs``.

    Each listed card that has a price is inserted (``INSERT OR IGNORE``) into
    the ``gpu_specs`` table through ``run_inserts``. Progress statistics are
    printed after every page and the function sleeps a random interval
    between page requests.

    NOTE(review): ``pcpartpicker`` is used here but is not among the
    notebook's imports shown at the top -- it must be imported before this
    function is called.

    Parameters
    ----------
    DB : str, optional
        Path of the SQLite database file.
    min_pause, max_pause : float, optional
        Bounds, in seconds, of the random pause between page requests.
    """
    pages = pcpartpicker.lists.total_pages("video-card")
    start_time = time.time()

    insert_query = '''
    INSERT OR IGNORE INTO gpu_specs(
        item_id,
        name,
        series,
        chipset,
        memory,
        core_clock,
        ratings
    ) 
    VALUES (?, ?, ?, ?, ?, ?, ?)
    '''

    for i in range(1, pages+1):
        gpu_info = pcpartpicker.lists.get_list("video-card", i)

        for gpu in gpu_info:
            # Field access lives inside the try so that one malformed record
            # is reported and skipped instead of aborting the whole scrape.
            item_id = None
            try:
                item_id = gpu['id']
                if gpu['price'] == '':
                    print('No price found for item_id: {0}'.format(item_id))
                    continue

                run_inserts(DB, insert_query, (
                    item_id,
                    gpu['name'],
                    gpu['series'],
                    gpu['chipset'],
                    gpu['memory'],
                    gpu['core-clock'],
                    int(gpu['ratings']),
                ))
            except Exception as e:
                print('Failed to add into DB for item_id: {0}, {1}'.format(item_id, e))

        #Provide stats for monitoring
        elapsed_time = time.time() - start_time
        completed = i
        print('-------------------')
        print('Requests Completed: {}; Frequency: {} requests/s'.format(completed, completed/elapsed_time))
        print('Elapased Time: {} minutes'.format(elapsed_time/60))
        if completed == pages:
            clear_output(wait = True)
            print('Scrape Complete')
            break
        print('Pausing...')
        # Randomised delay between page requests.
        time.sleep(random.uniform(min_pause, max_pause))
        clear_output(wait = True)

In [22]:
# Run the spec scrape defined above; it fetches the video-card list pages and writes rows via run_inserts.
scrape_gpu_specs()

Scrape Complete


In [None]:
# Example product page requested with history_days=730: https://pcpartpicker.com/product/3fvZxr?history_days=730

In [4]:
# Assemble the product-page URL for one sample GPU id (optional host prefix left empty).
gpu_id, prefix = "7xKhP6", ''
url = 'http://{0}pcpartpicker.com/product/{1}?history_days=730'.format(prefix, gpu_id)
url

'http://pcpartpicker.com/product/7xKhP6?history_days=730'

In [5]:
# Fetch the product page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() surfaces HTTP errors here rather than
# as a confusing parse failure in a later cell.
response = get(url=url, timeout=30)
response.raise_for_status()

In [6]:
# Parse the downloaded HTML using the 'html.parser' backend.
html_soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
# Collect every <script> tag on the page for inspection.
# find_all is the current BeautifulSoup spelling of the legacy findAll alias.
scripts = html_soup.find_all('script')
scripts

[<script type="text/javascript">window.NREUM||(NREUM={}),__nr_require=function(e,t,n){function r(n){if(!t[n]){var o=t[n]={exports:{}};e[n][0].call(o.exports,function(t){var o=e[n][1][t];return r(o||t)},o,o.exports)}return t[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var o=0;o<n.length;o++)r(n[o]);return r}({1:[function(e,t,n){function r(){}function o(e,t,n){return function(){return i(e,[f.now()].concat(u(arguments)),t?null:this,n),t?void 0:this}}var i=e("handle"),a=e(2),u=e(3),c=e("ee").get("tracer"),f=e("loader"),s=NREUM;"undefined"==typeof window.newrelic&&(newrelic=s);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],d="api-",l=d+"ixn-";a(p,function(e,t){s[t]=o(d+t,!0,"api")}),s.addPageAction=o(d+"addPageAction",!0),s.setCurrentRouteName=o(d+"routeName",!0),t.exports=newrelic,s.interaction=function(){return(new r).get()};var m=r.prototype={createTracer:function(e,t){var n={},r=this,o="function

In [8]:
# Find the inline <script> that mentions 'phistmulti' and parse the JSON array on
# that line into price_data.
scripts = html_soup.find_all('script')
# Reset first so a missing match cannot silently leave an older price_data in place.
price_data = None
for script in scripts:
    if 'phistmulti' not in script.text:
        continue
    data = script.prettify().split('\n')
    for line in data:
        if 'phistmulti' in line:
            idx = line.index('[')
            # Drop the final character before parsing -- presumably the
            # statement's trailing ';' (TODO confirm against the page source).
            price_history = line[idx:-1]
            price_data = loads(price_history)
            break

if price_data is None:
    raise ValueError("No 'phistmulti' price history found in the page")

In [9]:
# Print each merchant's label and raw data series, framed by separator lines.
for merchant in price_data:
    print('-----', merchant['label'], '-----', merchant['data'], sep='\n')
    

-----
Newegg Marketplace
-----
[[1465884410000, None], [1505253502000, 79999], [1509550280000, 75199], [1510183998000, 75199], [1510183998000, None], [1510349345000, 75199], [1513296009000, 75199], [1513296009000, None], [1513371051000, 75199], [1513810882000, 82017], [1513897171000, 83381], [1513978793000, 79999], [1514064740000, 89472], [1514159244000, 89472], [1514159244000, None], [1516653073000, 129999], [1516742998000, 119900], [1516919642000, 129999], [1516998723000, 139999], [1517088671000, 129900], [1517517355000, 119900], [1517607092000, 138687], [1517693403000, 129900], [1518127643000, 119999], [1518299555000, 114900], [1518560391000, 112999], [1519608158000, 128999], [1519688928000, 128998], [1519863720000, 128797], [1520040971000, 128697], [1520212288000, 114999], [1520300651000, 125487], [1520387109000, 114999], [1520471542000, 114995], [1520722323000, 119995], [1520893726000, 117999], [1521064989000, 119995], [1521411621000, 105900], [1521756502000, 119995], [15237402790

In [23]:
# Take the merchant series at index 1 as a sample to inspect and read its label.
first_business = price_data[1]
merchant_name = first_business['label']
merchant_name

'Newegg Business'

In [150]:
# Load the full merchants table into a DataFrame so merchant names can be checked in memory.
DB = 'gpudata.db'
select_query_merchants = 'SELECT * FROM merchants'
temp_merchants_table = run_query(DB, select_query_merchants)
temp_merchants_table

Unnamed: 0,merchant_id,merchant_name
0,1,Newegg Business


In [147]:
# Insert the sample merchant only when it is not already in the cached merchants table.
# The statement is defined in this cell so it runs on a fresh kernel without
# depending on a definition that appears further down the notebook.
insert_query2 = '''
INSERT OR IGNORE INTO merchants(
        merchant_name
) 
VALUES (?)
'''
if merchant_name not in temp_merchants_table['merchant_name'].values:
    # One-element tuple: a single value bound to the single placeholder.
    run_inserts(DB, insert_query2, (merchant_name,))
else:
    print(1)

1


In [135]:
# Parameterised insert for the merchants table. With OR IGNORE, SQLite skips a row
# that would violate a constraint instead of raising an error.
insert_query2 = '''
INSERT OR IGNORE INTO merchants(
        merchant_name
) 
VALUES (?)
'''

In [136]:
# Show the statement text to double-check it before use.
print(insert_query2)


INSERT OR IGNORE INTO merchants(
        merchant_name
) 
VALUES (?)



In [159]:
# Select the merchant_id values of the rows whose merchant_name matches, then show the first.
merchant_id = temp_merchants_table.loc[
    temp_merchants_table['merchant_name'] == merchant_name,
    'merchant_id',
].values
merchant_id[0]

1

In [42]:
# Dump every [timestamp, price] pair of the sample merchant, each followed by a separator line.
for timepoints in first_business['data']:
    print(timepoints, '-', sep='\n')

[1465707298000, None]
-
[1521696786000, 91498]
-
[1521786797000, 91498]
-
[1521786797000, None]
-
[1524030499000, 95498]
-
[1524117359000, 95498]
-
[1524117359000, None]
-
[1528004706000, 75498]
-
[1528779298000, 75498]
-


In [36]:
# Read the price (element 1) of the second data point (index 1) in the sample merchant's series.
second_price = first_business['data'][1][1]
second_price

91498

In [172]:
def _extract_price_history(html_text):
    """Return the parsed 'phistmulti' price-history list from page HTML, or None if absent."""
    found = None
    for script in BeautifulSoup(html_text, 'html.parser').find_all('script'):
        if 'phistmulti' not in script.text:
            continue
        for line in script.prettify().split('\n'):
            if 'phistmulti' in line:
                # Parse from the first '[' and drop the final character --
                # presumably the statement's trailing ';' (TODO confirm).
                found = loads(line[line.index('['):-1])
                break
    return found


def scrape_gpu_prices(DB='gpudata.db', history_days=730, timeout=30):
    """Scrape the price history of every GPU in ``gpu_specs`` into ``gpu_prices``.

    For each ``item_id`` in ``gpu_specs`` the product page is downloaded, the
    embedded 'phistmulti' series is parsed, unseen merchants are added to
    ``merchants`` and every priced data point is inserted
    (``INSERT OR IGNORE``) into ``gpu_prices``. Progress statistics are
    printed after every item and the function sleeps a random 8-15 seconds
    between requests.

    Parameters
    ----------
    DB : str, optional
        Path of the SQLite database file.
    history_days : int, optional
        Value of the ``history_days`` query parameter sent with each request.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on that item.
    """
    prefix = ''
    no_history_label = 'No price history is available for this time period.'
    start_time = time.time()

    insert_query_prices = '''
    INSERT OR IGNORE INTO gpu_prices(
        item_id,
        merchant_id,
        datetime,
        price
    ) 
    VALUES (?, ?, ?, ?)
    '''

    insert_query_merchants = '''
    INSERT OR IGNORE INTO merchants(
        merchant_name
    ) 
    VALUES (?)
    '''

    #Pulls merchants table from the database
    select_query_merchants = 'SELECT * FROM merchants'
    temp_merchants_table = run_query(DB, select_query_merchants)

    #Pulls item_id table from the table gpu_specs
    pull_ids = 'SELECT item_id FROM gpu_specs'
    item_ids = run_query(DB, pull_ids)['item_id']

    for counter, item_id in enumerate(item_ids):
        url = 'http://{0}pcpartpicker.com/product/{1}?history_days={2}'.format(prefix, item_id, history_days)

        # Start every item from a clean slate so a failed download or a page
        # without price data can never reuse the previous item's series.
        price_data = None
        try:
            response = get(url=url, timeout=timeout)
            response.raise_for_status()
            price_data = _extract_price_history(response.text)
        except Exception as e:
            print('Failed to fetch price history for item_id: {0}, {1}'.format(item_id, e))
        else:
            if price_data is None:
                print('No price history found for item_id: {0}'.format(item_id))

        #Extracts price/merchant data
        for merchant in (price_data or []):
            merchant_name = merchant['label']
            # Placeholder series emitted when the page has no history; it is
            # not a real merchant, so there is nothing to store.
            if merchant_name == no_history_label:
                continue

            #Creates a row in the table 'merchants' if merchant_name doesn't exist
            if merchant_name not in temp_merchants_table['merchant_name'].values:
                try:
                    run_inserts(DB, insert_query_merchants, (merchant_name,))
                except Exception as e:
                    print('Failed to add into DB for {0}, {1}'.format(merchant_name, e))

                #Updates the temp_table
                temp_merchants_table = run_query(DB, select_query_merchants)

            # Resolve the merchant id once per merchant rather than per data point.
            id_matches = temp_merchants_table.loc[
                temp_merchants_table['merchant_name'] == merchant_name, 'merchant_id'
            ].values
            if len(id_matches) == 0:
                print('No merchant_id available for {0}; skipping its prices'.format(merchant_name))
                continue
            merchant_id = int(id_matches[0])

            for date_points in merchant['data']:
                timestamp = date_points[0]
                price = date_points[1]
                # Points with a None price carry no value to store (the
                # division below would raise TypeError), so skip them.
                if price is None:
                    continue

                try:
                    # Raw values look like epoch milliseconds and integer
                    # cents in the sample output -- presumably the reason for
                    # the /1000 and /100 scaling.
                    run_inserts(DB, insert_query_prices, (
                        item_id, merchant_id, float(timestamp/1000.00), float(price/100.00)
                    ))
                except Exception as e:
                    print('Failed to add into DB for item_id: {0}, datetime: {1}, merchant: {2}, {3}'.format(item_id, timestamp, merchant_id, e))

        #Provide stats for monitoring
        elapsed_time = time.time() - start_time
        completed = counter + 1
        print('-------------------')
        print('Requests Completed: {}; Frequency: {} requests/s'.format(completed, completed/elapsed_time))
        print('Elapased Time: {} minutes'.format(elapsed_time/60))
        if completed == len(item_ids):
            clear_output(wait = True)
            print('Scrape Complete')
            break
        print('Pausing...')
        time.sleep(random.uniform(8, 15))
        clear_output(wait = True)

In [173]:
# Run the price scrape defined above; it requests one product page per item_id in gpu_specs.
scrape_gpu_prices()

Scrape Complete


In [None]:
# Request one product page with an identifying User-Agent header and parse it.
url = 'http://{0}pcpartpicker.com/product/{1}?history_days={2}'.format('', 'J3tWGX', 730)
headers ={"User-Agent": "gpudata web scraper for research, contact me at https://codingdisciple.com"}
# The timeout avoids hanging forever on a stalled connection; raise_for_status()
# fails here on an HTTP error instead of parsing an error page.
response = get(url=url, headers=headers, timeout=30)
response.raise_for_status()
html_soup = BeautifulSoup(response.text, 'html.parser')

In [1]:
# Locate the first <div> whose class attribute is 'specs block'.
# NOTE(review): a multi-word class_ string is matched against the exact attribute value,
# so this relies on the classes appearing in exactly this order -- confirm against the page.
# NOTE(review): the saved NameError output suggests this cell was last run on a fresh
# kernel, before the cell that builds html_soup.
specs_block = html_soup.find('div', class_='specs block')

NameError: name 'html_soup' is not defined

In [None]:
# List every <h4> element inside the specs block.
specs_block.find_all('h4')