In [1]:
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import os
import re
from dateutil.parser import parse


In [23]:
TRANSACTION_FILE_PATH = 'logs/transaction.csv'
CERTIFICATION_FILE_PATH = 'logs/certificate.csv'

In [24]:
def get_particular_unique_rows(file_path, index):
    rows = []
    with open(file_path) as csvfile:
        data = csv.reader(csvfile, delimiter=',')
        for row in data:
            if row[index] not in rows:
                rows.append(row[index])
    return rows

In [28]:
existing_certificates_in_transaction = set(get_particular_unique_rows(TRANSACTION_FILE_PATH, 11))
existing_certificates_in_certificates = set(get_particular_unique_rows(CERTIFICATION_FILE_PATH, 0))

new_certificates = existing_certificates_in_transaction - existing_certificates_in_certificates

In [26]:
set(existing_certificates_in_transaction) - existing_certificates_in_certificates

['27306384']

In [31]:
for x in new_certificates:
    print (x)

27306384


In [None]:
topps-magic-photos-all-american-basketball,manhattan,$71.00,633276,7/25/2017,71.00,6,362042965071,eBay,probstein123,Auction,27306384,Most Recent Price,1948 Topps Magic Photos All-American Basketball Manhattan #6B All American Basketball,https://www.psacard.com/auctionprices/basketball-cards/1948-topps-magic-photos-all-american-basketball/manhattan/summary/633276,1,basketball_cards
topps-magic-photos-all-american-basketball,manhattan,$71.00,633276,7/25/2017,71.00,6,362042965071,eBay,probstein123,Auction,27306384,Average Price,1948 Topps Magic Photos All-American Basketball Manhattan #6B All American Basketball,https://www.psacard.com/auctionprices/basketball-cards/1948-topps-magic-photos-all-american-basketball/manhattan/summary/633276,1,basketball_cards

In [16]:
def load_driver(SELENIUM_EXECUTABLE_PATH=r'/mnt/c/Users/adity/Downloads/Chrome/geckodriver-v0.27.0-win64/geckodriver.exe'):
    driver = webdriver.Firefox(executable_path=SELENIUM_EXECUTABLE_PATH)
    return driver

def save_to_csv(data, SAVE_PATH, MODE):
    if not os.path.exists(SAVE_PATH.split('/')[0]):
        os.makedirs(SAVE_PATH.split('/')[0])

    fileWriter = csv.DictWriter(open(SAVE_PATH, MODE), data[0].keys(), delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    fileWriter.writerows(data)
    
def validate_date(date):
    try:
        parse(date)
        return True
    except ValueError:
        return False
    
def format_registry_url(str_msg):
    try:
        url_ref = ''
        if 'href' in str_msg:
            href_subset = str_msg[str_msg.index('href')+6:]
            url_ref = '/'.join(BASE_PATH.split('/')[:-1]) + href_subset[:href_subset.index('">')]
        return url_ref
    except:
        return None
    
def persist_certification_details(certificate_id, SAVE_PATH='logs/certificate.csv'):
    BASE_PATH='https://www.psacard.com/cert'
    url = '/'.join([BASE_PATH, certificate_id])
    driver = load_driver()
    driver.get(url)
    soup=BeautifulSoup(driver.page_source)
    
    certificate_details = soup.find_all("div", attrs={"class": "cert-container"})[0].findAll('tr')[2:]
    certificate_map = {'certificate_number': certificate_id}
    certificate_map['reverse_cert_number'] = certificate_details[0].contents[-1].contents[0]
    certificate_map['year'] = certificate_details[1].contents[-1].contents[0]
    certificate_map['brand'] = certificate_details[2].contents[-1].contents[0]
    certificate_map['sport'] = certificate_details[3].contents[-1].contents[0]
    certificate_map['card_number'] = certificate_details[4].contents[-1].contents[0]
    certificate_map['player'] = certificate_details[5].contents[-1].contents[0]
    certificate_map['variety_or_pedigree'] = certificate_details[6].contents[-1].contents[0]
    certificate_map['grade'] = certificate_details[7].contents[-1].contents[0]
    
    # Adding PSA Auction Prices Realized
    realized_auction_prices = soup.find_all('table', attrs={"class": "apritem-results"})[0].findAll('tr')
    date = realized_auction_prices[0].contents[-1].contents[0]
    if validate_date(date):
        certificate_map['date'] = date
        certificate_map['price'] = realized_auction_prices[1].contents[-1].contents[0]
        certificate_map['auction_house'] = realized_auction_prices[2].contents[-1].contents[0]
        certificate_map['lot_number'] = realized_auction_prices[3].contents[-1].contents[0]
    else:
        certificate_map['date'] = certificate_map['price'] = certificate_map['auction_house'] = certificate_map['lot_number'] = None

    # Adding current PSA registry sets
    registry_sets = soup.find_all('div', attrs={"class": "col-xs-12"})
    certificate_map['registry_set_msg'] = str(soup.find_all('p')[3].contents)
    certificate_map['registry_set_url'] = format_registry_url(certificate_map['registry_set_msg'])
    certificate_map['population'] = registry_sets[0].find('span').contents[0]
    certificate_map['population_w_equal'] = registry_sets[1].find('span').contents[0]
    certificate_map['population_higher'] = registry_sets[2].find('span').contents[0]
    
    print (certificate_map)
    
    # Save to CSV
    save_to_csv([certificate_map], SAVE_PATH, 'a')
    return

In [None]:
certificate_id = '41599722'
BASE_PATH='https://www.psacard.com/cert'
url = '/'.join([BASE_PATH, certificate_id])
driver = load_driver()
driver.get(url)
soup=BeautifulSoup(driver.page_source)

In [17]:
persist_certification_details('41599722')

{'certificate_number': '41599722', 'reverse_cert_number': 'Yes', 'year': '1948', 'brand': 'TOPPS MAGIC PHOTOS ALL-AMERICAN BASKETBALL', 'sport': 'BASKETBALL CARDS', 'card_number': '2B', 'player': 'MURRAY WIER', 'variety_or_pedigree': 'ALL AMERICAN BASKETBALL', 'grade': 'EX 5', 'date': '12/23/2018', 'price': '$10.61', 'auction_house': 'eBay (just_collect)', 'lot_number': <a href="/auctionprices/auction/2200741" target="_blank">382680512913</a>, 'registry_set_msg': "['According to the PSA database, the requested certification number is defined as the following:']", 'registry_set_url': '', 'population': '4', 'population_w_equal': '0', 'population_higher': '2'}


In [None]:
certificate_map

In [None]:
m1 = str(soup.find_all('p')[3].contents)
m1

In [None]:
format_registry_message(m1)

In [None]:
'/'.join(BASE_PATH.split('/')[:-1])