In [1]:
import csv
import logging
import random
import time
from contextlib import contextmanager

from shop_crawler import *
from selenium_helper import *
import common_actors

### Prepare the URL sets to analyze

In [2]:
# All urls: the first column of every non-empty row in the sample CSV
all_urls = []
# newline='' is what the csv module asks for when it is handed a file object
with open('../resources/pvio_vio_us_ca_uk_sample1.csv', 'r', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields [] for a blank line, so check the row before indexing it
        if row and row[0]:
            all_urls.append(row[0])

# Random sample urls (seeded, so the same sample is drawn on every run)
SAMPLE_SIZE = 100
random.seed(1)
# Cap the sample at the population size: random.sample raises ValueError otherwise
sample_urls = random.sample(all_urls, min(SAMPLE_SIZE, len(all_urls)))

# Some good urls to analyze by hand
good_urls = [
    'curlebotanicals.com',
    'theglamourshop.com',
    'vape-fuel.com',
    'firstfitness.com',
    'sandlakedermatology.com',
    'getwaave.com',
    'dixieems.com',
    'jonessurgical.com',
    'srandd.com',
    'ambarygardens.com',
    'anabolicwarfare.com'
]


### Mock user and payment information

In [5]:
# Mock shipping/contact details handed to the crawler (not a real person)
user_info = UserInfo(
    first_name = 'John',
    last_name = 'Smith',
    country = 'United States',
    home = 34,
    street = 'Ocean drive',
    city = 'Miami',
    zip = '33125',
    # was misspelled 'FLorida'
    state = 'Florida',

    phone = '1231232',
    email = 'john@service.com'
)

# Mock card details handed to the crawler (not a real card)
billing_info = PaymentInfo(
    card_number = '1413232312312321',
    expire_date_year = 2020,
    expire_date_month = 12,
    card_name = 'Visa Card',
    card_type = 'Visa',
    cvc = '123'
)

In [11]:
# Location of the chromedriver binary passed to the crawler and driver factories
selenium_path = '/usr/bin/chromedriver'

@contextmanager
def get_crawler(headless=True):
    """Yield a ShopCrawler configured with the notebook's mock user and card data.

    The crawler is built from the module-level ``user_info``, ``billing_info``
    and ``selenium_path`` and is passed through
    ``common_actors.add_crawler_extensions`` before being handed to the caller.

    Args:
        headless: forwarded unchanged to ``ShopCrawler`` as ``headless``.

    Yields:
        The ``ShopCrawler`` instance.
    """
    # The module-level names are only read here, so no ``global`` statement is
    # needed (the previous one also misspelled ``selenium_path``).
    crawler = ShopCrawler(user_info, billing_info, selenium_path, headless=headless)
    common_actors.add_crawler_extensions(crawler)

    # NOTE(review): nothing runs after the yield, so leaving the ``with`` block
    # performs no teardown here - confirm whether ShopCrawler needs an explicit one.
    yield crawler

def get_driver(headless=True):
    """Return the result of ``create_chrome_driver`` for the configured chromedriver path.

    Args:
        headless: forwarded unchanged to ``create_chrome_driver`` as ``headless``.
    """
    driver = create_chrome_driver(selenium_path, headless=headless)
    return driver


### Set up logging level

In [12]:
logger = logging.getLogger('shop_crawler')
logger.setLevel(logging.DEBUG)

# Running this cell more than once must not stack another handler on the
# logger, otherwise every record is printed once per run of the cell.
# Handlers created here are tagged so that only they are replaced.
for stale_handler in list(logger.handlers):
    if getattr(stale_handler, 'notebook_owned', False):
        logger.removeHandler(stale_handler)

handler = logging.StreamHandler()
handler.notebook_owned = True
formatter = logging.Formatter(
        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

### Start crawling for every url

In [13]:
# Crawl every sampled url in a visible (non-headless) browser and keep each
# returned status so the cells below can summarise them.
results = []
with get_crawler(headless=False) as crawler:
    for url in sample_urls:
        # two blank lines, then the url, as a separator between sites
        print('\n\n' + url)
        # NOTE(review): 30 is passed positionally; presumably a time limit -
        # confirm against ShopCrawler.crawl
        status = crawler.crawl(url, 30)
        print(status)
        results.append(status)




vaporworld.biz


2018-07-15 03:48:34,491 shop_crawler INFO     processing state: new
2018-07-15 03:48:34,491 shop_crawler INFO     processing state: new
2018-07-15 03:48:34,498 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:48:34,498 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:48:34,626 shop_crawler INFO     new_state new, url http://vaporworld.biz/
2018-07-15 03:48:34,626 shop_crawler INFO     new_state new, url http://vaporworld.biz/
2018-07-15 03:48:34,632 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:48:34,632 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:49:05,126 shop_crawler INFO     new_state product_page, url https://www.vaporsupply.com/categories.php
2018-07-15 03:49:05,126 shop_crawler INFO     new_state product_page, url https://www.vaporsupply.com/categories.php
2018-07-15 0

2018-07-15 03:51:02,873 shop_crawler INFO     new_state checkout_page, url https://www.vaporsupply.com/login.php
2018-07-15 03:51:02,958 shop_crawler INFO     processing state: checkout_page
2018-07-15 03:51:02,958 shop_crawler INFO     processing state: checkout_page
2018-07-15 03:51:02,961 shop_crawler INFO     handler <common_actors.PaymentFields object at 0x7fddce0b2e10>
2018-07-15 03:51:02,961 shop_crawler INFO     handler <common_actors.PaymentFields object at 0x7fddce0b2e10>
2018-07-15 03:51:03,021 shop_crawler DEBUG    Not found select options!
2018-07-15 03:51:03,021 shop_crawler DEBUG    Not found select options!
2018-07-15 03:51:03,062 shop_crawler INFO     new_state checkout_page, url https://www.vaporsupply.com/login.php
2018-07-15 03:51:03,062 shop_crawler INFO     new_state checkout_page, url https://www.vaporsupply.com/login.php


Status: "Processing Finished at State" after processing url "vaporworld.biz"


cvsciences.com


2018-07-15 03:51:14,132 shop_crawler INFO     processing state: new
2018-07-15 03:51:14,132 shop_crawler INFO     processing state: new
2018-07-15 03:51:14,136 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:51:14,136 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:51:15,862 shop_crawler INFO     new_state new, url https://cvsciences.com/
2018-07-15 03:51:15,862 shop_crawler INFO     new_state new, url https://cvsciences.com/
2018-07-15 03:51:15,865 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:51:15,865 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:51:20,428 shop_crawler INFO     new_state new, url https://cvsciences.com/
2018-07-15 03:51:20,428 shop_crawler INFO     new_state new, url https://cvsciences.com/
2018-07-15 03:51:20,432 shop_crawler INFO     handler <common_acto

Status: "Processing Finished at State" after processing url "cvsciences.com"


walkinmycloset.com
Status: "Not Available" after processing url "http://walkinmycloset.com"
 Domain walkinmycloset.com for sale


blackrosevapes.com
Status: "Time Out" after processing url "http://blackrosevapes.com"


fourriversbirdrescue.org
Status: "Time Out" after processing url "http://fourriversbirdrescue.org"


trelexa.org
Status: "Time Out" after processing url "http://trelexa.org"


webstersoccer.com


2018-07-15 03:55:14,885 shop_crawler INFO     processing state: new
2018-07-15 03:55:14,885 shop_crawler INFO     processing state: new
2018-07-15 03:55:14,895 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:55:14,895 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:55:22,798 shop_crawler DEBUG    Exception during checking element Traceback (most recent call last):
  File "/home/jack/working_directory/David/trace_automation/crawling/selenium_helper.py", line 29, in can_click
    return element.is_enabled() and element.is_displayed()
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 159, in is_enabled
    return self._execute(Command.IS_ELEMENT_ENABLED)['value']
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 628, in

2018-07-15 03:55:34,459 shop_crawler DEBUG    Exception during checking element Traceback (most recent call last):
  File "/home/jack/working_directory/David/trace_automation/crawling/selenium_helper.py", line 29, in can_click
    return element.is_enabled() and element.is_displayed()
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 159, in is_enabled
    return self._execute(Command.IS_ELEMENT_ENABLED)['value']
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 628, in _execute
    return self._parent.execute(command, params)
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 320, in execute
    self.error_handler.check_response(response)
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-

2018-07-15 03:55:41,993 shop_crawler DEBUG    Exception during checking element Traceback (most recent call last):
  File "/home/jack/working_directory/David/trace_automation/crawling/selenium_helper.py", line 29, in can_click
    return element.is_enabled() and element.is_displayed()
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 159, in is_enabled
    return self._execute(Command.IS_ELEMENT_ENABLED)['value']
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 628, in _execute
    return self._parent.execute(command, params)
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 320, in execute
    self.error_handler.check_response(response)
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-

Status: "Processing Finished at State" after processing url "webstersoccer.com"


tampahumidor.com


2018-07-15 03:56:03,401 shop_crawler INFO     processing state: new
2018-07-15 03:56:03,401 shop_crawler INFO     processing state: new
2018-07-15 03:56:03,431 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:56:03,431 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:56:08,778 shop_crawler INFO     new_state new, url https://www.seriouscigars.com/info/Tampa-Humidor?utm_source=tampahumidor/
2018-07-15 03:56:08,778 shop_crawler INFO     new_state new, url https://www.seriouscigars.com/info/Tampa-Humidor?utm_source=tampahumidor/
2018-07-15 03:56:08,784 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:56:08,784 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:56:16,144 shop_crawler INFO     new_state product_page, url https://www.seriouscigars.com/info/Tampa-Humidor?utm_source=tampahumid

Status: "Processing Finished at State" after processing url "tampahumidor.com"


gotbloody.com
Status: "Time Out" after processing url "http://gotbloody.com"


personalcarenow.com


2018-07-15 03:57:31,399 shop_crawler INFO     processing state: new
2018-07-15 03:57:31,399 shop_crawler INFO     processing state: new
2018-07-15 03:57:31,402 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:57:31,402 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:57:34,053 shop_crawler INFO     new_state new, url http://personalcarenow.com/
2018-07-15 03:57:34,053 shop_crawler INFO     new_state new, url http://personalcarenow.com/
2018-07-15 03:57:34,062 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:57:34,062 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:57:39,732 shop_crawler INFO     new_state product_page, url http://www.personalcarenow.com/index.php?main_page=product_info&cPath=27&products_id=128&zenid=2a7157c1342238c5113114e4a46a3bab
2018-07-15 03:57:39,732 shop_crawl

Status: "Processing Finished at State" after processing url "personalcarenow.com"


ladyfatemahtrust.org
Status: "Not Available" after processing url "http://ladyfatemahtrust.org"


the420line.com


2018-07-15 03:58:27,719 shop_crawler INFO     processing state: new
2018-07-15 03:58:27,719 shop_crawler INFO     processing state: new
2018-07-15 03:58:27,722 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:58:27,722 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:58:31,947 shop_crawler DEBUG    clicking element: <a href="/?add-to-cart=272" data-quantity="1" class="button product_type_simple add_to_cart_button ajax_add_to_cart" data-product_id="272" data-product_sku="LBV420SC-1" aria-label="Add “The &quot;420&quot; Line - Shop Startup Kit” to your cart" rel="nofollow">Add to cart</a>
2018-07-15 03:58:31,947 shop_crawler DEBUG    clicking element: <a href="/?add-to-cart=272" data-quantity="1" class="button product_type_simple add_to_cart_button ajax_add_to_cart" data-product_id="272" data-product_sku="LBV420SC-1" aria-label="Add “The &quot;420&quot; Line - Shop Startup Kit” to your cart" rel="

2018-07-15 03:59:47,013 shop_crawler DEBUG    Exception during checking element Traceback (most recent call last):
  File "/home/jack/working_directory/David/trace_automation/crawling/selenium_helper.py", line 29, in can_click
    return element.is_enabled() and element.is_displayed()
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 159, in is_enabled
    return self._execute(Command.IS_ELEMENT_ENABLED)['value']
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webelement.py", line 628, in _execute
    return self._parent.execute(command, params)
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 320, in execute
    self.error_handler.check_response(response)
  File "/home/jack/working_directory/David/trace_automation/venv/lib/python3.5/site-

Status: "Processing Finished at State" after processing url "the420line.com"


allsquared.com


2018-07-15 03:59:56,486 shop_crawler INFO     processing state: new
2018-07-15 03:59:56,486 shop_crawler INFO     processing state: new
2018-07-15 03:59:56,491 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:59:56,491 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 03:59:57,718 shop_crawler INFO     new_state new, url http://allsquared.com/
2018-07-15 03:59:57,718 shop_crawler INFO     new_state new, url http://allsquared.com/
2018-07-15 03:59:57,720 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 03:59:57,720 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 04:00:01,310 shop_crawler INFO     new_state new, url http://allsquared.com/
2018-07-15 04:00:01,310 shop_crawler INFO     new_state new, url http://allsquared.com/
2018-07-15 04:00:01,313 shop_crawler INFO     handler <common_actors.T

Status: "Processing Finished at State" after processing url "allsquared.com"


artistsforhumanrights.org


2018-07-15 04:00:16,650 shop_crawler INFO     processing state: new
2018-07-15 04:00:16,650 shop_crawler INFO     processing state: new
2018-07-15 04:00:16,663 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 04:00:16,663 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 04:00:19,857 shop_crawler INFO     new_state new, url http://artistsforhumanrights.org/
2018-07-15 04:00:19,857 shop_crawler INFO     new_state new, url http://artistsforhumanrights.org/
2018-07-15 04:00:19,860 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 04:00:19,860 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 04:00:23,272 shop_crawler INFO     new_state new, url http://artistsforhumanrights.org/
2018-07-15 04:00:23,272 shop_crawler INFO     new_state new, url http://artistsforhumanrights.org/
2018-07-15 04:00:23,281 sh

Status: "Processing Finished at State" after processing url "artistsforhumanrights.org"


streaming.co.uk


2018-07-15 04:00:56,833 shop_crawler INFO     found 4 frames
2018-07-15 04:00:56,833 shop_crawler INFO     found 4 frames
2018-07-15 04:00:56,836 shop_crawler INFO     processing state: new
2018-07-15 04:00:56,836 shop_crawler INFO     processing state: new
2018-07-15 04:00:56,839 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 04:00:56,839 shop_crawler INFO     handler <common_actors.AddToCart object at 0x7fddce0b2cc0>
2018-07-15 04:01:04,413 shop_crawler INFO     new_state new, url https://www.medial.com/
2018-07-15 04:01:04,413 shop_crawler INFO     new_state new, url https://www.medial.com/
2018-07-15 04:01:04,422 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 04:01:04,422 shop_crawler INFO     handler <common_actors.GoogleForProductPage object at 0x7fddce0b2cf8>
2018-07-15 04:01:08,058 shop_crawler INFO     new_state new, url https://www.medial.com/
2018-07-15 04:01:08,058 shop_craw

Status: "Processing Finished at State" after processing url "streaming.co.uk"


parkermotherandchild.org


ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
# Count how many crawls ended in each state; results that are not a
# ProcessingStatus are left out of the tally.
states = {}
for status in results:
    if not isinstance(status, ProcessingStatus):
        continue
    states.setdefault(status.state, 0)
    states[status.state] += 1

print(states)

### Test empty cart processing

In [None]:
# Make it True if you want to test empty cart detection
test_empty_cart = False

# Seconds to wait after navigation before inspecting the page
PAGE_LOAD_WAIT_SECONDS = 30

# Cart/checkout pages that are opened without adding anything to the cart first
checkout_pages = [
    'http://vape-fuel.com/checkout/cart/',
    'https://store-bb7f1.mybigcommerce.com/cart.php',
    'https://www.firstfitness.com/index.php?pg=signup',
    'https://store.sandlakedermatology.com/checkout/cart/',
    'https://www.dixieems.com/cart.asp',
    'http://www.jonessurgical.com/quick-order'
]

if test_empty_cart:
    driver = get_driver()
    try:
        for url in checkout_pages:
            driver.get(url)
            time.sleep(PAGE_LOAD_WAIT_SECONDS)

            is_empty_cart = common_actors.is_empty_cart(driver)
            print('url: {}, is empty cart: {}'.format(url, is_empty_cart))
            assert is_empty_cart
    finally:
        # Quit the browser even when an assertion or a page load fails
        driver.quit()

In [None]:
# Make it True to check how the crawler classifies sites that cannot be crawled
test_broken_sites = False

# NOTE(review): in the crawl output recorded in this notebook, gotbloody.com
# and trelexa.org were reported as "Time Out" - confirm they belong here.
not_available = [
    'juicyliquid.com',
    'gotbloody.com',
    'trelexa.org'
]

error = [
    'tampahumidor.com',
    'seriouscigars.com'
]

timeout = [
    'moneynetint.com'
]

domains_for_sale = [
    'walkinmycloset.com'
]


if test_broken_sites:
    # (urls, accepted status types, failure message), checked in this order
    expectations = [
        (domains_for_sale, (NotAvailable,), '{} must be not available'),
        (timeout, (Timeout, NotAvailable), '{} must time out or be not available'),
        (not_available, (NotAvailable,), '{} must be not available'),
        (error, (RequestError,), '{} must produce an error'),
    ]

    with get_crawler() as crawler:
        for urls, accepted_types, message in expectations:
            for url in urls:
                print(url)
                status = crawler.crawl(url)
                assert isinstance(status, accepted_types), message.format(url)
        
        