In [2]:
import os

from scrapfly import ScrapflyClient, ScrapeConfig

# Read the API key from the SCRAPFLY_KEY environment variable so a real key never
# has to be saved inside the notebook; the original placeholder is kept as fallback.
scrapfly = ScrapflyClient(key=os.environ.get("SCRAPFLY_KEY", "YOUR SCRAPFLY KEY"))

In [4]:
# Minimal scrape: only the target URL is configured.
result = scrapfly.scrape(ScrapeConfig(url="http://httpbin.org/html"))
# Display the result dictionary of the scrape (content, log_url, ...).
result.scrape_result

{'iframes': [],
 'browser_data': {'session_storage_data': None,
  'javascript_evaluation_result': None,
  'websockets': [],
  'xhr_call': None,
  'local_storage_data': None},
 'log_url': 'https://scrapfly.io/dashboard/monitoring/log/01GSMC76WZH7PEV22D4JB2GV0C',
 'format': 'text',
 'reason': 'OK',
 'dns': None,
 'content': "<!DOCTYPE html>\n<html>\n  <head>\n  </head>\n  <body>\n      <h1>Herman Melville - Moby-Dick</h1>\n\n      <div>\n        <p>\n          Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, o

In [6]:
# parse result
# css selectors: `h1::text` targets the text inside the page's <h1> heading
result.selector.css("h1::text").get()

'Herman Melville - Moby-Dick'

In [7]:
# xpath selectors: the same <h1> heading text, queried with XPath instead of CSS
result.selector.xpath("//h1/text()").get()

'Herman Melville - Moby-Dick'

In [5]:
# tip: enable cache and debug while developing
result_cache = scrapfly.scrape(
    ScrapeConfig(
        url="https://scrapfly.io/blog",
        # caching
        cache=True,
        cache_ttl=60,  # cache expiration time in seconds (default is 1 day)
        # debugging: stores the result in the dashboard for later inspection
        debug=True,
    )
)
# link to this scrape's log entry in the dashboard
result_cache.scrape_result["log_url"]

'https://scrapfly.io/dashboard/monitoring/log/01GSYBAE4CV9RVD19110SW249F'

In [9]:
# common HTTP options: method, body data, headers and cookies
result_basics = scrapfly.scrape(
    ScrapeConfig(
        url="http://httpbin.org/post",
        method="POST",
        data={"query": "hello world"},
        headers={"X-CSRF-Token": "1234"},
        cookies={"language": "en"},
    )
)
# the response body is JSON text; print() keeps its line breaks readable
print(result_basics.content)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "query": "hello world"
  }, 
  "headers": {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 
    "Accept-Encoding": "gzip, deflate", 
    "Accept-Language": "es-MX,es;q=0.9,en-US;q=0.8,en;q=0.7", 
    "Content-Length": "17", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Cookie": "language=en", 
    "Host": "httpbin.org", 
    "Upgrade-Insecure-Requests": "1", 
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Windows; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5396.2 Safari/537.36", 
    "X-Amzn-Trace-Id": "Root=1-63f48c3d-10916945356d612e5345e609", 
    "X-Csrf-Token": "1234"
  }, 
  "json": null, 
  "origin": "103.250.82.11", 
  "url": "http://httpbin.org/post"
}



In [10]:
# Anti Scraping Protection (ASP) bypass: switched on with a single flag
result_crunchbase = scrapfly.scrape(
    ScrapeConfig(
        asp=True,
        url="https://www.crunchbase.com/organization/tesla-motors",
    )
)
# first text node found under the `.description` element
result_crunchbase.selector.css(".description ::text").get()

'Tesla Motors specializes in developing a full range of electric vehicles.'

In [12]:
# proxy configuration: pick the proxy pool and the proxy country
result_yelp = scrapfly.scrape(
    ScrapeConfig(
        url="https://yelp.com",
        proxy_pool="public_residential_pool",
        country="US",
    )
)
# the whole <title> element (tag included), not just its text
result_yelp.selector.css("title").get()

'<title data-rh="true">Restaurants, Dentists, Bars, Beauty Salons, Doctors - Yelp</title>'

In [15]:
# Headless web browsers: render the page with JavaScript enabled
result_twitter = scrapfly.scrape(
    ScrapeConfig(
        url="https://twitter.com/Scrapfly_dev/status/1625137012776001538",
        render_js=True,
        country="US",
        # wait until an <h1> element is present
        wait_for_selector="h1",
        # screenshot name -> screenshot format
        screenshots={"screenshotname-example": "fullpage"},
    )
)
print(result_twitter.selector.xpath("//title").getall())
# screenshot details are listed under the name chosen above
print(result_twitter.scrape_result["screenshots"])
result_twitter.scrape_result["log_url"]

['<title>Scrapfly on Twitter: "Did you know that you can scrape #Twitter for free using #Python? \nTwitter is disabling free API access but that shouldn\'t stop you from getting that public data for your projects!\n\nWe cover 2 ways to scrape Twitter with Python:\nhttps://t.co/2kiOGlfKPY\n\n#webscraping" / Twitter</title>']
{'screenshotname-example': {'url': 'https://api.scrapfly.io/scrape/screenshot/01GSSMM00R6SJM1N4AZNWZWER9/screenshotname-example', 'extension': 'jpg', 'format': 'fullpage', 'css_selector': None, 'size': 51906}}


'https://scrapfly.io/dashboard/monitoring/log/01GSSMM00R6SJM1N4AZNWZWER9'

In [4]:
# ⚠️ advanced browser controls:
result_advanced = scrapfly.scrape(
    ScrapeConfig(
        url="https://www.yelp.com/",
        country="US",
        render_js=True,
        # evaluate a javascript snippet in the page
        js="return document.querySelector('title').innerText",
        # drive the browser with a list of commands, in order
        js_scenario=[
            # 1. wait for the search field to load
            {"wait_for_selector": {"selector": "#search_description"}},
            # 2. type the search text "tacos" into it
            {"fill": {"selector": "#search_description", "value": "tacos"}},
            # 3. click the submit button
            {"click": {"selector": "button[type=submit]"}},
            # 4. wait for the results to load
            {"wait_for_selector": {"selector": "h3"}},
        ],
    )
)
# get the value returned by the `js` snippet
result_advanced.scrape_result["browser_data"]["javascript_evaluation_result"]

'Restaurants, Dentists, Bars, Beauty Salons, Doctors - Yelp'