In [None]:
import os
import re
from urllib.parse import urlparse

from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from selenium.common.exceptions import ElementNotVisibleException
from bs4 import BeautifulSoup
import requests
import pandas as pd

### If scraping all of California, pull sites and regions from Craigslist

In [None]:
# Craigslist geo index page that is scraped below for California site links.
ca_links_url = "https://geo.craigslist.org/iso/us/ca"

In [None]:
# Fetch the California index page. The timeout keeps the notebook from hanging
# on a stalled connection, and raise_for_status() surfaces an HTTP error here
# rather than letting an error page be parsed as if it were the site list.
ca_links_page = requests.get(ca_links_url, timeout=30)
ca_links_page.raise_for_status()

In [None]:
# Parse the index page's HTML with Python's built-in parser.
soup = BeautifulSoup(ca_links_page.text, features='html.parser')

In [None]:
# Locate the <ul> whose CSS class marks the list of sites (None if absent).
geo_site_list = soup.find('ul', attrs={'class': 'geo-site-list'})

In [None]:
# Every <li> under the site list; each one is read for a link in the next cell.
lis = geo_site_list.find_all(name='li')

In [None]:
# Collect the href of each list item's anchor. Items with no anchor carrying
# an href are skipped; indexing them directly would raise TypeError/KeyError.
site_anchors = (li.find('a', href=True) for li in lis)
links = [anchor['href'] for anchor in site_anchors if anchor is not None]

In [None]:
# Display the scraped site links for inspection.
links

In [None]:
# Number of site links that were scraped.
len(links)

In [None]:
# Extract regions from links: the region is the first label of the hostname,
# e.g. 'https://bakersfield.craigslist.org' -> 'bakersfield'.
import re


def region_from_link(link):
    """Return the leading hostname label of a site link.

    A URL parser is used instead of picking the second word-like token, so
    protocol-relative ('//host/...') and scheme-less ('host/...') links still
    yield the subdomain and hyphenated subdomains are not cut short.
    The hostname is lower-cased by the parser; an unparseable link gives ''.
    """
    # urlparse only recognises a host when the link contains '//'.
    netloc_link = link if '//' in link else '//' + link
    hostname = urlparse(netloc_link).hostname or ''
    return hostname.split('.')[0]


regions = [region_from_link(link) for link in links]

In [None]:
# Display the extracted region names for inspection.
regions

In [None]:
# Number of region names extracted (one per link).
len(regions)

### Only scraping four regions, though, so hard-code those in

In [None]:
# Base URLs of the four sites to scrape; replaces any previously scraped list.
# Order must match `regions`, since the two lists are zipped together later.
links = [
    'https://bakersfield.craigslist.org',
    'https://sacramento.craigslist.org',
    'https://redding.craigslist.org',
    'https://sandiego.craigslist.org',
]

In [None]:
# Region names for the four sites, in the same order as the `links` list.
regions = [
    'bakersfield',
    'sacramento',
    'redding',
    'sandiego',
]

### Hard-code category codes in

In [None]:
# Category codes; each one is appended to '<site>/search/' to build a results URL.
cats = [
    'ata', 'ara', 'sna', 'pta', 'ava', 'baa',
    'haa', 'bip', 'bia', 'bpa', 'boo', 'bka',
    'ema', 'moa', 'cla', 'syp', 'sya', 'fua',
    'hva', 'jwa', 'mpa', 'mca', 'msa', 'pha',
    'rva', 'tla', 'taa', 'tra', 'vga',
]

In [None]:
import traceback

In [None]:
from time import sleep
from random import randint
from warnings import warn

### Set headers

In [None]:
# Begin with requests' default headers, then swap in a desktop-browser User-Agent.
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
})

In [None]:
# Display the headers that will be sent with every scraping request.
headers

### Account for possible dropped connections

In [None]:
# Getting constant dropped connections
# Solution from this site: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def requests_retry_session(
    retries=3,
    backoff_factor=0.3,
    status_forcelist=(500, 502, 504),
    session=None,
):
    """Return a requests session whose HTTP and HTTPS adapters carry a retry policy.

    Parameters
    ----------
    retries : int
        Passed to ``Retry`` as the ``total``, ``read`` and ``connect`` limits.
    backoff_factor : float
        Passed through to ``Retry`` unchanged.
    status_forcelist : tuple of int
        Passed through to ``Retry`` unchanged.
    session : requests.Session, optional
        Session to configure; a new ``requests.Session`` is created when this
        is omitted (or falsy).

    Returns
    -------
    The configured session (the same object as ``session`` when one was given).
    """
    if not session:
        session = requests.Session()

    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    # The same adapter serves both URL schemes.
    for scheme_prefix in ('http://', 'https://'):
        session.mount(scheme_prefix, retrying_adapter)
    return session

### Main scraping code

In [None]:
# Scrape every posting for each (region, category) pair into `rows`,
# one dictionary per posting.

RESULTS_PER_PAGE = 120  # step added to the '?s=' offset between results pages
                        # (presumably the site's page size -- confirm)
REQUEST_TIMEOUT = 30    # seconds before a stalled request is abandoned


def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.text if tag else None


def _parse_posting(posting):
    """Extract the fields of interest from one parsed posting page.

    Every field is looked up afresh on each call, so a value that is missing
    from this posting is never inherited from a previously parsed posting.

    Returns a dict with 'uncat_attrs', 'attrs', 'latitude' and 'longitude'
    always present, and 'category', 'title' and 'price' present only when
    the page supplied a non-empty value for them.
    """
    crumb = posting.find('li', class_='crumb category')
    found = (
        ('category', _text_or_none(crumb.find('a')) if crumb else None),
        ('title', _text_or_none(posting.find('span', id='titletextonly'))),
        ('price', _text_or_none(posting.find('span', class_='price'))),
    )
    fields = {key: value for key, value in found if value}

    # Sidebar attributes: 'key: <b>value</b>' spans go into a dict, spans
    # without a colon contribute their bold text to a plain list.
    uncat_attrs = []
    attrs = {}
    for para in posting.find_all('p', class_='attrgroup'):
        for span in para.find_all('span'):
            bold = span.find('b')
            if bold is None:
                # No value to record (and nothing to call .text on).
                continue
            if ':' in span.text:
                attrs[span.text.split(':')[0]] = bold.text
            else:
                uncat_attrs.append(bold.text)
    fields['uncat_attrs'] = uncat_attrs
    fields['attrs'] = attrs

    # Location data from the map element, when the posting has one.
    loc_div = posting.find('div', id='map')
    fields['latitude'] = loc_div.get('data-latitude') if loc_div else None
    fields['longitude'] = loc_div.get('data-longitude') if loc_div else None
    return fields


rows = []
# One retrying session is shared by every request so connections are reused
# instead of building (and never closing) a new session per request.
session = requests_retry_session()

# Iterate through links and regions simultaneously
for base_url, region in zip(links, regions):
    print(f'Region: {region}')
    # Iterate through categories
    for cat in cats:
        print(f'\tCategory: {cat}')
        this_results_page_num = 0
        counter = 0
        # Iterate through results pages until one fails or comes back empty
        while True:
            print(f'\t\tThis results page num: {this_results_page_num}')
            next_results_page_link = base_url + '/search/' + cat
            if this_results_page_num > 0:
                next_results_page_link += '?s=' + str(this_results_page_num)
            try:
                response = session.get(next_results_page_link, headers=headers,
                                       timeout=REQUEST_TIMEOUT)
            except requests.exceptions.RequestException as exc:
                warn('\t\tRequest: {}; Error: {}'.format(next_results_page_link, exc))
                break
            # Check the status before parsing so an error page is reported as
            # an error rather than as an empty results page.
            if response.status_code != 200:
                warn('\t\tRequest: {}; Status code: {}'.format(next_results_page_link, response.status_code))
                break
            results_page = BeautifulSoup(response.text, 'html.parser')
            results = results_page.find_all('li', class_='result-row')
            if not results:
                print('\t\tNo results!')
                break
            # Keep only rows that actually carry a title link.
            result_links = []
            for result in results:
                anchor = result.find('a', class_='result-title')
                if anchor and anchor.get('href'):
                    result_links.append(anchor['href'])
            # Iterate through results in results page
            for link in result_links:
                counter += 1
                # Sleep between 1/20 and 1/4 of a second
                sleep(randint(1, 5) / 20)
                try:
                    posting_response = session.get(link, headers=headers,
                                                   timeout=REQUEST_TIMEOUT)
                except requests.exceptions.RequestException as exc:
                    warn('\t\t\tRequest: {}; Error: {}'.format(link, exc))
                    continue
                if posting_response.status_code != 200:
                    warn('\t\t\tRequest: {}; Status code: {}'.format(link, posting_response.status_code))
                    continue
                # The post id is the numeric file name in the posting URL.
                id_match = re.search(r"/([0-9]+)\.html", link)
                post_id = id_match.group(1) if id_match else None
                posting = BeautifulSoup(posting_response.text, 'html.parser')
                # Assign each piece of data to its key in the dictionary
                this_row = {}
                if post_id:
                    this_row['post_id'] = post_id
                this_row.update(_parse_posting(posting))
                this_row['region'] = region
                this_row['url'] = link
                # Append dictionary to 'rows' variable
                rows.append(this_row)
                print(f'\t\t\t{counter}')
                print('\t\t\tWrote data!')
            this_results_page_num += RESULTS_PER_PAGE

In [None]:
# Build one table from the scraped rows (one dictionary per posting).
data = pd.DataFrame(data=rows)

In [None]:
# Promote the 'condition' entry of each row's 'attrs' dictionary to its own
# column; rows whose dictionary has no such entry get None.
data['condition'] = data['attrs'].apply(lambda attrs: attrs.get('condition'))

In [None]:
# Write the scraped table to disk. The output folder is created first so the
# results of a long scrape are not lost to a missing 'Data' directory.
output_path = 'Data/all_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
data.to_csv(output_path, index = False)