In [57]:
import csv
import json
from random import randint
from time import sleep

import bs4
import pandas as pd
import requests

def parse_main_page(location='washingtondc', pages=1):
    """Scrape Craigslist apartment search pages and the listings they link to.

    For every index page from 1 to ``pages`` the search results are fetched,
    one record is built per ``<p class="row">`` element, and each record is
    handed to ``parse_listing_page`` so detail-page fields can be added.

    A page that cannot be fetched, or a row that lacks an expected element,
    is reported on stdout and skipped; the remaining pages/rows are still
    processed.

    Args:
        location: Craigslist subdomain to query (e.g. ``'washingtondc'``).
        pages: Number of index pages to request; values below 1 request none.

    Returns:
        A list of dicts, one per successfully parsed row. Each dict holds
        ``index_url``, ``location``, ``page``, ``item_id``, ``listing_url``,
        ``listing_date`` and ``title``, ``price`` when the row shows one,
        and whatever ``parse_listing_page`` adds.
    """
    base_url = '.craigslist.org/search/apa'
    page_url = '?s='
    headers = {'User-agent':'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    main_results = []

    for n in range(1, pages + 1):

        # Built before any network call so every message below can name it.
        # NOTE(review): the value sent as ``s`` is the 1-based page counter.
        # If the site interprets ``s`` as a result offset, consecutive pages
        # would overlap almost entirely -- confirm before using pages > 1.
        url = ''.join(['http://', location, base_url, page_url, str(n)])

        # random time delay for scraping
        sleep(randint(0, 2))

        print('Processing main page %s of %s %s' % (str(n), str(pages), url))

        try:
            # A timeout keeps one stalled connection from hanging the run.
            r = requests.get(url, headers=headers, timeout=30)
            r.raise_for_status()
        except requests.RequestException as exc:
            print('This page did not return results: %s ' % url)
            print('  Reason: %r' % (exc,))
            continue

        # get listing information from div
        soup = bs4.BeautifulSoup(r.text, "lxml")
        content = soup.find("div", {"class": "content"})
        if content is None:
            print('This page did not return results: %s ' % url)
            continue
        listings = content.find_all("p", {"class": "row"})

        # Abridged sample of the row markup this loop reads:
        #
        #    <p class="row" data-pid="5551948979">
        #       <a href="/doc/apa/5551948979.html" class="i" data-ids="..."></a>
        #       <span class="txt"> <span class="pl">Apr 22
        #          <a href="/doc/apa/5551948979.html" data-id="5551948979"
        #             class="hdrlnk"><span id="titletextonly">Second Floor 1BR
        #             in Clarendon</span></a></span>
        #          <span class="l2"><span class="price">$2335</span>
        #          <span class="housing">/ 685ft<sup>2</sup> -</span> ...
        #     </p>

        # iterate through listing data on main page
        for listing in listings:
            try:
                _dat = {
                    'index_url': url,
                    'location': location,
                    'page': n,
                    'item_id': listing['data-pid'],
                    'listing_url': listing.find("a")['href'],
                    'listing_date': listing.find("time")['datetime'],
                    'title': listing.find("span", {"id": "titletextonly"}).text,
                }
            except (AttributeError, KeyError, TypeError) as exc:
                # A missing tag/attribute only invalidates this row; keep
                # going so the rest of the page is not lost.
                print('  Skipping malformed listing on %s: %r' % (url, exc))
                continue

            # some listings don't have a price
            price_tag = listing.find("span", {"class": "price"})
            if price_tag is not None:
                _dat['price'] = price_tag.text.replace('$', '').replace(',', '')

            # get additional information from listing page
            _dat = parse_listing_page(_dat)

            main_results.append(_dat)

    print('Done processing main page')
    return main_results

def parse_listing_page(_dat):
    """Fetch one listing's detail page and add its fields to ``_dat``.

    The page URL is built from ``_dat['location']`` and
    ``_dat['listing_url']``. Each field below is optional: when its element
    is absent a message is printed and the key is simply not set.

    Keys that may be added:
        ``posted_on``     text of ``<time class="timeago">``
        ``lat`` / ``lon`` ``data-latitude`` / ``data-longitude`` of
                          ``<div id="map">`` (set together or not at all)
        ``neighborhood``  text of ``<div class="mapaddress">``
        ``post_body``     text of ``<section id="postingbody">``

    Args:
        _dat: Listing record containing at least ``location`` and
            ``listing_url``. It is modified in place.

    Returns:
        The same ``_dat`` dict. If the page cannot be fetched it is returned
        without any of the keys above.

    Raises:
        KeyError: If ``_dat`` lacks ``location`` or ``listing_url``.
    """
    listing_base_url = '.craigslist.org'
    headers = {'User-agent':'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'}

    # Built up front so the failure message below can always name the URL.
    url = ''.join(['http://', _dat['location'], listing_base_url, _dat['listing_url']])

    # random time delay for scraping
    sleep(randint(0, 2))

    print('Processing listing page %s' % url)

    try:
        # A timeout keeps one stalled connection from hanging the run.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        print('Error getting listing details for: %s' % url)
        print('  Reason: %r' % (exc,))
        return _dat

    soup = bs4.BeautifulSoup(r.text, "lxml")

    # listing time
    posted = soup.find("time", {"class": "timeago"})
    if posted is not None:
        _dat['posted_on'] = posted.text
    else:
        print('  Failed getting listing time: %s' % url)

    # lat/lon -- looked up once, and stored only as a complete pair
    map_div = soup.find("div", {"id": "map"})
    lat = map_div.get('data-latitude') if map_div is not None else None
    lon = map_div.get('data-longitude') if map_div is not None else None
    if lat is not None and lon is not None:
        _dat['lat'] = lat
        _dat['lon'] = lon
    else:
        print('  Failed getting lat/long: %s' % url)

    # neighborhood
    address = soup.find("div", {"class": "mapaddress"})
    if address is not None:
        _dat['neighborhood'] = address.text
    else:
        print('  Failed getting neighborhood: %s' % url)

    # post body
    body = soup.find("section", {"id": "postingbody"})
    if body is not None:
        _dat['post_body'] = body.text
    else:
        print('  Failed getting post body: %s' % url)

    return _dat


# Script entry point: scrape one index page for a location, preview the
# records as a DataFrame, and dump them to "<location>.csv".
# Kept as flat top-level statements so `listings` and `df` remain available
# to later notebook cells.
if __name__ == '__main__':

    location = 'washingtondc'

    # get results
    listings = parse_main_page(location=location, pages=1)

    # quick visual sanity check of the scraped records
    df = pd.DataFrame(listings)

    print(df.head())

    # create filename
    filename = location.replace('--', '_').replace('-', '_').lower() + '.csv'

    # Records do not all carry the same keys (optional fields are left out
    # when missing), so the header is the sorted union of every record's keys.
    headings = sorted(set().union(*listings))

    # write to csv file
    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows); utf-8 keeps non-ASCII listing text from failing on
    # platforms whose default encoding cannot represent it.
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, headings)
        dict_writer.writeheader()
        dict_writer.writerows(listings)

Processing main page 1 of 1 http://washingtondc.craigslist.org/search/apa?s=1
Processing listing page http://washingtondc.craigslist.org/nva/apa/5540494476.html
Processing listing page http://washingtondc.craigslist.org/doc/apa/5546173750.html
Processing listing page http://washingtondc.craigslist.org/mld/apa/5535079834.html
Processing listing page http://washingtondc.craigslist.org/nva/apa/5526026336.html
  Failed getting lat/long: http://washingtondc.craigslist.org/nva/apa/5526026336.html
  Failed getting neighborhood: http://washingtondc.craigslist.org/nva/apa/5526026336.html
Processing listing page http://washingtondc.craigslist.org/mld/apa/5552180360.html
  Failed getting lat/long: http://washingtondc.craigslist.org/mld/apa/5552180360.html
  Failed getting neighborhood: http://washingtondc.craigslist.org/mld/apa/5552180360.html
Processing listing page http://washingtondc.craigslist.org/nva/apa/5552187871.html
  Failed getting neighborhood: http://washingtondc.craigslist.org/nva/ap