In [1]:
# start scraping information from just one page

from urllib.request import urlopen
from bs4 import BeautifulSoup

url='https://baltimore.craigslist.org/search/cta?s=0'

# Fetch the first results page. The context manager closes the HTTP response
# as soon as the markup has been handed to the parser.
with urlopen(url) as html:
    bs = BeautifulSoup(html.read(),'html.parser')
cars=bs.find_all('li',{ 'class':'result-row'})

# Collect one [title, posting date, price] row per listing.
scrapedCarsList=[]
for car in cars:
    salesTitle=car.find('a',{'class':'result-title hdrlnk'})
    price=car.find('span',{'class':'result-price'})
    postingDate=car.find('time',{'class':'result-date'})
    # Some listings do not have a price; find() yields None for them, so
    # they are left out of the result.
    if price is not None:
        new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
        scrapedCarsList.append(new_car)
print(scrapedCarsList[0:3]) # preview the first three scraped rows
len(scrapedCarsList)

[['02 Chevy Tahoe 4X4', 'Sep 14', '$2250'], ['2008 Nissan Rogue SL SUV AWD - Needs Transmission - $2,200 OBO', 'Sep 14', '$2200'], ['08 Saturn astra', 'Sep 14', '$1500']]


120

In [2]:
# now let's revise the code to write the results of the first page into a csv file named 'CarCraglist.csv'.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

url='https://baltimore.craigslist.org/search/cta?s=0'
# Scrape before touching the output file, so a failed request does not leave
# a truncated, header-only CSV behind. The context manager closes the response.
with urlopen(url) as html:
    bs = BeautifulSoup(html.read(),'html.parser')
cars=bs.find_all('li',{ 'class':'result-row'})

scrapedCarsList=[]
for car in cars:
    salesTitle=car.find('a',{'class':'result-title hdrlnk'})
    price=car.find('span',{'class':'result-price'})
    postingDate=car.find('time',{'class':'result-date'})
    # Some listings do not have a price; those are skipped.
    if price is not None:
        new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
        scrapedCarsList.append(new_car)

# Write the header and the data rows through a single handle so the whole
# file shares one encoding (UTF-8), rather than a default-encoded header
# followed by UTF-8 rows.
with open('CarCraglist.csv', 'w',newline='',encoding='utf-8') as myFile:
    writer = csv.writer(myFile)
    writer.writerow(["sales Title", "Listing Date", "Price"])
    writer.writerows(scrapedCarsList)

In [3]:
# create the list of URLs for the most recent postings (result offsets 0, 120, ..., 1200: 11 pages)

baseURL='https://baltimore.craigslist.org/search/cta?s='
# One URL per result offset: 0, 120, 240, ..., 1200.
urlList=[baseURL+str(offset) for offset in range(0,1201,120)]

print(urlList[0:50]) # show the generated URLs
len(urlList)

['https://baltimore.craigslist.org/search/cta?s=0', 'https://baltimore.craigslist.org/search/cta?s=120', 'https://baltimore.craigslist.org/search/cta?s=240', 'https://baltimore.craigslist.org/search/cta?s=360', 'https://baltimore.craigslist.org/search/cta?s=480', 'https://baltimore.craigslist.org/search/cta?s=600', 'https://baltimore.craigslist.org/search/cta?s=720', 'https://baltimore.craigslist.org/search/cta?s=840', 'https://baltimore.craigslist.org/search/cta?s=960', 'https://baltimore.craigslist.org/search/cta?s=1080', 'https://baltimore.craigslist.org/search/cta?s=1200']


11

In [4]:
# turn the scraping script into a function that takes the page offset (0, 120, 240, ...) as input and returns all the cars on that page in a list-of-lists format.

def craigslistCarsScrape(pageNumber):
    """Scrape one Baltimore Craigslist cars+trucks results page.

    Parameters
    ----------
    pageNumber : int
        Result offset appended to the search URL as the ``s=`` value
        (0, 120, 240, ...). The progress message converts it to a
        1-based page index.

    Returns
    -------
    list of list of str
        One ``[title, posting date, price]`` row for every listing on the
        page that has a price element.
    """
    print('*** Scraping cars on page:',int(pageNumber/120+1),'***\n\n')

    baseURL='https://baltimore.craigslist.org/search/cta?s='
    url=baseURL+str(pageNumber)
    # The context manager closes the HTTP response once it has been read.
    with urlopen(url) as html:
        bs = BeautifulSoup(html.read(),'html.parser')
    cars=bs.find_all('li',{ 'class':'result-row'})
    scrapedCarsList=[]
    for car in cars:
        salesTitle=car.find('a',{'class':'result-title hdrlnk'})
        price=car.find('span',{'class':'result-price'})
        postingDate=car.find('time',{'class':'result-date'})
        # Some listings do not have a price; those are skipped.
        if price is not None:
            new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
            scrapedCarsList.append(new_car)
    return scrapedCarsList

In [5]:
# add error handling to make the code more robust

from urllib.error import HTTPError
from urllib.error import URLError

def craigslistCarsScraper(pageNumber):
    """Scrape one Craigslist cars+trucks results page, tolerating fetch errors.

    Parameters
    ----------
    pageNumber : int
        Result offset appended to the search URL as the ``s=`` value
        (0, 120, 240, ...). The progress message converts it to a
        1-based page index.

    Returns
    -------
    list of list of str, or None
        One ``[title, posting date, price]`` row per listing that has all
        three elements, or ``None`` when the page could not be fetched
        (HTTPError / URLError).
    """
    print('*** Scraping cars on page:',int(pageNumber/120+1),'***\n\n')

    baseURL='https://baltimore.craigslist.org/search/cta?s='
    url=baseURL+str(pageNumber)

    try:
        # Read inside the context manager so the connection is always closed.
        with urlopen(url) as response:
            html = response.read()
    # HTTPError is a subclass of URLError, so it has to be handled first.
    except HTTPError as e:
        print(e)
        print('-----------------------HTTPError----------------------')
        return None
    except URLError as e:
        print('Server cound not be found')
        print('-----------------------URLError----------------------')
        return None

    bs = BeautifulSoup(html,'html.parser')
    cars=bs.find_all('li',{ 'class':'result-row'})

    scrapedCarsList=[]
    for car in cars:
        salesTitle=car.find('a',{'class':'result-title hdrlnk'})
        price=car.find('span',{'class':'result-price'})
        postingDate=car.find('time',{'class':'result-date'})
        # find() gives None for a missing tag, and calling .get_text() on None
        # is where AttributeError would actually occur. Skip listings lacking
        # any of the three fields (some listings do not have a price).
        if salesTitle is None or postingDate is None or price is None:
            continue
        new_car=[salesTitle.get_text(),postingDate.get_text(),price.get_text()]
        scrapedCarsList.append(new_car)

    return scrapedCarsList

In [6]:
# Spot-check the scraper on a single page (result offset 600); the returned list is shown as the cell output.
craigslistCarsScraper(600)

*** Scraping cars on page: 6 ***




[['2004 Nissan Frontier x cab', 'Sep 12', '$5950'],
 ['1995 Chevy 7500 Cat 3116 Diesel', 'Sep 12', '$5950'],
 ['2008 Toyota Scion Tc- Nice!!', 'Sep 12', '$5000'],
 ['2016 Jeep Cherokee 4x4 4WD SUV ACEPTAMOS TAX ID! SE HABLA ESPANOL!',
  'Sep 12',
  '$0'],
 ['2005 grand cherokee as is', 'Sep 12', '$4000'],
 ['Md state inspected 2010 Honda Accord EXL loaded $6800 obo',
  'Sep 12',
  '$6800'],
 ['Chevrolet step side', 'Sep 12', '$16500'],
 ['1989 Chevy Corvette', 'Sep 12', '$7200'],
 ['2012 Kia Sorento. MD inspected', 'Sep 12', '$6900'],
 ['2008 Dodge Grand Caravan SXT:  1 Owner, MD Insp, 84k mi',
  'Sep 11',
  '$8995'],
 ['2004 Chevrolet Silverado Ext Cab 4WD:  MD Inspected, 143k mi',
  'Sep 11',
  '$11995'],
 ['2008 GMC Yukon XL Denali AWD:  Local Trade, Like New, $13995',
  'Sep 11',
  '$13995'],
 ['!!!! 2009 HYUNDAI SONATA 4DR 4CYL AUTO RUNS100% $3950 !!!',
  'Sep 11',
  '$3950'],
 ['2008 Toyota Sienna', 'Sep 11', '$6000'],
 ['2004 Nissan Maxima 85k miles and state inspected', 'Sep 11

In [7]:
# run the function in a loop and write the results to a csv file

# Write the header and every scraped page through a single handle so the whole
# file shares one encoding (UTF-8), rather than a default-encoded header
# followed by UTF-8 rows.
with open('craigslist_cars_final.csv', 'w',newline='',encoding='utf-8') as myFile:
    writer = csv.writer(myFile)
    writer.writerow(["Listing Title", "Listing Date", "Price"])
    for i in range(0,1201,120):
        scrapedCarsList=craigslistCarsScraper(i)
        # The scraper returns None when a page cannot be fetched; skip that
        # page instead of letting writerows(None) raise TypeError.
        if scrapedCarsList is None:
            continue
        writer.writerows(scrapedCarsList)

print('----------------------------------------Well done---------------------------------------------- ')
print('-----------------------------------Scraping completed------------------------------------------ ')
print('------------Please find the csv file in the folder where this scraping file exists------------- ')

*** Scraping cars on page: 1 ***


*** Scraping cars on page: 2 ***


*** Scraping cars on page: 3 ***


*** Scraping cars on page: 4 ***


*** Scraping cars on page: 5 ***


*** Scraping cars on page: 6 ***


*** Scraping cars on page: 7 ***


*** Scraping cars on page: 8 ***


*** Scraping cars on page: 9 ***


*** Scraping cars on page: 10 ***


*** Scraping cars on page: 11 ***


----------------------------------------Well done---------------------------------------------- 
-----------------------------------Scraping completed------------------------------------------ 
------------Please find the csv file in the folder where this scraping file exists------------- 
