Scrape autotrader for listings

In [38]:
import requests
from bs4 import BeautifulSoup
import re
import datetime
import time
import json_fix

listings = []

class listing:
    def __init__(self, url):
        self.url = url
        self.id = self.parse_listingid(url)

    def parse_listingid(self, listing_url): 
        pattern = r'(?<=listingId=)\d{9}'
        match = re.search(pattern, listing_url)
        if match:
            return match.group()

    def parse_content(self):
        try:
            time.sleep(2.5) # to reduce timeout errors when running more than one instance
            print("Trying " + self.url)
            print("Loading page content..")
            self.response = requests.get(self.url)
            print("Parsing HTML...")
            self.content = BeautifulSoup(self.response.content, 'html.parser')
            #store page temporarily
            #self.content = content

            print("Parsing content...")
            #define regex to get the element that has all the data in it
            self.pattern = re.compile('.*vin.*', re.IGNORECASE)
            #load the element
            self.matches = self.content.find_all(text=self.pattern)

            #use string manipulation to pull data from the element
            
            #get price
            try:
                self.price = repr(self.matches[0]).split('"price":')[1].split(',')[0].replace('"','')
            except:
                self.price = "Missing"

            #get vin
            try:
                self.vin = repr(self.matches[0]).split('"vin":')[1].split(',')[0].replace('"','')
            except:
                self.vin = "Missing"
            
            #get odometer
            try:
                self.odometer = repr(self.matches[0]).split('"odometer":')[1].split('"')[1].replace('"','').replace(" mi","").replace(",","")
            except:
                self.odometer = "Missing"
            
            #get trim
            try:
                self.trim = repr(self.matches[0]).split('"trim":')[1].split('"')[1].replace('"','')
            except:
                self.trim = "Missing"

            #get year
            try:
                self.year = repr(self.matches[0]).split('"car_year":')[1].split(',')[0]
            except:
                self.year = "Missing"
            
            #calculate age
            try:
                self.age = int((datetime.datetime.now()).year) - int(self.year)
            except:
                self.age = None

            #get make
            try:
                self.make = repr(self.matches[0]).split('"makeName":')[1].split('"')[1].replace('"','').replace('[','')
            except:
                self.make = "Missing"
            
            #get model
            try:
                self.model = repr(self.matches[0]).split('"modelName":')[1].split('"')[1].replace('"','').replace('[','')
            except:
                self.model = "Missing"
            
            #get sale type
            try: 
                self.saletype = repr(self.matches[0]).split('"Car_Type":')[1].split('"')[1].replace('"','')
            except:
                self.saletype = "Missing"
            
            #get mpg
            try:
                self.mpg = repr(self.matches[0]).split('"fuelEconomy":[')[1].split(']')[0].replace('"','')
            except:
                self.mpg = "Missing"
            
            #get vehicle features
            self.features = []
            try:
                features = repr(self.matches[0]).split('"vehicleFeatures":[')[1].split(']')[0].replace('"','').split(",")
                for feature in features:
                    self.features.append(feature)
            except:
                self.features = None
        except ConnectionError:
            return "Lost Connection, skipping.."

def get_page_listings(pageurl):
    # Send a GET request to the URL
    response = requests.get(pageurl)

    # Check the status code of the response to make sure it was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using Beautiful Soup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the elements on the page that you want to scrape
        # For example, to get the text of all the links on the page:
        links = soup.find_all('a')
        for link in links:
            if link.has_attr('href') and 'vehicledetails' in link.get('href'):
                fullurl = originurl + link.get('href')
                new_listing = listing(fullurl)
                if 'clickType=spotlight' not in new_listing.url:
                    listing_ids = [listing.id for listing in listings]
                    if new_listing.id not in listing_ids:
                        listings.append(new_listing)
    else:
        print('Error: Failed to retrieve content')


originurl = 'http://www.autotrader.com/'
#baseurl = 'http://www.autotrader.com/cars-for-sale/all-cars/ford/f150/dallas-tx-75201?searchRadius=50&isNewSearch=false&marketExtension=include&showAccelerateBanner=false&sortBy=relevance&numRecords=100'
#baseurl = 'http://www.autotrader.com/cars-for-sale/toyota/camry/dallas-tx-75201?requestId=3P_CERT&listingTypes=CERTIFIED%2CUSED%2C3P_CERT&searchRadius=50&marketExtension=include&isNewSearch=true&showAccelerateBanner=false&sortBy=relevance&numRecords=100'
#baseurl = 'http://www.autotrader.com/cars-for-sale/nissan/altima/dallas-tx-75201?listingTypes=CERTIFIED%2CUSED&searchRadius=50&marketExtension=include&isNewSearch=false&showAccelerateBanner=false&sortBy=relevance&numRecords=100'
baseurl = 'http://www.autotrader.com/cars-for-sale/honda/accord/dallas-tx-75201?searchRadius=50&marketExtension=include&isNewSearch=false&showAccelerateBanner=false&sortBy=relevance&numRecords=100'
pages = 6

In [39]:
#get links from each page of results
#1000 seems to be the limit
for i in range(pages + 1):
    if i == 0:
        get_page_listings(baseurl)
    if i >= 0:
        n = str(i*100)
        newurl = baseurl + "&firstRecord=" + n
        get_page_listings(newurl)
    i = i+1

#for l in listings:
    #print('URL:', l.url)
    #print('ID:', l.id)

In [40]:
import json
filename = "output.json"
listObj = []

# define a custom encoder function to handle the features list
def encode_features(obj):
    if isinstance(obj, list):
        return [str(item) for item in obj]
    else:
        return obj

def write_to_file(listing_dict):
    with open(filename, 'a') as f:
        json.dump(listing_dict, f, default=encode_features)
        f.write("\n")


for listing in listings:
    try:
        try:
            print(listing.vin + " already parsed")
            continue
        except AttributeError:
            try:
                listing.parse_content()
            except:
                print("Connection Rejected, Skipping")
                continue

        if listing.vin:
            # Create object that json serializer can comprehend
            listing_dict = {
                "url": listing.url,
                "id": listing.id,
                "price": listing.price,
                "vin": listing.vin,
                "odometer": listing.odometer,
                "trim": listing.trim,
                "year": listing.year,
                "age": listing.age,
                "make": listing.make,
                "model": listing.model,
                "saletype": listing.saletype,
                "mpg": listing.mpg,
                "features": listing.features
            }
            print("Writing listing for " + listing.vin + " to file.")
            write_to_file(listing_dict)    
    except ConnectionError:
        print("Connection Rejected, Waiting to retry..")
        time.sleep(240)
        continue


Trying http://www.autotrader.com//cars-for-sale/vehicledetails.xhtml?listingId=667544495&makeCodeList=HONDA&modelCodeList=ACCORD&city=Dallas&state=TX&zip=75201&searchRadius=50&marketExtension=include&isNewSearch=false&showAccelerateBanner=false&sortBy=relevance&numRecords=100&listingTypes=USED&referrer=%2Fcars-for-sale%2Fhonda%2Faccord%2Fdallas-tx-75201%3FsearchRadius%3D50%26marketExtension%3Dinclude%26isNewSearch%3Dfalse%26showAccelerateBanner%3Dfalse%26sortBy%3Drelevance%26numRecords%3D100&clickType=listing
Loading page content..
Parsing HTML...
Parsing content...


  self.matches = self.content.find_all(text=self.pattern)


Writing listing for 1HGCV3F91LA015984 to file.
Trying http://www.autotrader.com//cars-for-sale/vehicledetails.xhtml?listingId=673767724&makeCodeList=HONDA&modelCodeList=ACCORD&city=Dallas&state=TX&zip=75201&searchRadius=50&marketExtension=include&isNewSearch=false&showAccelerateBanner=false&sortBy=relevance&numRecords=100&listingTypes=USED&referrer=%2Fcars-for-sale%2Fhonda%2Faccord%2Fdallas-tx-75201%3FsearchRadius%3D50%26marketExtension%3Dinclude%26isNewSearch%3Dfalse%26showAccelerateBanner%3Dfalse%26sortBy%3Drelevance%26numRecords%3D100&clickType=listing
Loading page content..
Parsing HTML...
Parsing content...
Writing listing for 1HGCV1F36JA039324 to file.
Trying http://www.autotrader.com//cars-for-sale/vehicledetails.xhtml?listingId=672968554&makeCodeList=HONDA&modelCodeList=ACCORD&city=Dallas&state=TX&zip=75201&searchRadius=50&marketExtension=include&isNewSearch=false&showAccelerateBanner=false&sortBy=relevance&numRecords=100&listingTypes=USED&referrer=%2Fcars-for-sale%2Fhonda%2Fac