In [1]:
import pandas as pd
import re

from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
from random import randint
from time import sleep, ctime

from config import driver, username, password, host, port, database
from sqlalchemy import create_engine

def init_browser():
    executable_path = {'executable_path': ChromeDriverManager().install()}
    return Browser('chrome', **executable_path, headless=False)

In [2]:
# Cities to search
cities = ['Los-Angeles_CA','New-York_NY', 'Chicago_IL', 'Houston_TX']
cities = ['Duarte_CA']

In [3]:
# Start browser
browser = init_browser()



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Get LATEST driver version for 93.0.4577
Trying to download new driver from https://chromedriver.storage.googleapis.com/93.0.4577.63/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\kesam\.wdm\drivers\chromedriver\win32\93.0.4577.63]


In [4]:
# Create blank lists/dictionaries to store attributes
prices = []
beds = []
baths = []
sizes = []
addresses = []
statuses = []
detail_pages = []

In [5]:
listing_indicators = {}
types = []
fees = []
pricesqfts = []
garages = []
years = []

In [None]:
## Identify listing attributes from result card

In [7]:
# Loop through each city
for city in cities:

    print(f"Searching {city}...{ctime()}\n")
    
    # Loop through each search result page
    for i in range(2, 3, 1):
        
        # Set dynamic URL
        url = f"https://www.realtor.com/realestateandhomes-search/{city}/pg-{i}"
        browser.visit(url)
        
        print(f"Scraping page {i}...{ctime()}")
        
        # HTML object
        html = browser.html
        
        # Parse HTML with Beautiful Soup
        soup = bs(html, "html.parser")

        # Identify all listings
        listings = soup.find_all('li', attrs={"data-testid": "result-card"})

        # Loop through each listing to identify attributes
        for listing in listings:          
            try:
                price = int(listing.find('span', attrs={"data-label": "pc-price"}).text.strip('$'))
                prices.append(price)
            except:
                prices.append('No Info')
            try:
                bed = int(listing.find('li', attrs={"data-label": "pc-meta-beds"}).text.strip('bed'))
                beds.append(bed)
            except:
                beds.append('No Info')
            try:
                bath = float(listing.find('li', attrs={"data-label": "pc-meta-baths"}).text.strip()[0])
                baths.append(bath)
            except:
                baths.append('No Info')
            try:
                size = listing.find('li', attrs={"data-label": "pc-meta-sqft"}).text.strip('sqft')
                sizes.append(size)
            except:
                sizes.append('No Info')
            try:
                address = listing.find('div', attrs={"data-label": "pc-address"}).text
                addresses.append(address)
            except:
                addresses.append('No Info')
            try:
                status = listing.find('span', attrs={"class": "jsx-3853574337 statusText"}).text
                statuses.append(status)
            except:
                statuses.append('No Info')

            
            # Identify URL to listing detail page
            detail_page = listing.find('a').get('href')
            
            # Append to list
            detail_pages.append(detail_page)
            
        # Generate random number between 2 to 10 seconds to wait before continuing loop
        sleep(randint(2,10))
            
    print("\n----------------------------\n")
    
print(f'Scraping complete...{ctime()}')

Searching Duarte_CA...Sun Sep 26 16:40:23 2021

Scraping page 2...Sun Sep 26 16:40:47 2021

----------------------------

Scraping complete...Sun Sep 26 16:40:56 2021


In [None]:
## Identify listing attributes from detail page

In [9]:
i = 1
num_page = len(detail_pages)
print(f"Total of {num_page} listings found\n")

# Loop through each listing detail page
for detail_page in detail_pages:
    
    print(f"Scraping details from listing {i} of {num_page}")
    
    # Navigate to each href
    detail_url = f"https://www.realtor.com{detail_page}"
    
    browser.visit(detail_url)
    
    # HTML object
    html = browser.html

    # Parse HTML with Beautiful Soup
    detail_soup = bs(html, "html.parser")

    # Identify all listing-indicators
    details = detail_soup.find('div', attrs={"data-testid": "listing-indicator"})
    re_li = re.compile('rui*')
    
    try:
        for x in details.find_all('li', re_li):
            listing_indicators[x.find_all('div', attrs={'class': re_li})[0].text] = x.find_all('div', attrs={'class': re_li})[1].text
            
        try:
            property_type = listing_indicators['Property Type']
            types.append(property_type)
        except:
            types.append('No Info')
        try:
            hoa_fee = listing_indicators['HOA Fees'].strip('/mo')
            hoa_fee = hoa_fee.strip('$')
            fees.append(hoa_fee)
        except:
            fees.append('No Info')
        try:
            pricesqft = listing_indicators['Price per sqft'].strip('$')
            pricesqfts.append(pricesqft)
        except:
            pricesqfts.append('No Info') 
        try:
            garage = listing_indicators['Garage'].strip(' car')
            garage = garage.strip(' cars')
            garages.append(garage)
        except:
            garages.append('No Info')
        try:
            year = listing_indicators['Year Built']
            years.append(year)
        except:
            years.append('No Info')

    except:
        types.append('No Info')
        fees.append('No Info')
        pricesqfts.append('No Info')
        garages.append('No Info')
        years.append('No Info')
        
        sleep(15)
    
    i = i + 1
    
#     Generate random number between 2 to 10 seconds to wait before continuing loop
    sleep(randint(2,10))
    
print(f'\nScraping complete...{ctime()}')

Total of 4 listings found

Scraping details from listing 1 of 4
Scraping details from listing 2 of 4
Scraping details from listing 3 of 4
Scraping details from listing 4 of 4

Scraping complete...Sun Sep 26 16:42:10 2021


In [10]:
# Add attributes to dataframe
df = pd.DataFrame({'Address': addresses, 'Status': statuses, 'Property Type': types, 'Price': prices, 
                   'Price per sqft': pricesqfts, 'HOA Fees': fees, 'Bed': beds, 'Bath': baths, 'Size': sizes, 
                   'Garage': garages, 'Year Built': years, 'Load DateTime': ctime()})

# Extract address into Street, City, State, Zip
street_city = df['Address'].str.split(',', expand=True)
street_city = street_city.rename(columns={0: 'Street', 1: 'City', 2: 'state_zip'})
state_zip = street_city['state_zip'].str.split(' ', expand=True)
state_zip = state_zip.rename(columns={1: 'State', 2: 'Zip'})
street_city = street_city.drop(columns='state_zip')
state_zip = state_zip.drop(columns=0)

street_city.reset_index(drop=True, inplace=True)
state_zip.reset_index(drop=True, inplace=True)

# Create merged_df
address_df = pd.concat([street_city, state_zip], axis=1) 

merged_df = pd.concat([df, address_df], axis=1) 

In [None]:
merged_df.to_csv('listings.csv', index=False)

In [11]:
browser.quit()

In [12]:
merged_df

Unnamed: 0,Address,Status,Property Type,Price,Price per sqft,HOA Fees,Bed,Bath,Size,Garage,Year Built,Load DateTime,Street,City,State,Zip
0,"Opal Cyn, Duarte, CA 91010",Contingent,No Info,No Info,No Info,No Info,No Info,No Info,No Info,No Info,No Info,Sun Sep 26 16:42:21 2021,Opal Cyn,Duarte,CA,91010
1,"1820 Huntington Dr, Duarte, CA 91010",Pending,Condo,No Info,448,412,2,2.0,1060,2,1985,Sun Sep 26 16:42:21 2021,1820 Huntington Dr,Duarte,CA,91010
2,"2074 Goodall Ave, Duarte, CA 91010",Pending,Single Family,No Info,588,412,3,1.0,927,1,1952,Sun Sep 26 16:42:21 2021,2074 Goodall Ave,Duarte,CA,91010
3,"3514 Conata St, Duarte, CA 91010",Pending,Single Family,No Info,448,412,4,2.0,1564,2,1966,Sun Sep 26 16:42:21 2021,3514 Conata St,Duarte,CA,91010


In [None]:
listings_df = merged_df.rename(columns ={"Address":"full_address",
                                        "Status":"status",
                                        "Property Type":"property_type",
                                        "Price":"price",
                                        "Price per sqft":"price_per_sq_ft",
                                        "HOA Fees":"hoa_fees",
                                        "Bed":"bed",
                                        "Bath":"bath",
                                        "Size":"built_area",
                                        "Garage":"parking_garage",
                                        "Year Built":"year_built",
                                        "Street":"street_address",
                                        "City":"city",
                                        "State":"state_cd",
                                        "Zip":"zip_code",
                                        "Load DateTime":"load_datetime"})
listings_df

In [3]:
connection_string = f"{driver}://{username}:{password}@{host}:{port}/{database}"
engine = create_engine(connection_string)
connection = engine.connect()

In [None]:
listings_df.to_sql('listings',connection, if_exists='append', index=False)
listings_df.head()

In [None]:
new_listings_df = pd.read_sql_table('listings', connection)
new_listings_df