# ETL Project
## Irvine Company Apartments


In [1]:
# set environment
from bs4 import BeautifulSoup
from datetime import date
import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
from sqlalchemy import create_engine, insert
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from config import password

In [3]:
# define a function that communicates with the browser
# and scrapes the page
def make_soup(url, browser, tag, class_name, find_all):
    """Visit *url* in *browser* and search the resulting HTML for a tag.

    Args:
        url: Address to load in the browser.
        browser: Object exposing ``visit(url)`` and an ``html`` attribute.
        tag: Name of the HTML tag to look for.
        class_name: CSS class the tag must carry.
        find_all: When truthy return every match, otherwise only the first.

    Returns:
        The result of ``BeautifulSoup.find_all`` when *find_all* is truthy,
        otherwise the result of ``BeautifulSoup.find`` (``None`` if no match).
    """
    browser.visit(url)
    parsed_page = BeautifulSoup(browser.html, "html.parser")

    # pick the lookup method once, then run it with the shared arguments
    lookup = parsed_page.find_all if find_all else parsed_page.find
    return lookup(tag, class_=class_name)

In [4]:
# connect to SQL database
# Builds an engine for the PostgreSQL database ETL_IrvineCoApts_db on
# localhost:5432 (user "postgres", password imported from config) and opens
# the connection that the ORM session is later bound to.
engine = create_engine(f'postgresql://postgres:{password}@localhost:5432/ETL_IrvineCoApts_db')
connection = engine.connect()

In [5]:
# Reflect an existing database into a new model
# automap_base() creates a declarative base; prepare() then reflects the
# tables reachable through the engine and generates mapped classes for them,
# which are exposed under Base.classes.
Base = automap_base()
Base.prepare(engine, reflect=True)

In [6]:
# bind the automap-generated classes for the three reflected tables
# (cities, complex, apartments) to module-level names
Cities, Complex, Apartments = (
    Base.classes.cities,
    Base.classes.complex,
    Base.classes.apartments,
)

In [7]:
# initiate a database session
# The session is bound to the already-open connection rather than to the
# engine, so all ORM queries and commits go through that one connection.
session = Session(connection)

In [8]:
# setup splinter
# ChromeDriverManager().install() supplies the chromedriver location that is
# handed to splinter as the executable_path keyword argument.
executable_path = {'executable_path': ChromeDriverManager().install()}
# headless=False asks for a non-headless Chrome instance
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324






[WDM] - Driver [C:\Users\kate_\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


In [9]:
# site root, plus the locations index page derived from it
base_url = "https://www.irvinecompanyapartments.com"
loc_url = f"{base_url}/locations"

In [10]:
# Scrape every city page listed on the locations page, then every complex
# page linked from each city page.  For each complex the city and complex rows
# are inserted into the database when missing, and [complex_url, complex_name]
# is recorded in complex_list for the availability scrape that follows.

# initialize empty lists for cities and complexes
complex_list = []
city_list = []
# city name -> city_id, so the database is asked about each city only once
city_ids = {}

# scrape the locations page for all cities that have ICA complexes
location_soup = make_soup(loc_url, browser, "li", "submarket-listing-item__sub-market-list__item", True)

# loop through each city entry scraped
for loc in location_soup:

    # pull out the relative path for the city page; skip entries without one
    city_link = loc.find("a", class_="link")
    if city_link is None or not city_link.get("href"):
        continue
    city_url = base_url + city_link["href"]

    # now scrape the city page for each complex in that city
    complex_cards = make_soup(city_url, browser, "div", "search-result-item-card__cta-container--bottom", True)

    # loop through each complex card found
    for card in complex_cards:
        # pull the first element that carries an href attribute
        link = card.find("a", href=True)
        if link is None:
            continue

        # read the attribute directly instead of re-parsing the tag's text,
        # which broke on attribute order and entity-escaped characters
        complex_url = link["href"]

        # scrape the complex page for the complex name
        name_tag = make_soup(complex_url, browser, "h1", "sticky-header__title-heading", False)
        if name_tag is None:
            continue
        complex_name = name_tag.text

        # derive the contact page for the complex and look for its address
        contact_url = complex_url + "#contact"
        contact_tag = make_soup(contact_url, browser, "div", "contactus-leasing-address", False)
        addr_lines = contact_tag.text.split("\n") if contact_tag is not None else []
        if len(addr_lines) < 2:
            # the address is taken from the second line of the block
            print(f"No leasing address found for {complex_name}; skipping {complex_url}")
            continue
        complex_addr = addr_lines[1]

        # pull out the area/county and city name from the URL path
        addr_pieces = complex_url.split("/")
        if len(addr_pieces) < 6:
            print(f"Unexpected complex URL layout; skipping {complex_url}")
            continue
        area_name = addr_pieces[4].replace("-", " ").title()
        city_name = addr_pieces[5].replace("-", " ").title()

        # San Diego gives us problems: use the area name as the city for
        # everything in that area except Carlsbad
        if area_name == "San Diego" and city_name != "Carlsbad":
            city_name = area_name

        # look the city up (inserting it first when new) unless we already
        # know its id from an earlier complex
        city_id = city_ids.get(city_name)
        if city_id is None:
            result = session.query(Cities.city_id).filter(Cities.city_name == city_name).first()
            if result is None:
                # not in the database yet, so insert with placeholder stats
                new_city = Cities(city_name=city_name, population=0, cost_of_living=0,
                                  median_income=0, median_age=0)
                session.add(new_city)
                session.commit()
                # re-read to obtain the id assigned by the database
                result = session.query(Cities.city_id).filter(Cities.city_name == city_name).first()
            city_id = result[0]
            city_ids[city_name] = city_id
            # save the city name so we don't check again
            city_list.append(city_name)

        # check database for existence of complex
        result = session.query(Complex.complex_id).filter(Complex.complex_name == complex_name).first()
        if result is None:
            # new complex, so insert
            new_complex = Complex(complex_name=complex_name, complex_address=complex_addr,
                                  complex_url=complex_url, city_id=city_id)
            session.add(new_complex)
        session.commit()

        # recorded only once the complex is known to be stored, so the
        # availability scrape can always resolve its complex_id
        complex_list.append([complex_url, complex_name])
        

    

In [11]:
# loop through list of complexes and build URL for scraping
# available units
from datetime import datetime


def _price_to_int(price_text):
    """Return the whole-number price in *price_text*, or 0 when non-numeric.

    Surrounding whitespace, "$" and "," are removed first.  Anything that is
    still not a plain run of decimal digits (e.g. "Call for pricing") gives 0.
    """
    digits = price_text.strip().replace("$", "").replace(",", "")
    return int(digits) if digits.isdecimal() else 0


def _resolve_availability(avail_text, today):
    """Return ``(available_date, vacant)`` for a scraped availability string.

    "Today", or a date on or before *today*, yields today's date formatted as
    MM/DD/YYYY with ``vacant=True``.  A later date is returned unchanged with
    ``vacant=False``.  Dates are compared as calendar dates, not as strings.

    NOTE(review): assumes the site shows dates as MM/DD/YYYY (the format used
    for curr_date) - confirm.  Text that does not parse in that format is
    returned unchanged and treated as not vacant.
    """
    avail_text = avail_text.strip()
    today_text = today.strftime("%m/%d/%Y")
    if avail_text == "Today":
        return today_text, True
    try:
        avail_day = datetime.strptime(avail_text, "%m/%d/%Y").date()
    except ValueError:
        return avail_text, False
    if avail_day <= today:
        return today_text, True
    return avail_text, False


# grab the current date once so every row written by this run agrees
today = date.today()
curr_date = today.strftime("%m/%d/%Y")
list_start_date = curr_date

for complex_url, complex_name in complex_list:

    # get complex id from database; skip complexes that were never stored
    result = session.query(Complex.complex_id).filter(Complex.complex_name == complex_name).first()
    if result is None:
        print(f"Complex not found in database; skipping {complex_name}")
        continue
    complex_id = result[0]

    # derive URL for available units page for current complex
    avail_url = complex_url.replace(".html", "/availability.html#floor-plan-list")

    # scrape availability page
    floor_plans = make_soup(avail_url, browser, "div", "fapt-fp-list-item", True)

    # loop through each floor plan found
    for plan in floor_plans:

        # a missing element makes .text raise AttributeError; report it and
        # move on to the next floor plan
        try:
            # collect floor plan level data
            plan_name = plan.find('div', class_="fapt-fp-list-item__column--plan-name").text
            unit_type = plan.find('div', class_="fapt-fp-list-item__column--beds-baths").text
            sq_ft = plan.find('div', class_="fapt-fp-list-item__column--sqft").text
            # NOTE(review): sq_ft is stored exactly as scraped - confirm the
            # column accepts that text

            # "Call for pricing" and any other non-numeric text become 0
            start_price = _price_to_int(
                plan.find('div', class_="fapt-fp-list-item__column--price").text)

            # scrape all units listed
            for unit in plan.find_all("div", class_="fapt-fp-unit__table-row"):
                # rows without a unit name are not units
                unit_name = unit.find("span", class_="fapt-fp-unit__unit-name-text")
                if unit_name is None:
                    continue

                unit_id = unit_name.text
                curr_price = _price_to_int(
                    unit.find("div", class_="fapt-fp-unit__column-inner--price").text)
                avail_text = unit.find("div", class_="fapt-fp-unit__column-inner--available").span.text

                # units available today or earlier are flagged vacant
                avail_date, vacant = _resolve_availability(avail_text, today)

                # check database for existence of the unit
                result = session.query(Apartments.apartment_id) \
                                .filter(Apartments.complex_id == complex_id) \
                                .filter(Apartments.unit_id == unit_id).first()
                if result is None:
                    # new unit, so insert
                    new_unit = Apartments(complex_id=complex_id, unit_id=unit_id, sq_ft=sq_ft,
                                          plan_name=plan_name, apt_type=unit_type,
                                          start_price=start_price, vacant=vacant,
                                          curr_price=curr_price, list_start_date=list_start_date,
                                          available_date=avail_date, curr_date=curr_date)
                    session.add(new_unit)
                else:
                    # already exists, so refresh price, vacancy and scrape date
                    session.query(Apartments) \
                           .filter(Apartments.complex_id == complex_id) \
                           .filter(Apartments.unit_id == unit_id) \
                           .update({Apartments.curr_price: curr_price,
                                    Apartments.vacant: vacant,
                                    Apartments.curr_date: curr_date})

                # commit data to database
                session.commit()

        except AttributeError as e:
            print(e)
        

In [12]:
# close session, release the database connection it was bound to, and shut
# down the Chrome window started for scraping so nothing is left running
session.close()
connection.close()
browser.quit()