# ETL Project
## Irvine Company Apartments


In [1]:
# set environment
import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

from sqlalchemy import create_engine, insert
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from config import password

In [2]:
# Launch a visible (non-headless) Chrome window driven by splinter.
# ChromeDriverManager().install() returns the path of a chromedriver binary,
# which is handed to splinter through the `executable_path` keyword.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Driver [C:\Users\Tolga\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache






## Build URL for scraping cost-of-living index
- Near-future release: use a drop-down menu for choosing the region and date

In [3]:
# Numbeo page listing the cost-of-living index by city for a chosen region
base_url = "https://www.numbeo.com/cost-of-living/region_rankings.jsp"

# Region selector; 21 is the North America region
region_idx = 21

# Year of the data acquisition
year = 2021

# When True, "-mid" is appended to the year in the query string.
# NOTE(review): presumably selects the site's mid-year edition -- confirm.
mid_year = False

In [4]:
# Edition label for the "title" query parameter: the year, optionally
# followed by "-mid"
date_str = f"{year:d}-mid" if mid_year else f"{year:d}"

# Region code, zero-padded to three digits, for the "region" query parameter
region_str = f"{region_idx:03d}"

# Full URL of the rankings page to scrape
cost_of_living_idx_url = f"{base_url}?title={date_str}&region={region_str}"

In [5]:
# Load the rankings page in the automated browser and capture its HTML
browser.visit(cost_of_living_idx_url)
html = browser.html
# NOTE(review): the browser window stays open after this cell; call
# browser.quit() once no further pages need to be visited.


In [6]:
# Scrape the rankings table into a pandas dataframe and reduce it to
# California cities indexed by city name.
from io import StringIO  # local import keeps this cell self-contained

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas releases.
# NOTE(review): index 1 assumes the rankings table is the second table on the
# page -- confirm if the page layout changes.
rankings_df = pd.read_html(StringIO(html))[1]

# Split the "City" entries on ", " into separate columns
# (US rows look like "<city>, <state>, United States")
address_df = rankings_df["City"].str.split(", ", expand=True)
rankings_df["City"] = address_df[0]    # replaces the old city entry
rankings_df["State"] = address_df[1]   # adds a column State

# Keep only the cities inside United States (third component of the split)
rankings_df = rankings_df[address_df[2] == "United States"]

# Keep only California cities and reindex by city name. set_index returns a
# new frame, which avoids modifying a filtered slice in place
# (SettingWithCopyWarning).
rankings_df_CA = rankings_df[rankings_df["State"] == "CA"].set_index("City")

## Add the cost of living data to the database

In [7]:
# Connect to the PostgreSQL database
from urllib.parse import quote  # local import keeps this cell self-contained

# Percent-encode the password so characters such as "@", ":", "/" or "%"
# cannot be mistaken for URL delimiters when the connection URL is parsed.
# NOTE(review): assumes config.password holds the raw (not already
# URL-encoded) password -- confirm.
engine = create_engine(
    f'postgresql://postgres:{quote(str(password), safe="")}@localhost:5432/ETL_IrvineCoApts_db'
)
connection = engine.connect()

In [8]:
# Reflect an existing database into a new model: automap_base() creates the
# base class and prepare() generates mapped classes from the tables found
# through `engine`.
# NOTE(review): `reflect=True` is the legacy calling form; newer SQLAlchemy
# releases document `Base.prepare(autoload_with=engine)` instead -- confirm
# the installed version before changing it.
Base = automap_base()
Base.prepare(engine, reflect=True)

In [9]:
# Create references to our tables: `Cities` is the automapped class for the
# reflected `cities` table, used below to query, insert and update rows.
Cities = Base.classes.cities

In [10]:
# Initiate a database session bound to the connection opened above; all ORM
# queries and writes below go through it.
session = Session(connection)

In [11]:
# Add the cities information to the database: insert cities that are not yet
# present and refresh the cost-of-living value of those that already exist.
# Changes are committed once at the end; on any failure the transaction is
# rolled back, and the session is closed in every case.
try:
    # Iterating the column yields (city name, index value) pairs directly,
    # avoiding a chained .loc lookup per row.
    for name, cost_of_living in rankings_df_CA["Cost of Living Index"].items():
        # Convert the NumPy scalar to a built-in float before handing it to
        # the database layer.
        cost_of_living = float(cost_of_living)
        city = session.query(Cities).filter(Cities.city_name == name).first()
        if city is None:   # City is not present in the database
            session.add(
                Cities(
                    city_name=name,
                    cost_of_living=cost_of_living,
                    med_income=0,
                    avg_rent=0,
                    population=0,
                )
            )
        else:              # Update the cost-of-living value
            city.cost_of_living = cost_of_living
    session.commit()
except Exception:
    # Leave the database untouched if anything went wrong, then re-raise so
    # the failure stays visible.
    session.rollback()
    raise
finally:
    session.close()