### Gather dependencies

In [None]:
# Import Dependencies
import os
import pandas as pd
import requests
import json
import csv
import datetime as dt
from dateutil.relativedelta import relativedelta 
import psycopg2
import psycopg2.extras as extras 
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine


### Read CSV files and create initial DFs

In [2]:
# Read csv files
# Each CSV is parsed once and bound to two names; the *_csv name and the
# shorter working name refer to the same DataFrame object (no copy is made).
airbnb = airbnb_csv = pd.read_csv("./Data/AB_US_2020.csv", low_memory=False)
airports = airports_csv = pd.read_csv("./Data/airports.csv", low_memory=False)
# Preview the frames; in a notebook cell only the last expression is echoed.
airbnb.head()
airports.head()

Unnamed: 0,IATA,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABQ,Albuquerque International,Albuquerque,NM,USA,35.040222,-106.609194
1,ANC,Ted Stevens Anchorage International,Anchorage,AK,USA,61.17432,-149.996186
2,ATL,William B Hartsfield-Atlanta Intl,Atlanta,GA,USA,33.640444,-84.426944
3,AUS,Austin-Bergstrom International,Austin,TX,USA,30.194533,-97.669872
4,BDL,Bradley International,Windsor Locks,CT,USA,41.938874,-72.683228


### Create engine from SQL Alchemy for PostgreSQL

In [None]:
# Create connection to PostgreSQL
# Connection settings are read from the local config module.
from config import username, password, port, dbase
# The URL must end right after the database name: a trailing space would be
# parsed as part of the name and point at a different database.
engine = create_engine(f'postgresql://{username}:{password}@localhost:{port}/{dbase}')

### Populate staging tables

In [4]:
# SQL insert function to insert airbnb_csv into a PostgreSQL table.

def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` using ``conn``.

    The DataFrame's column names are joined verbatim into the INSERT column
    list, so they have to match the column names of the target table.

    Args:
        conn: Open psycopg2 connection. It is committed when the insert
            succeeds and rolled back when it fails; it is not closed here.
        df: pandas DataFrame whose rows are inserted.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so pass trusted identifiers only.

    Returns:
        ``1`` when the insert failed (the error is printed and the
        transaction rolled back); ``None`` on success.
    """
    # NOTE(review): cell values are passed through as-is, so missing values
    # stay NaN rather than None -- confirm that is what the table expects.
    rows = [tuple(row) for row in df.to_numpy()]
    columns = ','.join(df.columns)
    # "%%s" survives the %-formatting as a literal "%s": the single
    # placeholder that extras.execute_values expands into the VALUES list.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, columns)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print("Error: %s" % error)
        conn.rollback()
        return 1
    else:
        print("the dataframe is inserted")
    finally:
        # Always release the cursor, including when rollback itself raises.
        cursor.close()


# NOTE(review): credentials are hard-coded here although the engine cell reads
# them from the config module -- consider sourcing them from config as well.
conn = psycopg2.connect(
    database="AirBnB", user='postgres', password='bootcamp', host='127.0.0.1', port='5432'
)
try:
    # Stage the raw Airbnb listings file as-is.
    df = pd.read_csv('./Data/AB_US_2020.csv', low_memory=False)
    execute_values(conn, df, 'stg_airbnbs')
finally:
    # This connection is only used for this load, so release it even if the
    # read or the insert fails.
    conn.close()

the dataframe is inserted


In [5]:
## SQL insert function to insert airports into a PostgreSQL table.

def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` using ``conn``.

    The DataFrame's column names are joined verbatim into the INSERT column
    list, so they have to match the column names of the target table.

    Args:
        conn: Open psycopg2 connection. It is committed when the insert
            succeeds and rolled back when it fails; it is not closed here.
        df: pandas DataFrame whose rows are inserted.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so pass trusted identifiers only.

    Returns:
        ``1`` when the insert failed (the error is printed and the
        transaction rolled back); ``None`` on success.
    """
    rows = [tuple(row) for row in df.to_numpy()]
    columns = ','.join(df.columns)
    # "%%s" survives the %-formatting as a literal "%s": the single
    # placeholder that extras.execute_values expands into the VALUES list.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, columns)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print("Error: %s" % error)
        conn.rollback()
        return 1
    else:
        print("the dataframe is inserted")
    finally:
        # Always release the cursor, including when rollback itself raises.
        cursor.close()


# NOTE(review): credentials are hard-coded here although the engine cell reads
# them from the config module -- consider sourcing them from config as well.
conn = psycopg2.connect(
    database="AirBnB", user='postgres', password='bootcamp', host='127.0.0.1', port='5432'
)
try:
    # Only the listed columns of the airports file are staged.
    df = pd.read_csv(
        './Data/airports.csv',
        usecols=['IATA', 'AIRPORT', 'CITY', 'STATE', 'LATITUDE', 'LONGITUDE'],
        low_memory=False,
    )
    execute_values(conn, df, 'stg_airports')
finally:
    # This connection is only used for this load, so release it even if the
    # read or the insert fails.
    conn.close()

the dataframe is inserted


In [8]:
# US Cities
# Keep the city name and its coordinates, spelling the coordinate columns out
# in full; the 'city' column keeps its name.
cities = (
    pd.read_csv('./Data/uscities.csv', usecols=['city', 'lat', 'lng'])
    .rename(columns={'lat': 'latitude', 'lng': 'longitude'})
)
cities.head()

Unnamed: 0,city,latitude,longitude
0,New York,40.6943,-73.9249
1,Los Angeles,34.1141,-118.4068
2,Chicago,41.8375,-87.6866
3,Miami,25.784,-80.2101
4,Dallas,32.7935,-96.7667


### Load US cities to DB

In [9]:
# Load US cities to database
def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` using ``conn``.

    The DataFrame's column names are joined verbatim into the INSERT column
    list, so they have to match the column names of the target table.

    Args:
        conn: Open psycopg2 connection. It is committed when the insert
            succeeds and rolled back when it fails; it is not closed here.
        df: pandas DataFrame whose rows are inserted.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so pass trusted identifiers only.

    Returns:
        ``1`` when the insert failed (the error is printed and the
        transaction rolled back); ``None`` on success.
    """
    rows = [tuple(row) for row in df.to_numpy()]
    columns = ','.join(df.columns)
    # "%%s" survives the %-formatting as a literal "%s": the single
    # placeholder that extras.execute_values expands into the VALUES list.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, columns)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print("Error: %s" % error)
        conn.rollback()
        return 1
    else:
        print("the dataframe is inserted")
    finally:
        # Always release the cursor, including when rollback itself raises.
        cursor.close()


# NOTE(review): credentials are hard-coded here although the engine cell reads
# them from the config module -- consider sourcing them from config as well.
conn = psycopg2.connect(
    database="AirBnB", user='postgres', password='bootcamp', host='127.0.0.1', port='5432'
)
try:
    # Rows that repeat the same city/latitude/longitude are loaded only once.
    df = cities.drop_duplicates()
    execute_values(conn, df, 'us_cities')
finally:
    # This connection is only used for this load, so release it even if the
    # insert fails.
    conn.close()

the dataframe is inserted


### Load Dimension/Fact tables

In [10]:
# Hosts DF
hosts_df = airbnb[['host_id', 'host_name', 'calculated_host_listings_count']]
# Get unique values. drop_duplicates() returns a new frame instead of changing
# hosts_df in place, so the result must be assigned back to take effect.
hosts_df = hosts_df.drop_duplicates()
hosts_df.head()

Unnamed: 0,host_id,host_name,calculated_host_listings_count
0,165529,Evelyne,1
1,427027,Celeste,11
2,320564,Lisa,2
3,746673,BonPaul,5
4,769252,Elizabeth,1


In [11]:
# Room Type DF
# Single-column frame with one row per row of the airbnb frame.
room_type_df = airbnb.loc[:, ['room_type']]
# Display the distinct room types. The result is not stored, so room_type_df
# itself still holds the duplicated values.
room_type_df.drop_duplicates().reset_index()

Unnamed: 0,index,room_type
0,0,Private room
1,1,Entire home/apt
2,100,Hotel room
3,185,Shared room


In [12]:
# Room type dimension load
def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` using ``conn``.

    The DataFrame's column names are joined verbatim into the INSERT column
    list, so they have to match the column names of the target table.

    Args:
        conn: Open psycopg2 connection. It is committed when the insert
            succeeds and rolled back when it fails; it is not closed here.
        df: pandas DataFrame whose rows are inserted.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so pass trusted identifiers only.

    Returns:
        ``1`` when the insert failed (the error is printed and the
        transaction rolled back); ``None`` on success.
    """
    rows = [tuple(row) for row in df.to_numpy()]
    columns = ','.join(df.columns)
    # "%%s" survives the %-formatting as a literal "%s": the single
    # placeholder that extras.execute_values expands into the VALUES list.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, columns)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print("Error: %s" % error)
        conn.rollback()
        return 1
    else:
        print("the dataframe is inserted")
    finally:
        # Always release the cursor, including when rollback itself raises.
        cursor.close()


# NOTE(review): credentials are hard-coded here although the engine cell reads
# them from the config module -- consider sourcing them from config as well.
conn = psycopg2.connect(
    database="AirBnB", user='postgres', password='bootcamp', host='127.0.0.1', port='5432'
)
try:
    # One row per distinct room type.
    df = room_type_df.drop_duplicates()
    execute_values(conn, df, 'room_types')
finally:
    # This connection is only used for this load, so release it even if the
    # insert fails.
    conn.close()

the dataframe is inserted


In [14]:
# Airports
apt = pd.read_csv(
    './Data/airports.csv',
    usecols=['IATA', 'AIRPORT', 'CITY', 'STATE', 'LATITUDE', 'LONGITUDE'],
    low_memory=False,
)
# Drop repeated rows, then map the upper-case CSV headers onto the lower-case
# column names used when the frame is inserted.
airports = apt.drop_duplicates().rename(
    columns={
        'IATA': 'iata',
        'AIRPORT': 'airport_name',
        'CITY': 'city',
        'STATE': 'state',
        'LATITUDE': 'latitude',
        'LONGITUDE': 'longitude',
    }
)

# Load Airports
def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` using ``conn``.

    The DataFrame's column names are joined verbatim into the INSERT column
    list, so they have to match the column names of the target table.

    Args:
        conn: Open psycopg2 connection. It is committed when the insert
            succeeds and rolled back when it fails; it is not closed here.
        df: pandas DataFrame whose rows are inserted.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so pass trusted identifiers only.

    Returns:
        ``1`` when the insert failed (the error is printed and the
        transaction rolled back); ``None`` on success.
    """
    rows = [tuple(row) for row in df.to_numpy()]
    columns = ','.join(df.columns)
    # "%%s" survives the %-formatting as a literal "%s": the single
    # placeholder that extras.execute_values expands into the VALUES list.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, columns)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print("Error: %s" % error)
        conn.rollback()
        return 1
    else:
        print("the dataframe is inserted")
    finally:
        # Always release the cursor, including when rollback itself raises.
        cursor.close()


# NOTE(review): credentials are hard-coded here although the engine cell reads
# them from the config module -- consider sourcing them from config as well.
conn = psycopg2.connect(
    database="AirBnB", user='postgres', password='bootcamp', host='127.0.0.1', port='5432'
)
try:
    df = airports
    execute_values(conn, df, 'airports')
finally:
    # This connection is only used for this load, so release it even if the
    # insert fails.
    conn.close()

the dataframe is inserted


In [15]:
# Hosts dimension load
def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` using ``conn``.

    The DataFrame's column names are joined verbatim into the INSERT column
    list, so they have to match the column names of the target table.

    Args:
        conn: Open psycopg2 connection. It is committed when the insert
            succeeds and rolled back when it fails; it is not closed here.
        df: pandas DataFrame whose rows are inserted.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so pass trusted identifiers only.

    Returns:
        ``1`` when the insert failed (the error is printed and the
        transaction rolled back); ``None`` on success.
    """
    rows = [tuple(row) for row in df.to_numpy()]
    columns = ','.join(df.columns)
    # "%%s" survives the %-formatting as a literal "%s": the single
    # placeholder that extras.execute_values expands into the VALUES list.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, columns)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print("Error: %s" % error)
        conn.rollback()
        return 1
    else:
        print("the dataframe is inserted")
    finally:
        # Always release the cursor, including when rollback itself raises.
        cursor.close()


# NOTE(review): credentials are hard-coded here although the engine cell reads
# them from the config module -- consider sourcing them from config as well.
conn = psycopg2.connect(
    database="AirBnB", user='postgres', password='bootcamp', host='127.0.0.1', port='5432'
)
try:
    # De-duplicate on all three host columns before loading.
    df = hosts_df.drop_duplicates()
    execute_values(conn, df, 'hosts')
finally:
    # This connection is only used for this load, so release it even if the
    # insert fails.
    conn.close()

the dataframe is inserted


In [16]:
# Create airbnb fact table df
import pandas as pd
from sqlalchemy import create_engine
from config import username, password, port, dbase
engine = create_engine(f'postgresql://{username}:{password}@localhost:{port}/{dbase}')

# Read the staging table and both dimension tables, then close the connection
# as soon as the three queries are done instead of leaving it open.
with engine.connect() as connection:
    df1 = pd.read_sql('select * from stg_airbnbs', connection)
    df2 = pd.read_sql('select id, host_id, host_name from hosts', connection)
    df3 = pd.read_sql('select room_id, room_type from room_types', connection)

# Attach the host dimension rows to every staged listing by host_id.
# NOTE(review): if hosts holds several rows for one host_id this join repeats
# the listing once per match -- verify host_id is unique in hosts.
merge1 = pd.merge(df1, df2, how='left', on='host_id')
# Look up room_id from the room type text.
# NOTE(review): how='outer' also keeps room types that match no listing, which
# would yield a row with empty listing columns -- confirm 'left' is not meant.
merge2 = pd.merge(merge1, df3, on='room_type', how='outer')
airbnb_df = merge2[[
    'name', 'host_id', 'latitude', 'longitude', 'room_id', 'price',
    'minimum_nights', 'number_of_reviews', 'reviews_per_month',
    'availability_365', 'city',
]]
# Rename columns to align with table
airbnb_df = airbnb_df.rename(columns={'name': 'airbnb_name'})

airbnb_df.head()

Unnamed: 0,airbnb_name,host_id,latitude,longitude,room_id,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365,city
0,Charming Victorian home - twin beds + breakfast,165529,35.65146,-82.62792,1,60.0,1,138,1.14,0,Asheville
1,Historic Grove Park,769252,35.61442,-82.54127,1,125.0,30,58,0.52,0,Asheville
2,Blue Gate West,1098412,35.58345,-82.59713,1,48.0,1,137,1.35,0,Asheville
3,Asheville Dreamer's Cabin,1292070,35.59635,-82.50655,1,65.0,3,57,0.53,106,Asheville
4,Walk Downtown private bath peaceful,12874214,35.60371,-82.55621,1,85.0,2,338,3.36,0,Asheville


In [18]:
#Insert into airbnb table

def execute_values(conn, df, table):
    """Bulk-insert every row of ``df`` into ``table`` using ``conn``.

    The DataFrame's column names are joined verbatim into the INSERT column
    list, so they have to match the column names of the target table.

    Args:
        conn: Open psycopg2 connection. It is committed when the insert
            succeeds and rolled back when it fails; it is not closed here.
        df: pandas DataFrame whose rows are inserted.
        table: Name of the destination table. It is interpolated directly
            into the SQL text, so pass trusted identifiers only.

    Returns:
        ``1`` when the insert failed (the error is printed and the
        transaction rolled back); ``None`` on success.
    """
    rows = [tuple(row) for row in df.to_numpy()]
    columns = ','.join(df.columns)
    # "%%s" survives the %-formatting as a literal "%s": the single
    # placeholder that extras.execute_values expands into the VALUES list.
    query = "INSERT INTO %s(%s) VALUES %%s" % (table, columns)
    cursor = conn.cursor()
    try:
        extras.execute_values(cursor, query, rows)
        conn.commit()
    except Exception as error:  # psycopg2.DatabaseError is an Exception subclass
        print("Error: %s" % error)
        conn.rollback()
        return 1
    else:
        print("the dataframe is inserted")
    finally:
        # Always release the cursor, including when rollback itself raises.
        cursor.close()


# NOTE(review): credentials are hard-coded here although the engine cell reads
# them from the config module -- consider sourcing them from config as well.
conn = psycopg2.connect(
    database="AirBnB", user='postgres', password='bootcamp', host='127.0.0.1', port='5432'
)
try:
    # Drop fact rows that are identical across every selected column.
    df = airbnb_df.drop_duplicates()
    execute_values(conn, df, 'airbnbs')
finally:
    # This connection is only used for this load, so release it even if the
    # insert fails.
    conn.close()


the dataframe is inserted
