In [None]:
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

sns.set()
%matplotlib inline


## What are we doing here

### Things to do
<ol>
    <li>Read different info from CSVs</li>
    <li>Create quick data dictionary</li>
    <li>Clean the data</li>
    <li>Decide what we import to the DB</li>
    <li>Load data into amazon postgres database</li>
    <ol style="list-style-type: lower-alpha; padding-bottom: 0;">
      <li style="margin-left:1em">How to connect / start a session</li>
      <li style="margin-left:1em">How to define schema</li>
      <li style="margin-left:1em">How to create a table</li>
      <li style="margin-left:1em; padding-bottom: 0;">How to load data</li>
     </ol>
    <li>Write some good queries</li>
    <li>Create an ETL of Occupancy using jupyter</li> 
</ol>

In [None]:
# glob() returns paths in arbitrary, OS-dependent order. Sort them so that
# positional lookups such as files[2] pick the same file on every run/machine.
files = sorted(glob("Reservations/*.csv"))
files

In [None]:
# The RIDB segments the sales data by year, lets check to make sure it has the same info in each file.
# Only the first 10 rows of each file are read: we just need the header.
columns_data = {
    path: pd.read_csv(path, nrows=10).columns.values
    for path in files
}

# Collapse the per-file headers into the set of distinct headers. The cell
# displays how many different layouts exist: 1 means every sales file has the
# same columns in the same order.
distinct_headers = {tuple(cols) for cols in columns_data.values()}
len(distinct_headers)
    

In [None]:
# lets write some quick functions to process our data

def read_data(file):
    """Load one CSV file into a DataFrame.

    The path is printed first so progress is visible when several files are
    read in a row. ``low_memory=False`` makes pandas parse the file in a single
    pass instead of in chunks.
    """
    print(file)
    frame = pd.read_csv(file, low_memory=False)
    return frame

def trim_select_data(df):
    """Trim a raw reservations frame down to cleaned overnight campsite rows.

    Steps, in order:
      1. keep only the columns listed in ``column_map`` and rename them to
         snake_case;
      2. keep only rows whose ``use_type`` is ``"Overnight"`` and whose
         ``entity_type`` is ``"Site"``;
      3. parse ``start_date``, ``end_date`` and ``order_date`` as
         ``%Y-%m-%d`` datetimes, turning unparseable values into ``NaT``;
      4. replace missing ``facility_id`` / ``entity_id`` with ``-1`` and cast
         both columns to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reservations data; must contain every source column named in
        ``column_map`` (a ``KeyError`` is raised otherwise).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    # Source column -> output column. A single mapping keeps each pair
    # together, so the rename cannot drift out of alignment the way two
    # parallel lists can. Insertion order defines the output column order.
    column_map = {
        'HistoricalReservationID': 'historical_reservation_id',
        'OrderNumber': 'order_number',
        'Agency': 'agency',
        'OrgID': 'orgid',
        'CodeHierarchy': 'code_hierarchy',
        'RegionCode': 'region_code',
        'RegionDescription': 'region_description',
        'ParentLocationID': 'parent_location_id',
        'ParentLocation': 'parent_location',
        'LegacyFacilityID': 'legacy_facility_id',
        'Park': 'park',
        'SiteType': 'site_type',
        'UseType': 'use_type',
        'ProductID': 'product_id',
        'EntityType': 'entity_type',
        'EntityID': 'entity_id',
        'FacilityID': 'facility_id',
        'FacilityZIP': 'facility_zip',
        'FacilityState': 'facility_state',
        'FacilityLongitude': 'facility_longitude',
        'FacilityLatitude': 'facility_latitude',
        'CustomerZIP': 'customer_zip',
        'CustomerState': 'customer_state',
        'CustomerCountry': 'customer_country',
        'TotalPaid': 'total_paid',
        'StartDate': 'start_date',
        'EndDate': 'end_date',
        'OrderDate': 'order_date',
        'NumberOfPeople': 'number_of_people',
    }

    # first trim to only cols we care about, then rename them
    trimmed = df.loc[:, list(column_map)].rename(columns=column_map)

    # Lets only select overnight stays at campsites.
    # .copy() makes the filtered result an independent frame, so the column
    # assignments below do not raise SettingWithCopyWarning.
    keep_row = (trimmed['use_type'] == "Overnight") & (trimmed['entity_type'] == 'Site')
    trimmed = trimmed.loc[keep_row, :].copy()

    # We will need to coerce the dates into datetime as some of the data isnt clean
    for date_col in ("start_date", "end_date", "order_date"):
        trimmed[date_col] = pd.to_datetime(
            trimmed[date_col], errors="coerce", format="%Y-%m-%d"
        )

    # for some reasons some of these reservations dont have facility ids,
    # we will replace with -1 as a flag
    for id_col in ("facility_id", "entity_id"):
        trimmed[id_col] = trimmed[id_col].fillna(-1).astype(int)

    return trimmed


In [None]:
# Load and clean a single file, chosen by its position in the `files` list.
df = read_data(files[2])
df = trim_select_data(df)

# for some reasons some transactions are here multiple times. Lets remove the duplicates
# (keep the first row seen for each order_number). Reassigning instead of using
# inplace=True avoids mutating the frame returned by trim_select_data.
df = df.drop_duplicates(subset=['order_number'], keep='first')

# Last expression of the cell, so the notebook actually displays it:
# (rows, columns) after de-duplication.
df.shape

In [None]:
# Lets just do this for California - 7 parks
df_CA = df.loc[df.facility_state == 'CA',:]

# now lets load this into a list of dictionaries that we can call into our data loader functions
records = df_CA.to_dict('records')

In [None]:
# Small sample (first 100 records) for trying out the loader; show the first
# one to inspect the shape of a record.
records_test = records[:100]
records_test[0]

<h3>Some helpful tutorials on SQLAlchemy and loading data</h3>
<a href="https://www.freecodecamp.org/news/sqlalchemy-makes-etl-magically-easy-ab2bd0df928/">Free Code Camp</a>

In [None]:
# Lets use the sqlalchemy module, create the table, then load up our data
import os
from getpass import getpass
from urllib.parse import quote_plus

import sqlalchemy
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Numeric, Boolean, DateTime, BigInteger
from sqlalchemy.orm import sessionmaker


# Credentials must never live in the notebook. Connection settings are read
# from the standard PostgreSQL environment variables; host, database and user
# default to the local development values. If PGPASSWORD is not set, prompt for
# it once and keep it in this process's environment so later cells can reuse it.
if "PGPASSWORD" not in os.environ:
    os.environ["PGPASSWORD"] = getpass("Postgres password: ")

# quote_plus() escapes characters such as '@' or ':' that would otherwise
# break the URL when they appear in the user name or password.
engine = create_engine(
    "postgresql://{user}:{password}@{host}/{dbname}".format(
        user=quote_plus(os.environ.get("PGUSER", "postgres")),
        password=quote_plus(os.environ["PGPASSWORD"]),
        host=os.environ.get("PGHOST", "localhost"),
        dbname=os.environ.get("PGDATABASE", "ridb_local"),
    ),
    echo=False,
)
Base = declarative_base()

Session = sessionmaker(bind=engine)
session = Session()


meta = MetaData()

# Schema of the target table. Column order here matches the order of the
# values in the INSERT statement used by the loader further down.
reservations = Table(
    "reservations", meta,
    Column('order_number', String, primary_key=True),
    Column('historical_reservation_id',Numeric),
    Column('agency',String),
    Column('orgid',Numeric),
    Column('code_hierarchy',String),
    Column('region_code',String),
    Column('region_description',String),
    Column('parent_location_id',Numeric),
    Column('parent_location',String),
    Column('legacy_facility_id',Numeric),
    Column('park',String),
    Column('site_type',String),
    Column('use_type',String),
    Column('product_id',Numeric),
    Column('entity_type',String),
    Column('entity_id',Numeric),
    Column('facility_id',Numeric),
    Column('facility_zip',String),
    Column('facility_state',String),
    Column('facility_longitude',Numeric),
    Column('facility_latitude',Numeric),
    Column('customer_zip',String),
    Column('customer_state',String),
    Column('customer_country',String),
    Column('total_paid',Numeric),
    Column('start_date',DateTime),
    Column('end_date',DateTime),
    Column('order_date',DateTime),
    Column('number_of_people',Numeric)
)

# Creates the table if it does not exist yet; existing tables are left alone.
meta.create_all(engine)

# NOTE(review): nothing was added to the session above, so this commit looks
# like a no-op — kept to preserve the original cell's behaviour.
session.commit()


In [None]:
# After creating the table we want with the sqlalchemy orm I realized it is a bit tough to test and execute insert statements.
# for now im falling back on something I already understand well, psycopg.

import os
from getpass import getpass

import psycopg2

# Credentials must never live in the notebook. Connection settings are read
# from the standard PostgreSQL environment variables; host, database and user
# default to the local development values. If PGPASSWORD is not set, prompt for
# it once and keep it in this process's environment for reuse.
if "PGPASSWORD" not in os.environ:
    os.environ["PGPASSWORD"] = getpass("Postgres password: ")

connection = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    dbname=os.environ.get("PGDATABASE", "ridb_local"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"])

# Every statement is committed immediately; no explicit commit() is needed
# after the batch insert.
connection.autocommit = True


In [None]:
def insert_execute_batch(connection, records, page_size: int = 100) -> None:
    """Insert reservation records into the ``reservations`` table in batches.

    Parameters
    ----------
    connection
        An open psycopg2 connection. This function does not commit; the
        caller is responsible for that (or for enabling autocommit).
    records
        Iterable of dicts, each with one key per column named in ``columns``
        below.
    page_size
        Number of statements sent to the server per round trip. The default
        of 100 matches ``execute_batch``'s own default.
    """
    # `import psycopg2` alone does not load the `extras` submodule, so import
    # it explicitly rather than relying on another library having loaded it.
    from psycopg2.extras import execute_batch

    # Columns are named in the INSERT so the statement does not depend on the
    # physical column order of the table.
    columns = (
        'order_number',
        'historical_reservation_id',
        'agency',
        'orgid',
        'code_hierarchy',
        'region_code',
        'region_description',
        'parent_location_id',
        'parent_location',
        'legacy_facility_id',
        'park',
        'site_type',
        'use_type',
        'product_id',
        'entity_type',
        'entity_id',
        'facility_id',
        'facility_zip',
        'facility_state',
        'facility_longitude',
        'facility_latitude',
        'customer_zip',
        'customer_state',
        'customer_country',
        'total_paid',
        'start_date',
        'end_date',
        'order_date',
        'number_of_people',
    )
    # Only the fixed identifiers above are interpolated into the SQL text;
    # the record values are still passed as bound parameters (%(name)s).
    column_list = ", ".join(columns)
    placeholders = ", ".join("%({})s".format(name) for name in columns)
    statement = "INSERT INTO reservations ({}) VALUES ({});".format(
        column_list, placeholders
    )

    with connection.cursor() as cursor:
        execute_batch(cursor, statement, records, page_size=page_size)

In [None]:
# Load every prepared record into the reservations table.
insert_execute_batch(connection, records)
