In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
from pycaret.regression import *
import psycopg2 as db
import json
import os

In [2]:
def sql_build_connection(dbname):
    '''Open a psycopg2 connection to a local postgresql database.

    Summary
    -------
    Create a postgresql connection. This assumes that there is a json file
    called .sqluser in the user root folder holding the database
    credentials under the keys "uid" and "pwd". The connection is made to
    localhost on port 5432.

    Parameters
    ----------
    dbname: string
        The name of the database to connect to

    Returns
    -------
    engine: psycopg2 connection
        An open connection object (not a sqlalchemy engine); the caller is
        responsible for closing it
    '''
    # get credentials
    creds_path = os.path.join(os.path.expanduser('~'), '.sqluser')
    with open(creds_path, 'r') as f:
        creds = json.load(f)

    # build the connection; keyword arguments are used instead of a formatted
    # URI so that credentials containing characters such as '@', ':' or '/'
    # are passed through as-is rather than corrupting the connection string
    engine = db.connect(
        dbname=dbname,
        user=creds['uid'],
        password=creds['pwd'],
        host='localhost',
        port=5432,
    )

    return engine

In [3]:
def sql_get_data(connection, schema, table, columns='*', where='1=1', query=False):
    '''Read the result of a sql statement into a pandas data frame.

    Summary
    -------
    Runs either the statement supplied through `query`, or a simple
    SELECT ... FROM ... WHERE ... statement assembled from the other
    arguments, and returns the rows as a data frame.

    Parameters
    ----------
    connection: database connection
        Handed straight to pandas.read_sql
    schema: string
        The schema for the table (unused when query is given)
    table: string
        The table name (unused when query is given)
    columns: string
        Optional: a comma separated list of column names, defaults to all
    where: string
        Optional: a where clause to filter the data, defaults to no filter
    query: string/boolean
        Optional: a more complex query with schema.table_name syntax; any
        truthy value is run in place of the generated statement

    Returns
    -------
    data: pandas.DataFrame
        A pandas data frame holding the rows returned by the statement
    '''
    # a caller supplied query takes precedence over the generated select
    statement = query if query else f'''SELECT {columns} FROM "{schema}"."{table}" WHERE {where}'''

    # let pandas execute the statement and hand the frame straight back
    return pd.read_sql(statement, connection)

In [4]:
def sql_csv_loader(csv_path, target_table, connect):
    '''Replace the contents of a database table with the rows of a csv file.

    Summary
    -------
    Take the file path to a csv dataset, truncate the target table, then
    stream the file into it with a COPY ... FROM STDIN command. Both
    statements run in a single transaction that is committed when the load
    succeeds and rolled back when anything fails, so the connection stays
    usable after an error.

    Parameters
    ----------
    csv_path: string
        The fully qualified path to the csv data to be loaded into sql; the
        file is read as comma delimited with a header row
    target_table: string
        The name of the target table, interpolated as-is into the sql
        statements (e.g. schema.table_name)
    connect: database connection
        An open connection offering cursor(), commit() and rollback(), with
        cursors that support copy_expert (as psycopg2 does)

    Raises
    ------
    Exception
        Any error raised while opening the file or running the load is
        re-raised to the caller (after the rollback for database errors)
    '''
    truncate = f'''TRUNCATE TABLE {target_table};'''
    copy_cmd = f'''COPY {target_table} FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',');'''

    # open the file before touching the database so that a bad path fails
    # without creating a cursor or starting a transaction
    with open(csv_path, 'r') as file_:
        cursor = connect.cursor()
        try:
            # run the data load
            cursor.execute(truncate)
            cursor.copy_expert(sql=copy_cmd, file=file_)
            connect.commit()
        except Exception:
            # undo the truncate / partial copy and leave the connection clean
            connect.rollback()
            raise
        finally:
            cursor.close()
    print(f'The data from {csv_path} has been written to {target_table}')


In [5]:
#### spec ####

# One model is calibrated for each column listed here; the model predicts
# values for the suppressed cells of that column.
mod_col = [
    # columns without a prefix
    'lbf', 'employed', 'employed_full', 'employed_part', 'unemployed', 'not_in_lbf',
    # naics_ columns
    'naics_employed', 'naics_goods', 'naics_agriculture', 'naics_natural_resources',
    'naics_ultilities', 'naics_construction', 'naics_manufacturing', 'naics_services',
    'naics_retail', 'naics_transport_warehousing', 'naics_financial', 'naics_professional',
    'naics_support_services', 'naics_education_services', 'naics_health',
    'naics_infor_culture', 'naics_accomdation_food', 'naics_other', 'naics_public_admin',
    # nocs_ columns
    'nocs_employed',
    'nocs_management', 'nocs_management_senior', 'nocs_management_specialized',
    'nocs_management_retail', 'nocs_management_trades',
    'nocs_business', 'nocs_business_pro', 'nocs_business_admin', 'nocs_business_finance',
    'nocs_business_office', 'nocs_business_distribution',
    'nocs_sciences', 'nocs_sciences_pro', 'nocs_sciences_tech',
    'nocs_health', 'nocs_health_nursing', 'nocs_health_pro', 'nocs_health_tech',
    'nocs_health_assist',
    'nocs_public', 'nocs_public_education', 'nocs_public_pro', 'nocs_public_para',
    'nocs_public_protection', 'nocs_public_support',
    'nocs_culture', 'nocs_culture_pro', 'nocs_culture_tech',
    'nocs_sales', 'nocs_sales_retail', 'nocs_sales_service', 'nocs_sales_wholesale',
    'nocs_sales_other', 'nocs_sales_support', 'nocs_sales_nec',
    'nocs_trades', 'nocs_trades_industrial', 'nocs_trades_maintenance',
    'nocs_trades_other', 'nocs_trades_heavy', 'nocs_trades_helpers',
    'nocs_agriculture', 'nocs_agriculture_supervisor', 'nocs_agriculture_workers',
    'nocs_agriculture_harvesting',
    'nocs_manufacturing', 'nocs_manufacturing_supervisor', 'nocs_manufacturing_operators',
    'nocs_manufacturing_assemblers', 'nocs_manufacturing_labourers',
]

# experiment label
labl = '_test'

In [6]:
# get the count datasets for running imputation process
con = sql_build_connection('analytics')
try:
    cnt_data = sql_get_data(con, 'dev', 'all_imputation_input')
finally:
    # the data is in memory now, so release this connection; the table
    # creation step later in the notebook opens its own connection and would
    # otherwise leave this one dangling when it rebinds `con`
    con.close()

# copy of the imputation dataset without t-minus features will be used to store the outputs from the models
# (.copy() makes it an independent frame, so assigning prediction columns to
# it later does not write into a slice of cnt_data and raise pandas'
# SettingWithCopyWarning)
cols_keep = np.logical_not(cnt_data.columns.str.startswith('tm_'))
out_data = cnt_data.loc[:, cols_keep].copy()

In [7]:
# for every modelled column: load its saved model, score the input data and
# store the predicted values in the output frame under the same column name
for target in mod_col:
    model_path = f'../data/{target}{labl}'
    fitted = load_model(model_path, verbose=False)
    scored = predict_model(fitted, data=cnt_data)
    out_data[target] = scored['Label']

In [8]:
# write versioned table csv
date = dt.now().strftime("%Y%m%d")
path = os.path.abspath(f'../data/all_imputation_out_{date}.csv')
# the write has to run: the csv load further down reads this exact file, and
# with the write disabled a fresh run would load a stale file from an earlier
# run on the same date, or fail because no file exists for today's date
out_data.to_csv(path, index=False)

In [9]:
# create sql table

# sql statements: drop any previous result table, then recreate it without
# rows, taking the id column plus every modelled column from dev.all_wide_raw
column_list = ', '.join(mod_col)
dropper = 'DROP TABLE IF EXISTS dev.all_imputation_result;'
creator = f"CREATE TABLE dev.all_imputation_result AS SELECT id, {column_list} FROM dev.all_wide_raw WITH NO DATA;"

# execute both statements, in order, on a new connection
con = sql_build_connection('analytics')
cursor = con.cursor()
for statement in (dropper, creator):
    cursor.execute(statement)

# commit the new table and release the cursor; con itself stays open
con.commit()
cursor.close()

In [10]:
# load csv into table
sql_csv_loader(csv_path=path, target_table='dev.all_imputation_result', connect=con)

The data from /Users/sean/Projects/pycaret-exploration/data/all_imputation_out_20200909.csv has been written to dev.all_imputation_result


In [11]:
# close the database connection
con.close()