In [1]:
import json
import os

import numpy as np
import pandas as pd
import psycopg2 as db
from pycaret.regression import *

In [2]:
def sql_build_connection(dbname):
    '''A simple function to derive a postgresql connection.

    Summary
    -------
    Create a postgresql connection to a database on localhost:5432. This
    assumes that there is a JSON file called .sqluser in the user root
    folder holding the database credentials under the keys "uid" (user
    name) and "pwd" (password).

    Parameters
    ----------
    dbname: string
        The name of the database to connect to

    Returns
    -------
    connection: database connection
        The object returned by the connect() function of the module
        imported as db (psycopg2)

    Raises
    ------
    FileNotFoundError
        If the .sqluser file does not exist in the user root folder
    KeyError
        If the credentials file does not contain "uid" or "pwd"
    '''
    # get credentials
    root = os.path.expanduser('~')
    with open(os.path.join(root, '.sqluser'), 'r') as f:
        creds = json.load(f)

    # build the connection
    # The parts are passed as discrete keyword arguments rather than being
    # spliced into a postgresql:// URI: a password containing characters
    # such as '@', '/', ':' or '#' would otherwise produce a malformed or
    # misparsed connection string.
    # NOTE(review): credentials are now taken literally; if a stored
    # password was percent-encoded by hand to work around the old URI
    # form, store it un-encoded instead.
    connection = db.connect(
        dbname=dbname,
        user=creds['uid'],
        password=creds['pwd'],
        host='localhost',
        port=5432)

    return connection

In [3]:
def sql_get_data(connection, schema, table, columns='*', where='1=1', query=False):
    '''Load the result of a sql query into a pandas data frame.

    Summary
    -------
    Runs either a caller supplied query or a generated
    SELECT <columns> FROM "<schema>"."<table>" WHERE <where> statement
    and hands the rows back as a data frame.

    Parameters
    ----------
    connection: database connection
        The connection handed to pandas.read_sql
    schema: string
        The schema holding the table (unused when query is given)
    table: string
        The table name (unused when query is given)
    columns: string
        Optional: a comma separated list of column names, defaults to
        all columns
    where: string
        Optional: a where clause to filter the data, defaults to a
        clause that keeps every row
    query: string/boolean
        Optional: a complete query with schema.table_name syntax; when
        truthy it replaces the generated statement entirely

    Returns
    -------
    data: pandas.DataFrame
        The query result

    Notes
    -----
    columns, where, schema and table are interpolated into the statement
    as-is, so they must come from trusted code, not from user input.
    '''
    # a caller supplied query wins; otherwise fall back to the simple select
    statement = query or f'''SELECT {columns} FROM "{schema}"."{table}" WHERE {where}'''

    return pd.read_sql(statement, connection)

In [4]:
def build_the_model(df, target_variable, model_include, model_number, fold_number, exp_params, exp_suffix):
    '''Fit, blend, score and save a regression model for one target column.

    Summary
    -------
    Sets up an experiment on df, compares the candidate models, blends the
    selected ones, scores the blend with predict_model, then finalizes the
    blend and saves it to ../data/<target_variable><exp_suffix>.

    Parameters
    ----------
    df: pandas.DataFrame
        The modelling data, including the target column
    target_variable: string
        The name of the column to predict
    model_include: list
        Model ids handed to compare_models as its whitelist
    model_number: int
        How many of the compared models to select for blending
    fold_number: int
        The number of folds handed to compare_models
    exp_params: dict
        Experiment settings; must provide the keys transform_target,
        transformation, ignore_features, normalize,
        remove_multicollinearity and pca
    exp_suffix: string
        Appended to target_variable to name the experiment and the
        saved model file

    Returns
    -------
    fin_model: model
        The finalized blended model
    stats: pandas.DataFrame
        MAE, RMSE and R2 as rows with a 'train' column (the 'Mean' row of
        the blend score grid) and a 'test' column (the predict_model
        score grid)
    '''
    metrics = ['MAE', 'RMSE', 'R2']
    run_name = f'{target_variable}{exp_suffix}'

    # settings the caller controls, looked up in a fixed order
    tunable_keys = (
        'transform_target',
        'transformation',
        'ignore_features',
        'normalize',
        'remove_multicollinearity',
        'pca',
    )
    tunable = {key: exp_params[key] for key in tunable_keys}

    # setup the experiment (the remaining settings are fixed for every run)
    setup(
        data=df,
        target=target_variable,
        numeric_imputation='median',
        log_experiment=True,
        experiment_name=run_name,
        silent=True,
        **tunable)

    # compare the candidates and keep the best model_number of them
    candidates = compare_models(
        fold=fold_number,
        whitelist=model_include,
        n_select=model_number,
        verbose=False)

    # blend the models (this is just model averaging)
    # NOTE(review): fold_number is not passed on here, so blend_models
    # runs with its own default fold setting -- confirm that is intended.
    blended = blend_models(estimator_list=candidates, verbose=False)
    # pull() must directly follow the call whose score grid is wanted
    train_scores = pull().loc['Mean', metrics]

    # predict model against test data; only the score grid is used
    predict_model(blended)
    test_scores = pull().loc[0, metrics]

    # comparison table for train vs test
    stats = pd.concat([train_scores, test_scores], axis=1)
    stats.columns = ['train', 'test']

    # finalize and save the model
    final_model = finalize_model(blended)
    save_model(final_model, f'../data/{run_name}')
    print('----------------------------------------------------------')

    return final_model, stats


In [5]:
#### spec ####

# Each listed column will have a model calibrated to predict values for suppressed cells
mod_col = [
    # labour force columns
    'lbf',
    'employed',
    'employed_full',
    'employed_part',
    'unemployed',
    'not_in_lbf',
    # naics_* columns
    'naics_employed',
    'naics_goods',
    'naics_agriculture',
    'naics_natural_resources',
    'naics_ultilities',
    'naics_construction',
    'naics_manufacturing',
    'naics_services',
    'naics_retail',
    'naics_transport_warehousing',
    'naics_financial',
    'naics_professional',
    'naics_support_services',
    'naics_education_services',
    'naics_health',
    'naics_infor_culture',
    'naics_accomdation_food',
    'naics_other',
    'naics_public_admin',
    # nocs_* columns
    'nocs_employed',
    'nocs_management',
    'nocs_management_senior',
    'nocs_management_specialized',
    'nocs_management_retail',
    'nocs_management_trades',
    'nocs_business',
    'nocs_business_pro',
    'nocs_business_admin',
    'nocs_business_finance',
    'nocs_business_office',
    'nocs_business_distribution',
    'nocs_sciences',
    'nocs_sciences_pro',
    'nocs_sciences_tech',
    'nocs_health',
    'nocs_health_nursing',
    'nocs_health_pro',
    'nocs_health_tech',
    'nocs_health_assist',
    'nocs_public',
    'nocs_public_education',
    'nocs_public_pro',
    'nocs_public_para',
    'nocs_public_protection',
    'nocs_public_support',
    'nocs_culture',
    'nocs_culture_pro',
    'nocs_culture_tech',
    'nocs_sales',
    'nocs_sales_retail',
    'nocs_sales_service',
    'nocs_sales_wholesale',
    'nocs_sales_other',
    'nocs_sales_support',
    'nocs_sales_nec',
    'nocs_trades',
    'nocs_trades_industrial',
    'nocs_trades_maintenance',
    'nocs_trades_other',
    'nocs_trades_heavy',
    'nocs_trades_helpers',
    'nocs_agriculture',
    'nocs_agriculture_supervisor',
    'nocs_agriculture_workers',
    'nocs_agriculture_harvesting',
    'nocs_manufacturing',
    'nocs_manufacturing_supervisor',
    'nocs_manufacturing_operators',
    'nocs_manufacturing_assemblers',
    'nocs_manufacturing_labourers',
]

# model types to consider
# lr: linear regression, et: extra trees, xgboost: extreme gradient boosting,
# br: bayesian ridge, svm: support vector machine, knn: k-nearest neighbours
mod_consider = ['lr', 'et', 'xgboost', 'br', 'svm', 'knn']

# number of models to select
# the top n models will be blended
n_mod = 3

# number of folds to use in model calibration / validation
n_fold = 5

# experiment label (appended to each target name)
labl = '_test'

# experiment parameters
params = dict(
    ignore_features=['id'],
    transform_target=True,
    transformation=True,
    normalize=True,
    remove_multicollinearity=True,
    pca=False,
)

In [6]:
# load the imputation input table (dev.all_imputation_input) from the
# analytics database; it is the modelling data for the imputation process
con = sql_build_connection(dbname='analytics')
cnt_data = sql_get_data(connection=con, schema='dev', table='all_imputation_input')

In [21]:
# Calibrate one blended model per target column and gather the train/test
# comparison tables. The per-target tables are collected in a list and
# concatenated once at the end, rather than re-copying an ever growing
# frame (seeded with an empty one) on every iteration.
stats_frames = []

for attribute in mod_col:
    # `model` only ever holds the most recent fit; build_the_model saves
    # each finalized model to disk itself
    model, model_stats = build_the_model(cnt_data, attribute, mod_consider, n_mod, n_fold, params, labl)
    model_stats['target_variable'] = attribute
    stats_frames.append(model_stats)

# If the loop is interrupted, the tables finished so far remain available
# in stats_frames. An empty mod_col still yields an empty frame.
compiled_models_stats = pd.concat(stats_frames, axis=0) if stats_frames else pd.DataFrame()


Transformation Pipeline and Model Succesfully Saved
----------------------------------------------------------


In [22]:
# lift the row display limit (for the rest of the session) so the whole
# comparison table is rendered
pd.options.display.max_rows = None
compiled_models_stats

Unnamed: 0,train,test,target_variable
MAE,13.6516,15.1656,lbf
RMSE,37.5391,45.1087,lbf
R2,0.9907,0.9917,lbf
MAE,11.2576,14.5036,employed
RMSE,29.0588,45.4268,employed
R2,0.9947,0.979,employed
MAE,10.2513,12.386,employed_full
RMSE,27.9116,36.1371,employed_full
R2,0.9896,0.9929,employed_full
MAE,3.9496,3.0684,employed_part
