In [None]:
import json
import os

# star import first so the explicit aliases below always take precedence
from pycaret.regression import *
import numpy as np
import pandas as pd
import psycopg2 as db

In [None]:
def sql_build_connection(dbname):
    '''Open a PostgreSQL connection using credentials stored on disk.

    Summary
    -------
    Reads a JSON file called .sqluser from the user's home folder and
    uses its "uid" and "pwd" entries to connect to a PostgreSQL server
    on localhost, port 5432.

    Parameters
    ----------
    dbname: string
        The name of the database to connect to

    Returns
    -------
    engine: psycopg2 connection
        An open connection object as returned by psycopg2.connect

    '''
    # get credentials
    cred_path = os.path.join(os.path.expanduser('~'), '.sqluser')
    with open(cred_path, 'r') as f:
        creds = json.load(f)

    # build the connection; the pieces are passed as keyword arguments
    # rather than formatted into a URI so that credentials containing
    # characters such as '@', ':' or '/' cannot corrupt the connection string
    engine = db.connect(
        dbname=dbname,
        user=creds["uid"],
        password=creds["pwd"],
        host='localhost',
        port=5432,
    )

    return engine

In [None]:
def sql_get_data(connection, schema, table, columns='*', where='1=1', query=False):
    '''Run a SELECT against a database and return the rows as a data frame.

    Summary
    -------
    Either executes a caller-supplied query as-is, or assembles a simple
    SELECT ... FROM "schema"."table" WHERE ... statement from the pieces
    given, and loads the result with pandas.read_sql.

    Parameters
    ----------
    connection: database connection
        The connection handed to pandas.read_sql
    schema: string
        Schema that holds the table (ignored when query is given)
    table: string
        Table name (ignored when query is given)
    columns: string
        Optional: comma separated column list, defaults to all columns
    where: string
        Optional: filter expression placed after WHERE, defaults to 1=1
    query: string/boolean
        Optional: a complete SQL statement; when truthy it is executed
        verbatim instead of the generated SELECT

    Returns
    -------
    data: pandas.DataFrame
        The query result

    '''
    # a truthy query wins; otherwise fall back to the generated statement
    statement = (
        query
        if query
        else f'''SELECT {columns} FROM "{schema}"."{table}" WHERE {where}'''
    )

    return pd.read_sql(statement, connection)

In [None]:
# load the raw count and percentage tables that feed the imputation process
con = sql_build_connection('analytics')
cnt_data = sql_get_data(connection=con, schema='dev', table='all_wide_raw')
pct_data = sql_get_data(connection=con, schema='dev', table='all_pct_raw')

In [None]:
# turn every 0 into NaN so the imputation step treats those cells as missing;
# the originals are left untouched and new frames are produced
zero_to_nan = {0: np.nan}
cnt_data_, pct_data_ = (
    frame.replace(zero_to_nan) for frame in (cnt_data, pct_data)
)

In [33]:
# Set up the regression experiment on the count data with 'lbf' as target.
# session_id pins the random seed so the train/test split is reproducible
# on a fresh kernel; 8051 is the seed reported by the recorded run of this
# cell, which previously drew a new random seed on every execution.
cnt_impute_ex1 = setup(
    cnt_data_,
    target='lbf',
    train_size=0.6,
    ignore_features=['ref_year', 'code', 'geo_name'],
    normalize=True, normalize_method='zscore',
    numeric_imputation='median',
    remove_multicollinearity=True, multicollinearity_threshold=0.8,
    session_id=8051)


Setup Succesfully Completed.


Unnamed: 0,Description,Value
0,session_id,8051
1,Transform Target,False
2,Transform Target Method,
3,Original Data,"(1425, 83)"
4,Missing Values,True
5,Numeric Features,77
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,
