$$\huge{\text{Practice with Greenplum and MADlib}}$$

In a previous session we saw examples of how to do regression and classification in Greenplum as applied to the Abalone dataset.  Now let's practice applying those techniques to predicting insurance claims. We will use a [sample insurance claims dataset](https://www.kaggle.com/easonlai/sample-insurance-claim-prediction-dataset#insurance3r2.csv) from Kaggle. 

| Column # | Column name |
|  ------  |  ---------  |
|    1     | age |
|    2     | sex |
|    3     | bmi |
|    4     | steps |
|    5     | children |
|    6     | smoker |
|    7     | region |
|    8     | charges |
|    9     | insuranceclaim |

In [None]:
import dbconnect

In [None]:
# Location of the database credential file, kept outside the notebook itself.
db_credential_file = '../.dbcred'

# Judging by its name, this helper connects to the database and registers the
# read_sql magic used by the cells below. It presumably also exposes the
# connection under the name given in conn_name -- TODO confirm in dbconnect.
dbconnect.connect_and_register_sql_magic(db_credential_file, conn_name='conn')

In [None]:
import math
import pandas as pd
from sqlalchemy import create_engine

In [None]:
# Allow up to 200 columns to be rendered before pandas truncates the display.
pd.options.display.max_columns = 200

In [None]:
# Database schema referenced by the SQL cells below via the {schema} placeholder.
schema = "ds_practice"

In [None]:
# Start from a clean slate: drop the schema together with everything in it,
# then recreate it empty. Order matters, the CREATE relies on the DROP.
%read_sql DROP SCHEMA IF EXISTS {schema} CASCADE;
%read_sql CREATE SCHEMA {schema};

# Load Data

In [None]:
# Relative path to the claims CSV that is loaded in the next step.
claims_data_path = "../input/insurance3r2.csv"

In [None]:
# Read the raw claims CSV into a DataFrame.
df_claims = pd.read_csv(filepath_or_buffer=claims_data_path)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df_claims.info()

In [None]:
# Write the DataFrame to the database as <schema>.claims.
# - an existing table of the same name is replaced
# - the DataFrame index is stored as an "id" column
# - rows are sent in batches of 10000
# `conn` is not defined in this notebook's visible cells; presumably it is
# provided by the dbconnect helper -- TODO confirm.
df_claims.to_sql(
    name='claims',
    con=conn,
    schema=schema,
    if_exists='replace',
    index=True,
    index_label='id',
    chunksize=10000,
)

# Explore

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_summary CASCADE;
SELECT madlib.summary(
    -- Profile the raw claims table and store the statistics in claims_summary.
    -- The DROP above clears any earlier output so the cell can be rerun.
    '{schema}.claims',   -- source_table
    '{schema}.claims_summary',  -- output_table
    NULL,  -- target_cols, NULL means no column subset is given
    NULL,  -- grouping_cols, NULL means no grouping is requested
    TRUE,  -- get_distinct
    TRUE,  -- get_quartiles
    NULL,  -- ntile_array, no extra quantiles requested
    10,  -- how_many_mfv, number of most frequent values to report
    FALSE  -- get_estimates
)

In [None]:
%%read_sql
SELECT
    *  -- every statistic stored by the summary step
FROM
    {schema}.claims_summary
ORDER BY
    column_number ASC

# Encode categorical variables

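Convert `region` to the `varchar` data type so that the categorical encoding will work. The same query also adds a boolean version of the target, `insuranceclaim_bool` (`insuranceclaim > 0`).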

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_region_text CASCADE;
CREATE TABLE {schema}.claims_region_text AS
SELECT
    -- feature columns copied through unchanged
    c.age,
    c.sex,
    c.bmi,
    c.steps,
    c.children,
    c.smoker,
    -- region is recast to text ahead of the categorical encoding step
    CAST(c.region AS varchar) AS region,
    c.charges,
    -- target, both in its original form and as a boolean flag
    c.insuranceclaim,
    (c.insuranceclaim > 0) AS insuranceclaim_bool
FROM
    {schema}.claims AS c

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_encoded CASCADE;
SELECT madlib.encode_categorical_variables(
    -- One-hot encode region. Later cells reference the resulting indicator
    -- columns as region_0 through region_3.
    '{schema}.claims_region_text',  -- input table
    '{schema}.claims_encoded',  -- output table
    'region'   -- categorical_cols, the only column that gets encoded
)

In [None]:
%%read_sql
SELECT
    *  -- peek at a handful of encoded rows
FROM
    {schema}.claims_encoded
LIMIT
    5

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_correlations CASCADE;
DROP TABLE IF EXISTS {schema}.claims_correlations_summary CASCADE;
SELECT
madlib.correlation(
    -- Correlations between the listed columns of the encoded table.
    -- The DROP statements above clear the output table and its _summary
    -- companion so the cell can be rerun.
    '{schema}.claims_encoded', -- source_table
    '{schema}.claims_correlations', -- output_table
    'age,sex,bmi,steps,children,smoker,charges,insuranceclaim,region_0,region_1,region_2,region_3', -- target_cols, features plus target
    TRUE, -- verbose
    NULL  -- grouping_columns, NULL means no grouping is requested
)

In [None]:
%%read_sql
SELECT
    *  -- the stored correlation results
FROM
    {schema}.claims_correlations
ORDER BY
    column_position ASC

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_eval CASCADE;
DROP TABLE IF EXISTS {schema}.claims_eval_train CASCADE;
DROP TABLE IF EXISTS {schema}.claims_eval_test CASCADE;
SELECT madlib.train_test_split(
    -- Split the encoded table into the claims_eval_train and claims_eval_test
    -- tables that the following cells query.
    -- NOTE(review) no random seed is set in this notebook, so the split
    -- presumably differs from run to run. Confirm if reproducibility matters.
    '{schema}.claims_encoded', -- source_table
    '{schema}.claims_eval', -- output_table, used as the prefix of the two result tables
    0.7, -- train_proportion
    NULL, -- test_proportion, not given explicitly
    NULL, -- grouping_cols, NULL means no grouping is requested
    'age,sex,bmi,steps,children,smoker,charges,region_0,region_1,region_2,region_3,insuranceclaim,insuranceclaim_bool', -- target_cols, columns carried into the output
    FALSE, -- with_replacement
    TRUE -- separate_output_tables
)

In [None]:
%%read_sql
SELECT
    COUNT(*) AS n  -- number of rows in the training split
FROM
    {schema}.claims_eval_train

In [None]:
%%read_sql
SELECT
    COUNT(*) AS n  -- number of rows in the test split
FROM
    {schema}.claims_eval_test

# Modeling (Classification)

## Logistic Regression

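Note: one of the one-hot-encoded `region` columns (`region_3`) is left out of the feature list. Together with the intercept term, the full set of indicators would be perfectly collinear.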

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_logreg_model;
DROP TABLE IF EXISTS {schema}.claims_logreg_model_summary;
SELECT madlib.logregr_train(
    -- Fit a logistic regression for insuranceclaim_bool on the training split.
    -- The DROP statements above clear the model table and its _summary
    -- companion so the cell can be rerun.
    '{schema}.claims_eval_train', -- source_table
    '{schema}.claims_logreg_model', -- out_table
    'insuranceclaim_bool', -- dependent_varname
    'ARRAY[
        1,
        age,
        sex,
        bmi,
        steps,
        children,
        smoker,
        charges,
        region_0,
        region_1,
        region_2
    ]' -- independent_varname, constant 1 for the intercept plus the features, region_3 is left out
    -- The optional arguments listed below are not passed.
    --, -- grouping_cols,
    --, -- max_iter,
    --, -- optimizer,
    --, -- tolerance,
     -- verbose
)

In [None]:
%%read_sql
SELECT
    *  -- the fitted model as stored by the training step
FROM
    {schema}.claims_logreg_model
LIMIT
    10

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_logreg_test_proba CASCADE;
CREATE TABLE {schema}.claims_logreg_test_proba AS
SELECT
    -- The feature array lists the same entries, in the same order, as the
    -- ARRAY that was passed to logregr_train.
    madlib.logregr_predict_prob(
        model.coef,
        ARRAY[
            1,
            test.age,
            test.sex,
            test.bmi,
            test.steps,
            test.children,
            test.smoker,
            test.charges,
            test.region_0,
            test.region_1,
            test.region_2
        ]
    ) AS proba,
    -- observed label, kept next to the prediction for the evaluation step
    test.insuranceclaim
FROM
    {schema}.claims_eval_test AS test
    -- every test row is paired with the rows of the model table
    CROSS JOIN {schema}.claims_logreg_model AS model
;

In [None]:
%%read_sql
SELECT
    *  -- predicted probability next to the observed label
FROM
    {schema}.claims_logreg_test_proba
LIMIT
    10

In [None]:
%%read_sql
DROP TABLE IF EXISTS {schema}.claims_logreg_test_auc CASCADE;
SELECT madlib.area_under_roc(
    -- Compute the area under the ROC curve for the test-set predictions and
    -- store it in claims_logreg_test_auc.
    -- The DROP above clears any earlier output, like every other MADlib cell
    -- in this notebook, so this cell can be rerun on its own without failing
    -- because the output table already exists.
    '{schema}.claims_logreg_test_proba', -- table_in
    '{schema}.claims_logreg_test_auc',  -- table_out
    'proba',  -- prediction_col, the predicted probability
    'insuranceclaim'  -- observed_col, the observed label
)

In [None]:
%%read_sql
SELECT
    *  -- the AUC stored by the evaluation step
FROM
    {schema}.claims_logreg_test_auc