In [1]:
"""
Author:shubham.gupta@zeno.health
Purpose: Tagging effect of campaign on customer
"""

'\nAuthor:shubham.gupta@zeno.health\nPurpose: Tagging effect of campaign on customer\n'

In [None]:
!pip install zeno_etl_libs_v3==1.0.1

In [2]:
import argparse
import os
import sys
from datetime import datetime as dt

# Model save
import joblib
import numpy as np
from dateutil.tz import gettz

sys.path.append('../../../..')

from zeno_etl_libs.logger import get_logger
from zeno_etl_libs.helper.aws.s3 import S3
from zeno_etl_libs.db.db import DB
from zeno_etl_libs.helper import helper

In [3]:
env = "dev"
email_to = ["shubham.gupta@zeno.health"]
schema = "public"

In [4]:
os.environ['env'] = env

In [5]:
logger = get_logger()

logger.info(f"env: {env}")

2022-12-29 18:05:10,371 - root - INFO - env: dev


In [6]:
read_schema = 'prod2-generico'
table_name = 'campaign-uplift'

rs_db = DB()
rs_db.open_connection()

table_info = helper.get_table_info(db=rs_db, table_name=table_name, schema=read_schema)

In [7]:
###################################################################
###################### Model Loading ##############################
###################################################################


bucket_name = 'aws-prod-glue-assets-921939243643-ap-south-1'
s3 = S3(bucket_name=bucket_name)
file_path_control = s3.download_file_from_s3('artifact/glue-jobs/src/scripts/uplift_modelling/dt_model_control.pkl')
file_path_test = s3.download_file_from_s3('artifact/glue-jobs/src/scripts/uplift_modelling/dt_model_test.pkl')

# Load the model from the file
clf_test = joblib.load(file_path_control)
clf_control = joblib.load(file_path_test)

path: /Users/shubham/PycharmProjects/GlueProject/etl/sagemaker-jobs/src/tmp/
path: /Users/shubham/PycharmProjects/GlueProject/etl/sagemaker-jobs/src/tmp/




In [8]:
###################################################################
###################### Data Preparation ###########################
###################################################################

data_q = """select
                T1.*,
                T2."mean-interval",
                T2."std-interval",
                T2."cov"
            from
                (
                select
                    "patient-id",
                    current_date - max("bill-date") as "recency",
                    count(id) as "frquency",
                    avg("total-spend") as "monetary",
                    max(case when "is-generic" = true then 1 else 0 end) as "is-generic",
                    max(case when "is-chronic" = true then 1 else 0 end) as "is-chronic",
                    (case
                        when min("p-promo-min-bill-date")= min("created-at") then 1
                        else 0
                    end) "is-promo-acquired",
                    max(case when "hd-flag" = true then 1 else 0 end) "is-hd",
                    max(case when "pr-flag" = true then 1 else 0 end) "is-pr",
                    max(case when "ecom-flag" = true then 1 else 0 end) "is-ecomm",
                    avg(case when "promo-code-id" is not null then 1.0 else 0.0 end) "redemption-perc"
                from
                    "prod2-generico"."retention-master" rm
                where
                    "bill-date" <= current_date
                group by
                    "patient-id") T1
            inner join 
                (
                select
                    "x1"."patient-id" as "patient-id",
                    AVG("x1"."purchase-interval") as "mean-interval",
                    STDDEV("x1"."purchase-interval") as "std-interval",
                    STDDEV("x1"."purchase-interval") / AVG("x1"."purchase-interval") as "cov"
                from
                    (
                    select
                        "s"."patient-id" as "patient-id",
                        "s"."created-date" as "bill-date",
                        lead("s"."created-date", 1) over (partition by "s"."patient-id"
                    order by
                        "s"."created-date" desc nulls first) as "prev-bill-date",
                        "s"."created-date" - lead("s"."created-date", 1) over (partition by "s"."patient-id"
                    order by
                        "s"."created-date" desc nulls first) as "purchase-interval"
                    from
                        "prod2-generico"."sales" as "s"
                    where
                        "s"."bill-flag" = 'gross'
                        and "bill-date" <= current_date
                    group by
                        "s"."patient-id",
                        "s"."created-date"
                    order by
                        "s"."patient-id" asc nulls last,
                        "s"."created-date" asc nulls last) as "x1"
                group by
                    "x1"."patient-id") T2 on
                T1."patient-id" = T2."patient-id";"""

data = rs_db.get_df(data_q)
data = data.fillna(-1)

In [9]:
###################################################################
###################### Prediction #################################
###################################################################

# Use the loaded model to make predictions

data['treatment-prob'] = clf_test.predict_proba(data[data.columns[1:]])[:, 1]
data['non-treatment-prob'] = clf_control.predict_proba(data[data.columns[1:-1]])[:, 1]

data['treatment_pred'] = data['treatment-prob'] >= 0.19  # Harcode cutoff
data['non_treatment_pred'] = data['non-treatment-prob'] >= 0.26  # Hardcode cutoff

data['consumer-type'] = np.where((data['treatment_pred'] == True) & (data['non_treatment_pred'] == True), 'Sure Things',
                                 0)
data['consumer-type'] = np.where((data['treatment_pred'] == False) & (data['non_treatment_pred'] == False),
                                 'Lost Causes', data['consumer-type'])
data['consumer-type'] = np.where((data['treatment_pred'] == True) & (data['non_treatment_pred'] == False),
                                 'Persuadable', data['consumer-type'])
data['consumer-type'] = np.where((data['treatment_pred'] == False) & (data['non_treatment_pred'] == True),
                                 'Do Not Disturb', data['consumer-type'])

data_upload = data[['patient-id', 'consumer-type', 'treatment-prob', 'non-treatment-prob']]

# etl
data_upload['created-at'] = dt.now(tz=gettz('Asia/Kolkata')).strftime('%Y-%m-%d %H:%M:%S')
data_upload['created-by'] = 'etl-automation'
data_upload['updated-at'] = dt.now(tz=gettz('Asia/Kolkata')).strftime('%Y-%m-%d %H:%M:%S')
data_upload['updated-by'] = 'etl-automation'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [10]:
# Truncating Table

if isinstance(table_info, type(None)):
    logger.info(f"table: {table_name} do not exist")
else:
    truncate_query = f"""
            DELETE
            FROM
                "{read_schema}"."{table_name}";
                """

    logger.info(f"truncate query : \n {truncate_query}")
    rs_db.execute(truncate_query)

2022-12-29 18:07:36,928 - root - INFO - truncate query : 
 
            DELETE
            FROM
                "prod2-generico"."campaign-uplift";
                


In [11]:
# Write to csv
s3.save_df_to_s3(df=data_upload[table_info['column_name']],
                 file_name='campaign/campaing_uplift.csv')
s3.write_df_to_db(df=data_upload[table_info['column_name']], table_name=table_name, db=rs_db,
                  schema=read_schema)

S3 object(uri: s3://aws-prod-glue-assets-921939243643-ap-south-1/temp_1672317539999.csv) delete response: {'ResponseMetadata': {'RequestId': 'RJKWS34AAK1FNQN7', 'HostId': '4xwG9UW6ysLWVy2UWL6KktojO+2xlPDK2Djw1Uoz/BwSRU2jN2k+FSGurGJf966S+TEqbkGYhq+j/LuQzmXa6g==', 'HTTPStatusCode': 204, 'HTTPHeaders': {'x-amz-id-2': '4xwG9UW6ysLWVy2UWL6KktojO+2xlPDK2Djw1Uoz/BwSRU2jN2k+FSGurGJf966S+TEqbkGYhq+j/LuQzmXa6g==', 'x-amz-request-id': 'RJKWS34AAK1FNQN7', 'date': 'Thu, 29 Dec 2022 12:40:44 GMT', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}
