## KPI Mock data creation and SQL Logic v5 #2 (2.3, 2.5, 2.6)

### Create Mock Data

Daily Batch Feed 25
Service Now CRM	S3	request for information	daily feed of all cases where the agent has requested further information from the customer

Includes the following columns:
- case_id
- created_timestamp
- case_type
- request_type
- blocked_timestamp
- blocked_sequence

In [3]:
# Mock data for Batch #25 - Request For Information

import pandas as pd
import numpy as np
from datetime import datetime


### Create Batch 25
# Set conditions
np.random.seed(0)
batch25_output_filename = 'servicenow_requestforinformation.parquet'
num_samples = 999

length_min_days = 1
length_max_days = 6
case_types_list = ['Sales Application','Postal Payment']
request_types_list = ['Online','Mobile','ID&V']
blocked_sequence_list = [1,2]

case_ids = ['CAS' + f'{i:03}' for i in range(1, num_samples + 1)]

start_date = pd.to_datetime('2024-01-01')
end_date = pd.to_datetime(datetime.now().strftime('%Y-%m-%d')) 
timestamps = pd.to_datetime(np.random.randint(start_date.value, end_date.value, num_samples), unit='ns')

df = pd.DataFrame({
    'case_id': case_ids,
    'created_timestamp': timestamps
})

df['case_type'] = np.random.choice(case_types_list, size=len(df))
df['request_type'] = np.random.choice(request_types_list, size=len(df))
df['blocked_sequence'] = np.random.choice(blocked_sequence_list, size=len(df)).astype('int32')
df['created_timestamp'] = pd.to_datetime(df['created_timestamp']).dt.round('s')


random_days = pd.to_timedelta(np.random.randint(length_min_days, length_max_days, size=len(df)), unit='D')
df['blocked_timestamp'] = df['created_timestamp'] + random_days
df['blocked_timestamp'] = pd.to_datetime(df['blocked_timestamp']).dt.round('s')

df['created_timestamp'] = df['created_timestamp'].astype(str)
df['blocked_timestamp'] = df['blocked_timestamp'].astype(str)


df.to_parquet(batch25_output_filename, engine='pyarrow', index=False)

print(df.head())
print(f"Data saved to '{batch25_output_filename}'.")
print(df.dtypes)
print('------------------------')




  case_id    created_timestamp          case_type request_type  \
0  CAS001  2024-03-14 18:57:55     Postal Payment         ID&V   
1  CAS002  2024-02-17 21:22:34     Postal Payment       Mobile   
2  CAS003  2024-03-23 11:27:07  Sales Application       Online   
3  CAS004  2024-01-19 14:52:21  Sales Application       Online   
4  CAS005  2024-02-06 14:28:16  Sales Application       Online   

   blocked_sequence    blocked_timestamp  
0                 2  2024-03-17 18:57:55  
1                 1  2024-02-19 21:22:34  
2                 2  2024-03-24 11:27:07  
3                 1  2024-01-20 14:52:21  
4                 1  2024-02-07 14:28:16  
Data saved to 'servicenow_requestforinformation.parquet'.
case_id              object
created_timestamp    object
case_type            object
request_type         object
blocked_sequence      int32
blocked_timestamp    object
dtype: object
------------------------


### SQL Logic for KPI 2.3

2.3	Batch	25	\
for today-30 to today \
where case_type = 'Sales Application' and blocked_sequence = '1' and request_type <> 'ID&V' \
s1. count records \
s2. count records where request_timestamp - case_created_timestamp <= 4 days (in the Batch 25 feed: blocked_timestamp - created_timestamp) \
s3. s2/s1*100 

In [16]:
# KPI 2.3 prototype: share of 'Sales Application' cases (first block, request
# type other than 'ID&V') blocked in the last 30 days whose information request
# came within 4 days of case creation. Runs as SQLite through pandasql.
import pandas as pd
import pandasql as ps

# Read the mock Batch 25 feed back from parquet; the query refers to this
# frame by its variable name, so the name must stay `batch25_df`.
batch25_df = pd.read_parquet(batch25_output_filename)

sql_query = """
WITH selected_df AS (
    SELECT *
    ,julianday(blocked_timestamp) - julianday(created_timestamp) AS days_to_process
    FROM (
        SELECT *
        FROM batch25_df
        WHERE case_type = 'Sales Application'
        AND blocked_sequence = 1
        AND request_type <> 'ID&V'
        AND (blocked_timestamp BETWEEN DATETIME('NOW','-30 days') AND DATETIME('NOW'))
    )
)
,
agg_results AS (
SELECT COUNT(*) AS Total
,SUM(CASE WHEN days_to_process <= 4 THEN 1 ELSE 0 END) AS time_to_process_lte_4_days
FROM selected_df
)

SELECT (time_to_process_lte_4_days * 1.0 / Total) * 100 AS PC_LTE_4_days
FROM agg_results



"""

# The current namespace is handed to pandasql so it can find `batch25_df`.
kpi_2_3_result = ps.sqldf(sql_query, locals())
print(kpi_2_3_result)

   PC_LTE_4_days
0           86.0


In [None]:
-- SQL logic used in ETL_KPI_data_KPI2.3 in AWS Glue
-- Note: some functions have been changed to accommodate the change from SQLite to Postgres.
-- Result: a single row (fk_date_id, fk_kpi_id, Value) where Value is the
-- percentage (0-100) of selected cases whose information request came within
-- 4 days of case creation.

WITH current_date_id AS (
    -- Key of today's row in the date dimension.
    SELECT myDimDate.pk_id
    FROM myDimDate
    WHERE myDimDate.date = CURRENT_DATE
)
,
current_kpi_id AS (
    -- Key of this KPI in the KPI dimension.
    SELECT pk_id
    FROM myDimKPI
    WHERE kpi_reference = '2.3'
)
,
selected_df AS (
    -- The elapsed time is compared as a full interval. EXTRACT(DAY FROM ...)
    -- would drop the hours/minutes part, so e.g. 4 days 23 hours would wrongly
    -- count as "<= 4 days" (the SQLite prototype compares fractional days).
    -- The filter is applied directly on df, so no un-aliased derived table is
    -- needed (PostgreSQL before 16 rejects a FROM subquery without an alias).
    SELECT *
    ,CASE WHEN blocked_timestamp <= created_timestamp + INTERVAL '4' DAY THEN 1 ELSE 0 END AS processed_in_time
    FROM df
    WHERE case_type = 'Sales Application'
    AND blocked_sequence = 1
    AND request_type <> 'ID&V'
    -- NOTE(review): the written rule says "today-30 to today", while this window
    -- runs from CURRENT_DATE-31 to CURRENT_DATE-1 (both taken at 00:00 when
    -- compared with a timestamp). Left unchanged - confirm the intended window.
    AND (blocked_timestamp BETWEEN (CURRENT_DATE-31) AND (CURRENT_DATE-1))
)
,
agg_results AS (
SELECT COUNT(*) AS Total
,SUM(processed_in_time) AS time_to_process_lte_4_days
FROM selected_df
)

SELECT (SELECT pk_id FROM current_date_id) AS fk_date_id
,(SELECT pk_id FROM current_kpi_id) AS fk_kpi_id
-- NULLIF keeps Value NULL (rather than risking a division error) when no case matches.
,(time_to_process_lte_4_days * 100.0 / NULLIF(Total, 0)) AS Value
FROM agg_results

### SQL Logic for KPI 2.5

2.5	Batch	25	\
for today-30 to today \
where case_type = 'Postal Payment' and blocked_sequence = '1' \
s1. count records \
s2. count records where request_timestamp - case_created_timestamp <= 3 days (in the Batch 25 feed: blocked_timestamp - created_timestamp) \
s3. s2/s1*100

In [17]:
# KPI 2.5 prototype: share of 'Postal Payment' cases (first block) blocked in
# the last 30 days whose information request came within 3 days of case
# creation. Runs as SQLite through pandasql.
import pandas as pd
import pandasql as ps

# Read the mock Batch 25 feed back from parquet; the query refers to this
# frame by its variable name, so the name must stay `batch25_df`.
batch25_df = pd.read_parquet(batch25_output_filename)

sql_query = """
WITH selected_df AS (
    SELECT *
    ,julianday(blocked_timestamp) - julianday(created_timestamp) AS days_to_process
    FROM (
        SELECT *
        FROM batch25_df
        WHERE case_type = 'Postal Payment'
        AND blocked_sequence = 1
        AND (blocked_timestamp BETWEEN DATETIME('NOW','-30 days') AND DATETIME('NOW'))
    )
)
,
agg_results AS (
SELECT COUNT(*) AS Total
,SUM(CASE WHEN days_to_process <= 3 THEN 1 ELSE 0 END) AS time_to_process_lte_3_days
FROM selected_df
)

SELECT (time_to_process_lte_3_days * 1.0 / Total) * 100 AS PC_LTE_3_days
FROM agg_results



"""

# The current namespace is handed to pandasql so it can find `batch25_df`.
kpi_2_5_result = ps.sqldf(sql_query, locals())
print(kpi_2_5_result)

   PC_LTE_3_days
0      58.823529


In [None]:
-- SQL logic for KPI 2.5 as used in AWS Glue
-- (the original note named the job ETL_KPI_data_KPI2.3 - presumably a
-- copy-paste slip from the KPI 2.3 cell; confirm the actual job name).
-- Note: some functions have been changed to accommodate the change from SQLite to Postgres.
-- Result: a single row (fk_date_id, fk_kpi_id, Value) where Value is the
-- percentage (0-100) of selected cases whose information request came within
-- 3 days of case creation.

WITH current_date_id AS (
    -- Key of today's row in the date dimension.
    SELECT myDimDate.pk_id
    FROM myDimDate
    WHERE myDimDate.date = CURRENT_DATE
)
,
current_kpi_id AS (
    -- Key of this KPI in the KPI dimension.
    SELECT pk_id
    FROM myDimKPI
    WHERE kpi_reference = '2.5'
)
,
selected_df AS (
    -- The elapsed time is compared as a full interval. EXTRACT(DAY FROM ...)
    -- would drop the hours/minutes part, so e.g. 3 days 23 hours would wrongly
    -- count as "<= 3 days" (the SQLite prototype compares fractional days).
    -- The filter is applied directly on df, so no un-aliased derived table is
    -- needed (PostgreSQL before 16 rejects a FROM subquery without an alias).
    SELECT *
    ,CASE WHEN blocked_timestamp <= created_timestamp + INTERVAL '3' DAY THEN 1 ELSE 0 END AS processed_in_time
    FROM df
    WHERE case_type = 'Postal Payment'
    AND blocked_sequence = 1
    -- NOTE(review): the written rule says "today-30 to today", while this window
    -- runs from CURRENT_DATE-31 to CURRENT_DATE-1 (both taken at 00:00 when
    -- compared with a timestamp). Left unchanged - confirm the intended window.
    AND (blocked_timestamp BETWEEN (CURRENT_DATE-31) AND (CURRENT_DATE-1))
)
,
agg_results AS (
SELECT COUNT(*) AS Total
,SUM(processed_in_time) AS time_to_process_lte_3_days
FROM selected_df
)

SELECT (SELECT pk_id FROM current_date_id) AS fk_date_id
,(SELECT pk_id FROM current_kpi_id) AS fk_kpi_id
-- NULLIF keeps Value NULL (rather than risking a division error) when no case matches.
,(time_to_process_lte_3_days * 100.0 / NULLIF(Total, 0)) AS Value
FROM agg_results

### SQL Logic for KPI 2.6

2.6	Batch	25	\
for today-30 to today \
where case_type = 'Sales Application' and blocked_sequence = '1' and request_type = 'ID&V' \
s1. count records \
s2. count records where request_timestamp - case_created_timestamp <= 2 days (in the Batch 25 feed: blocked_timestamp - created_timestamp) \
s3. s2/s1*100

In [18]:
# KPI 2.6 prototype: share of 'Sales Application' cases (first block, request
# type 'ID&V') blocked in the last 30 days whose information request came
# within 2 days of case creation. Runs as SQLite through pandasql.
import pandas as pd
import pandasql as ps

# Read the mock Batch 25 feed back from parquet; the query refers to this
# frame by its variable name, so the name must stay `batch25_df`.
batch25_df = pd.read_parquet(batch25_output_filename)

sql_query = """
WITH selected_df AS (
    SELECT *
    ,julianday(blocked_timestamp) - julianday(created_timestamp) AS days_to_process
    FROM (
        SELECT *
        FROM batch25_df
        WHERE case_type = 'Sales Application'
        AND blocked_sequence = 1
        AND request_type = 'ID&V'
        AND (blocked_timestamp BETWEEN DATETIME('NOW','-30 days') AND DATETIME('NOW'))
    )
)
,
agg_results AS (
SELECT COUNT(*) AS Total
,SUM(CASE WHEN days_to_process <= 2 THEN 1 ELSE 0 END) AS time_to_process_lte_2_days
FROM selected_df
)

SELECT (time_to_process_lte_2_days * 1.0 / Total) * 100 AS PC_LTE_2_days
FROM agg_results



"""

# The current namespace is handed to pandasql so it can find `batch25_df`.
kpi_2_6_result = ps.sqldf(sql_query, locals())
print(kpi_2_6_result)

   PC_LTE_2_days
0      39.130435


In [None]:
-- SQL logic for KPI 2.6 as used in AWS Glue
-- (the original note named the job ETL_KPI_data_KPI2.3 - presumably a
-- copy-paste slip from the KPI 2.3 cell; confirm the actual job name).
-- Note: some functions have been changed to accommodate the change from SQLite to Postgres.
-- Result: a single row (fk_date_id, fk_kpi_id, Value) where Value is the
-- percentage (0-100) of selected cases whose information request came within
-- 2 days of case creation.

WITH current_date_id AS (
    -- Key of today's row in the date dimension.
    SELECT myDimDate.pk_id
    FROM myDimDate
    WHERE myDimDate.date = CURRENT_DATE
)
,
current_kpi_id AS (
    -- Key of this KPI in the KPI dimension.
    SELECT pk_id
    FROM myDimKPI
    WHERE kpi_reference = '2.6'
)
,
selected_df AS (
    -- The elapsed time is compared as a full interval. EXTRACT(DAY FROM ...)
    -- would drop the hours/minutes part, so e.g. 2 days 23 hours would wrongly
    -- count as "<= 2 days" (the SQLite prototype compares fractional days).
    -- The filter is applied directly on df, so no un-aliased derived table is
    -- needed (PostgreSQL before 16 rejects a FROM subquery without an alias).
    SELECT *
    ,CASE WHEN blocked_timestamp <= created_timestamp + INTERVAL '2' DAY THEN 1 ELSE 0 END AS processed_in_time
    FROM df
    WHERE case_type = 'Sales Application'
    AND blocked_sequence = 1
    AND request_type = 'ID&V'
    -- NOTE(review): the written rule says "today-30 to today", while this window
    -- runs from CURRENT_DATE-31 to CURRENT_DATE-1 (both taken at 00:00 when
    -- compared with a timestamp). Left unchanged - confirm the intended window.
    AND (blocked_timestamp BETWEEN (CURRENT_DATE-31) AND (CURRENT_DATE-1))
)
,
agg_results AS (
SELECT COUNT(*) AS Total
,SUM(processed_in_time) AS time_to_process_lte_2_days
FROM selected_df
)

SELECT (SELECT pk_id FROM current_date_id) AS fk_date_id
,(SELECT pk_id FROM current_kpi_id) AS fk_kpi_id
-- NULLIF keeps Value NULL (rather than risking a division error) when no case matches.
,(time_to_process_lte_2_days * 100.0 / NULLIF(Total, 0)) AS Value
FROM agg_results