## Imports

1. Remove all transactions where there is confirm but no on_confirm, init but not on_init, select but not on_select. 
	* Ensure that you keep a dump that we can test and see why thats happening

2. If there are multiple selects happening for the same transaction ID and different Provider ID Keys, then take it as different Selects.

3. Time of day - take the select call timestamp

4. .Create  a sequence where Level 1= Select, 2= Init, 3 = Confirm. 
	* Lets then see the drop off rate Overall, BAP, SAP, Time of the day (make time intervals through the day). 
	* Is there a location?

5. Check the "Confirm" status for each day and match it against the confirmed orders for the day in Open Data. Then we will know whether the data is matching.

Main objective is to observe the dropoff and try to find out the factors for the same.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from dotenv import load_dotenv

In [2]:
from pyathena import connect
from pyathena.pandas.cursor import PandasCursor
import psycopg

In [3]:
from sqlalchemy import create_engine

In [4]:
load_dotenv(".env")

True

## Environent Variables

In [5]:
aws_schema = os.getenv('SCHEMA_NAME')
aws_tbl = os.getenv('TABLE_NAME')

aws_access_key = os.getenv('AWS_ACCESS_KEY')
aws_secret_key = os.getenv('AWS_SECRET_KEY')
aws_region = os.getenv('AWS_REGION')
aws_staging_dir = os.getenv('S3_STAGING_DIR')

aws_db = os.getenv('DATABASE_NAME')

In [6]:
tgt_user = os.getenv("POSTGRES_USER")
tgt_pwd = os.getenv("POSTGRES_PASSWORD")
tgt_host = os.getenv("POSTGRES_HOST")
tgt_port = os.getenv("POSTGRES_PORT") 

tgt_schema = os.getenv("POSTGRES_SCHEMA")
tgt_db = os.getenv("POSTGRES_DB")

In [7]:
tgt_user

'postgres'

## Setting up the connection

In [8]:
pg_conn = create_engine(f"postgresql+psycopg://{tgt_user}:{tgt_pwd}@{tgt_host}:{tgt_port}/{tgt_db}")

In [9]:
try:
    pandas_ath_cursor = connect(
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        s3_staging_dir=aws_staging_dir,
        region_name=aws_region,
        schema_name=aws_schema,
        cursor_class=PandasCursor).cursor()
except Exception as e:
    print(e.args[0])
else:
    print("Connected to Athena Database using pandas' connector.")

Connected to Athena Database using pandas' connector.


### DB Functions

In [10]:
def run_sql_athena(query: str, db_cursor, size: int=0):
	df = pd.DataFrame()
	try:
		print("Executing the query on AWS Athena.")
		if size != 0:
			df = db_cursor.execute(query).fetchmany(size).as_pandas()
		else:
			df = db_cursor.execute(query).as_pandas()
	except Exception as e:
		print(e.args[0])
		return
	else:
		print("Successfully executed the query.")

	return df

In [11]:
def write_to_pg(schema_name: str, table_name: str, db_conn:str, df_tgt: pd.DataFrame,chunk: int = 50000):
  try:
    df_tgt.to_sql(con=db_conn,
              chunksize=chunk,if_exists="replace", 
                            name=table_name, schema=schema_name)
  except Exception as e:
    print(e.args[0])
    return False
  else:
    print("Write Successful")
    return True

### SQL Queries

In [12]:
row_limiter = 25000

In [13]:
all_trs = f"""select distinct transaction_id, count(1) from {aws_db}.{aws_tbl}
group by transaction_id
order by count(1) desc;"""

In [14]:
# tr_funnel_all = f"""select * from {aws_db}.{aws_tbl} 
# order by transaction_id, select_timestamp;"""

In [15]:
tr_funnel_filtered = f"""select * from {aws_db}.{aws_tbl}
where transaction_id in (
select transaction_id from (
select distinct transaction_id, count(1) 
	from {aws_db}.{aws_tbl}
	group by transaction_id
	order by count(1) desc limit {row_limiter}
)) order by transaction_id, select_timestamp;"""

### Populating Data in SQL Table

#### Getting the data from AWS Athena.

In [14]:
df_all_trs = run_sql_athena(all_trs, pandas_ath_cursor)

Executing the query on AWS Athena.
Successfully executed the query.


In [17]:
# df_all_trs.to_parquet("all_trs.parquet")

In [18]:
# df_all_trs.shape

In [19]:
# df_tr_funnel_all = run_sql_athena(tr_funnel_filtered, pandas_ath_cursor)

In [20]:
# df_tr_funnel_all.to_parquet("funnel_all.parquet")

In [21]:
df_all_trs = pd.read_parquet("all_trs.parquet")

In [22]:
df_tr_funnel_all = pd.read_parquet("funnel_all.parquet")

#### Writing to Postgresql

In [23]:
# write_to_pg("tr_fun", "all_transactions",df_tgt= df_all_trs, db_conn=pg_conn)

In [24]:
# write_to_pg("tr_fun", "funnel_all",df_tgt= df_tr_funnel_all, db_conn=pg_conn)

## Analysis

In [15]:
unique_trs = df_tr_funnel_all["transaction_id"].unique()

NameError: name 'df_tr_funnel_all' is not defined

In [26]:
len(unique_trs)

25000

In [27]:
tr_types = df_tr_funnel_all["transaction_type"].unique()

In [28]:
tr_types # Get this from the database itself. 

array(['on_select', 'confirm', 'select', 'on_init', 'init', 'on_confirm'],
      dtype=object)

In [29]:
df = df_tr_funnel_all.copy()

In [30]:
df.head(5)

Unnamed: 0,bpp_id,bap_id,domain,transaction_type,transaction_id,num_count,select_timestamp,std_code,provider_name,provider_id
0,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,ONDC:RET11,on_select,0001e709-5ed3-436c-8f34-c04c27174ca5,8,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883
1,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,ONDC:RET11,confirm,0001e709-5ed3-436c-8f34-c04c27174ca5,1,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883
2,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,ONDC:RET11,select,0001e709-5ed3-436c-8f34-c04c27174ca5,8,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883
3,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,ONDC:RET11,on_init,0001e709-5ed3-436c-8f34-c04c27174ca5,2,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883
4,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,ONDC:RET11,init,0001e709-5ed3-436c-8f34-c04c27174ca5,2,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883


In [31]:
df.shape

(432099, 10)

In [32]:
df["segment_key"] = df["transaction_id"]+"__"+df["provider_id"]

In [33]:
id_cols = ["bpp_id", "bap_id","std_code","provider_name","provider_id", "transaction_id", "segment_key"]

In [34]:
for col in id_cols:
  print(col, " --> ", np.count_nonzero(df[col].unique()))

bpp_id  -->  21
bap_id  -->  6
std_code  -->  70
provider_name  -->  4865
provider_id  -->  12479
transaction_id  -->  25000
segment_key  -->  97519


In [35]:
df.drop(columns=["domain"], inplace=True)

In [36]:
df.head(5)

Unnamed: 0,bpp_id,bap_id,transaction_type,transaction_id,num_count,select_timestamp,std_code,provider_name,provider_id,segment_key
0,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,on_select,0001e709-5ed3-436c-8f34-c04c27174ca5,8,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883,0001e709-5ed3-436c-8f34-c04c27174ca5__46315883
1,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,confirm,0001e709-5ed3-436c-8f34-c04c27174ca5,1,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883,0001e709-5ed3-436c-8f34-c04c27174ca5__46315883
2,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,select,0001e709-5ed3-436c-8f34-c04c27174ca5,8,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883,0001e709-5ed3-436c-8f34-c04c27174ca5__46315883
3,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,on_init,0001e709-5ed3-436c-8f34-c04c27174ca5,2,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883,0001e709-5ed3-436c-8f34-c04c27174ca5__46315883
4,webapi.magicpin.in/oms_partner/ondc,ondc-bap.olacabs.com,init,0001e709-5ed3-436c-8f34-c04c27174ca5,2,2024-05-29T22:38:52Z,std:080,Wow! Momo,46315883,0001e709-5ed3-436c-8f34-c04c27174ca5__46315883


In [37]:
df_crosstab_segment = pd.crosstab(columns=df["transaction_type"], index=df["segment_key"])
df_crosstab_segment = df_crosstab_segment.reset_index().rename_axis(None, axis=1)

In [38]:
df_crosstab_segment.head(5)

Unnamed: 0,segment_key,confirm,init,on_confirm,on_init,on_select,select
0,0001e709-5ed3-436c-8f34-c04c27174ca5__30647898,1,1,1,1,1,1
1,0001e709-5ed3-436c-8f34-c04c27174ca5__46315883,1,1,1,1,1,1
2,000471a5-ac92-42f8-9239-19575ed2e6d5__1462330,1,1,1,1,1,1
3,000471a5-ac92-42f8-9239-19575ed2e6d5__46107284,1,1,1,1,1,1
4,000471a5-ac92-42f8-9239-19575ed2e6d5__96296,1,1,1,1,1,1


Transactions where confirm doesn't match with on-confirm

In [97]:
def pair_mismatch(tgt_df: pd.DataFrame, src: str, tgt: str, srch_col:str = "transaction_id") -> list[str]:
  """
  Returns a list of Transaction IDs where there is a pair mismatch for the given pair. 
  df = Pandas Dataframe. This should be a crosstab format dataframe. 
  src = Source Column. 
  tgt = Target Column.
  srch_col = Column to return. Default is transaction_id.
    """
  list_missing = list(tgt_df[tgt_df[src] != tgt_df[tgt]][srch_col])
  return list_missing

In [40]:
def mulitple_calls(tgt_df: pd.DataFrame, col_to_srch: str, srch_col: str = "transaction_id", occur: int = 1) -> list[str]:
  """
  Returns a list of Transaction IDs if they have multiple instances.   
  df = Pandas Dataframe. 
  col_to_srch = The column to perform the search operation in.
  srch_col = The column to return. The default value is transaction_id. 
  occur = The number of occurances to search. Default value is 1. This will always search '> occur' instances. 
    """
  return list(tgt_df[tgt_df[col_to_srch] > occur][srch_col])

Selecting Mismatch by Transaction_id

Selecting Mismatch by Segment 

In [41]:
seg_select_mismatch = pair_mismatch(df_crosstab_segment, "select", "on_select", "segment_key")
seg_init_mismatch = pair_mismatch(df_crosstab_segment, "init", "on_init", "segment_key")
seg_confirm_mismatch = pair_mismatch(df_crosstab_segment, "confirm", "on_confirm", "segment_key")

In [42]:
seg_init_mismatch[:2]

['097e2056-5c48-4485-b6eb-e5d26bae47d8__39443',
 '097e2056-5c48-4485-b6eb-e5d26bae47d8__40009']

## Removing all mismatches 

In [47]:
df_mismatch = df_crosstab_segment[df_crosstab_segment["segment_key"].isin(seg_select_mismatch)]

In [48]:
df_mismatch["err_col"] = "select"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mismatch["err_col"] = "select"


In [49]:
df_mismatch = pd.concat([df_mismatch, df_crosstab_segment[df_crosstab_segment["segment_key"].isin(seg_init_mismatch)]])

In [50]:
df_mismatch.loc[:,"err_col"].fillna(value="init", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mismatch.loc[:,"err_col"].fillna(value="init", inplace=True)


In [51]:
df_mismatch = pd.concat([df_mismatch, df_crosstab_segment[df_crosstab_segment["segment_key"].isin(seg_confirm_mismatch)]])

In [52]:
df_mismatch.loc[:,"err_col"].fillna(value="confirm", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_mismatch.loc[:,"err_col"].fillna(value="confirm", inplace=True)


In [53]:
df_mismatch.reset_index(drop=True, inplace=True)

In [54]:
df_mismatch["transaction_id"] = df_mismatch["segment_key"].str.split("__",expand=True)[0]

In [55]:
df_mismatch["provider_id"] =df_mismatch["segment_key"].str.split("__",expand=True)[1]

In [56]:
# df_mismatch.drop(columns="segment_key", inplace=True)

In [57]:
df_mismatch.head(5)

Unnamed: 0,segment_key,confirm,init,on_confirm,on_init,on_select,select,err_col,transaction_id,provider_id
0,02027099-48e0-4626-90ac-db8c892cfa62__65982,0,0,0,0,0,1,select,02027099-48e0-4626-90ac-db8c892cfa62,65982
1,03fed2b3-1bc6-4aba-8f18-900160feb9ab__63892,0,0,0,0,0,1,select,03fed2b3-1bc6-4aba-8f18-900160feb9ab,63892
2,0643de86-8d63-4f48-a6a0-1df411796889__GFFBRTBI...,0,0,0,0,0,1,select,0643de86-8d63-4f48-a6a0-1df411796889,GFFBRTBI1692777668
3,0643de86-8d63-4f48-a6a0-1df411796889__GFFBRTBI...,0,0,0,0,0,1,select,0643de86-8d63-4f48-a6a0-1df411796889,GFFBRTBI1692777699
4,0643de86-8d63-4f48-a6a0-1df411796889__GFFBRTHY...,0,0,0,0,0,1,select,0643de86-8d63-4f48-a6a0-1df411796889,GFFBRTHY1695967001


In [58]:
df_mismatch_final = df_mismatch[["provider_id", "transaction_id", "err_col"]]

In [59]:
df_mismatch_final.head(5)

Unnamed: 0,provider_id,transaction_id,err_col
0,65982,02027099-48e0-4626-90ac-db8c892cfa62,select
1,63892,03fed2b3-1bc6-4aba-8f18-900160feb9ab,select
2,GFFBRTBI1692777668,0643de86-8d63-4f48-a6a0-1df411796889,select
3,GFFBRTBI1692777699,0643de86-8d63-4f48-a6a0-1df411796889,select
4,GFFBRTHY1695967001,0643de86-8d63-4f48-a6a0-1df411796889,select


In [None]:
# write_to_pg("tr_fun", "error_data", pg_conn, df_mismatch)

## Error Data Removed

In [60]:
err_seg_ids = df_mismatch["segment_key"].unique()

In [61]:
np.count_nonzero(df_mismatch["segment_key"].unique())

195

In [62]:
df_crosstab_segment.shape[0] - np.count_nonzero(df_mismatch["segment_key"].unique())

97323

In [63]:
np.count_nonzero(df_crosstab_segment["segment_key"].unique())

97518

In [64]:
df_final = df_crosstab_segment[~df_crosstab_segment["segment_key"].isin(err_seg_ids)]

In [65]:
df_final.reset_index(drop=True)

Unnamed: 0,segment_key,confirm,init,on_confirm,on_init,on_select,select
0,0001e709-5ed3-436c-8f34-c04c27174ca5__30647898,1,1,1,1,1,1
1,0001e709-5ed3-436c-8f34-c04c27174ca5__46315883,1,1,1,1,1,1
2,000471a5-ac92-42f8-9239-19575ed2e6d5__1462330,1,1,1,1,1,1
3,000471a5-ac92-42f8-9239-19575ed2e6d5__46107284,1,1,1,1,1,1
4,000471a5-ac92-42f8-9239-19575ed2e6d5__96296,1,1,1,1,1,1
...,...,...,...,...,...,...,...
97318,fffe67f2-a60d-4201-bf75-9a97ab133c22__26992504,1,1,1,1,1,1
97319,fffe67f2-a60d-4201-bf75-9a97ab133c22__34476436,1,1,1,1,1,1
97320,fffe67f2-a60d-4201-bf75-9a97ab133c22__40773335,1,1,1,1,1,1
97321,fffe67f2-a60d-4201-bf75-9a97ab133c22__46570658,1,1,1,1,1,1


### Funnel Analysis on Final Dataframe

remove the transaction_id of df_mismatch from df

In [75]:
rem_tr_id = df_mismatch["transaction_id"].unique()

In [76]:
df_funnel = df[~df["transaction_id"].isin(rem_tr_id)]

In [78]:
df_funnel.reset_index(drop=True, inplace=True)

In [103]:
df_funnel.to_parquet("clean_data_funnel.parquet")

In [101]:
df_funnel_crosstab = pd.crosstab(columns=df_funnel["transaction_type"], index=df_funnel["segment_key"]).reset_index().rename_axis(None, axis=1)

In [104]:
df_funnel_crosstab.to_parquet("clean_data_funnel_crosstab.parquet")