In [1]:
import pandas as pd

In [2]:
import os
from glob import glob
from io import StringIO

import pyarrow.parquet as pq
from sqlalchemy import Date as sadt

In [3]:
def convertDateColumns(df:pd.DataFrame) -> pd.DataFrame:
    """Prepare a data-quality frame for loading: dates as ``date``, counters as int64.

    * ``curr_date``, ``ord_date`` and ``order_date`` (whichever are present) are
      parsed with the ``%Y-%m-%d`` format and stored as ``datetime.date`` objects.
    * A ``null_domain`` column, if present, is dropped.
    * Every remaining column except ``seller_np``, ``order_status`` and
      ``cancellation_code`` is cast to ``int64`` unless it already has that dtype.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Frame containing any subset of the columns described above.

    Returns:
        A new DataFrame with the conversions applied.

    Raises:
        ValueError: if a date value does not match ``%Y-%m-%d`` or a counter
            column holds values that cannot be cast to an integer (e.g. NaN).
    """
    date_cols = ("curr_date", "ord_date", "order_date")
    text_cols = ("seller_np", "order_status", "cancellation_code")

    # drop() hands back a new frame, so the assignments below never write into
    # the caller's object (the original mutated it through df.loc[...] = ...).
    out = df.drop(columns="null_domain", errors="ignore")

    for col in date_cols:
        if col in out.columns:
            out[col] = pd.to_datetime(out[col], format="%Y-%m-%d").dt.date

    skip = set(date_cols) | set(text_cols)
    for col in out.columns:
        if col in skip:
            continue
        # Cast to an explicit "int64": plain `int` is platform dependent and can
        # yield int32 on Windows, which contradicts the int64 check.
        if out[col].dtype != "int64":
            out[col] = out[col].astype("int64")
    return out

In [4]:
# Folder holding the parquet extracts that the glob patterns below are matched against.
# This is an absolute, machine-specific Windows path: change it before running elsewhere.
files_loc = "D:\\DATA_DUMP\\DATA_QUALITY\\DATA_QUALITY_2024-10-22\\"

In [5]:
# Collect the files of each dataset by filename pattern (one list per dataset).
main_files, dim_sellers_files, dim_ord_stat_files = [
    glob(files_loc + pattern)
    for pattern in ("/*dq_main*", "/*dim_sellers*", "/*order_status*")
]

In [6]:
dim_ord_stat_files

['D:\\DATA_DUMP\\DATA_QUALITY\\DATA_QUALITY_2024-10-22\\dim_order_status_2024-05-01.parquet',
 'D:\\DATA_DUMP\\DATA_QUALITY\\DATA_QUALITY_2024-10-22\\dim_order_status_2024-06-01.parquet',
 'D:\\DATA_DUMP\\DATA_QUALITY\\DATA_QUALITY_2024-10-22\\dim_order_status_2024-07-01.parquet',
 'D:\\DATA_DUMP\\DATA_QUALITY\\DATA_QUALITY_2024-10-22\\dim_order_status_2024-08-01.parquet',
 'D:\\DATA_DUMP\\DATA_QUALITY\\DATA_QUALITY_2024-10-22\\dim_order_status_2024-09-01.parquet',
 'D:\\DATA_DUMP\\DATA_QUALITY\\DATA_QUALITY_2024-10-22\\dim_order_status_2024-10-01.parquet']

In [7]:
# Never keep the database password in the notebook.
# DQ_DB_URL may supply a complete SQLAlchemy URL. The fallback deliberately omits
# the password so that libpq (the client library psycopg connects through) can
# resolve it from the PGPASSWORD environment variable or the pgpass password file.
conn_str = os.environ.get(
    "DQ_DB_URL",
    "postgresql+psycopg://postgres@192.168.1.10:5432/db_stage",
)

### Aggregated View

In [8]:
# read_parquet is handed the whole list of paths for each dataset.
df_dq_main, df_dim_sellers, df_ord_status = map(
    pd.read_parquet,
    (main_files, dim_sellers_files, dim_ord_stat_files),
)

In [9]:
# Sum every counter per (ord_date, seller_np). Each output column keeps the name
# of the column it sums; the tuple below fixes the output column order.
df_agg_view = (
    df_dq_main
    .groupby(['ord_date', 'seller_np'])
    .agg(**{
        metric: (metric, 'sum')
        for metric in (
            'total_orders',
            'total_canceled_orders',
            'null_fulfilment_id',
            'null_net_tran_id',
            'null_qty',
            'null_itm_fulfilment_id',
            'null_del_pc',
            'null_created_date_time',
            'null_del_cty',
            'null_cans_code',
            'null_cans_dt_time',
            'null_ord_stats',
            'null_fulfil_status',
            'null_itm_cat',
            'null_cat_cons',
            'null_sell_pincode',
            'null_prov_id',
            'null_itm_id',
            'null_sell_np',
            'null_net_ord_id',
            'null_sell_cty',
        )
    })
    .reset_index()
)


In [10]:
# Store ord_date as datetime.date objects and force the counter columns to integers.
df_agg_view = df_agg_view.pipe(convertDateColumns)

In [11]:
df_agg_view.head(5)

Unnamed: 0,ord_date,seller_np,total_orders,total_canceled_orders,null_fulfilment_id,null_net_tran_id,null_qty,null_itm_fulfilment_id,null_del_pc,null_created_date_time,...,null_ord_stats,null_fulfil_status,null_itm_cat,null_cat_cons,null_sell_pincode,null_prov_id,null_itm_id,null_sell_np,null_net_ord_id,null_sell_cty
0,2024-05-01,agg.dominos.co.in,1595,20,0,0,0,0,0,0,...,0,0,12,12,1,0,0,0,0,1
1,2024-05-01,agrimart.api.agrevolution.in,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2024-05-01,api-ondc.dlyb.in,32,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2024-05-01,api.eksecond.in,432,109,0,0,0,0,0,0,...,0,0,47,47,0,0,0,0,0,0
4,2024-05-01,api.esamudaay.com/ondc/sdk/bpp/retail/bzspr,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_agg_view.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8451 entries, 0 to 8450
Data columns (total 23 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ord_date                8451 non-null   object
 1   seller_np               8451 non-null   object
 2   total_orders            8451 non-null   int64 
 3   total_canceled_orders   8451 non-null   int64 
 4   null_fulfilment_id      8451 non-null   int64 
 5   null_net_tran_id        8451 non-null   int64 
 6   null_qty                8451 non-null   int64 
 7   null_itm_fulfilment_id  8451 non-null   int64 
 8   null_del_pc             8451 non-null   int64 
 9   null_created_date_time  8451 non-null   int64 
 10  null_del_cty            8451 non-null   int64 
 11  null_cans_code          8451 non-null   int64 
 12  null_cans_dt_time       8451 non-null   int64 
 13  null_ord_stats          8451 non-null   int64 
 14  null_fulfil_status      8451 non-null   int64 
 15  null

In [13]:
# Load the per-day, per-seller aggregate into data_sanity.aggregated_view.
# NOTE: if_exists='append' adds to whatever the table already holds, so running
# this cell twice loads the same rows twice.
df_agg_view.to_sql(
    "aggregated_view",
    conn_str,
    schema="data_sanity",
    if_exists='append',
    index=False,
)

-1

### Aggregated Sum

In [14]:
# Take an explicit copy: the following cells add columns to df_agg_sum, and
# writing into a bare column selection of df_agg_view can raise
# SettingWithCopyWarning (pandas cannot tell whether the parent was meant).
df_agg_sum = df_agg_view[["ord_date","seller_np","total_orders","total_canceled_orders"]].copy()

In [15]:
# canc_metrices = null_cans_code + null_cans_dt_time, row by row.
df_agg_sum['canc_metrices'] = df_agg_view['null_cans_code'].add(df_agg_view['null_cans_dt_time'])

In [16]:
# Orders that were not cancelled.
df_agg_sum['completed_orders'] = df_agg_view['total_orders'].sub(df_agg_view['total_canceled_orders'])

# Row-wise total of the null counters. null_cans_code and null_cans_dt_time are
# left out of this total. skipna=False keeps the NaN-propagating behaviour of a
# chained `+`.
df_agg_sum['sum_missing_cols'] = df_agg_view[[
    'null_fulfilment_id',
    'null_net_tran_id',
    'null_qty',
    'null_itm_fulfilment_id',
    'null_del_pc',
    'null_created_date_time',
    'null_del_cty',
    'null_ord_stats',
    'null_fulfil_status',
    'null_itm_cat',
    'null_cat_cons',
    'null_sell_pincode',
    'null_prov_id',
    'null_itm_id',
    'null_sell_np',
    'null_net_ord_id',
    'null_sell_cty',
]].sum(axis=1, skipna=False)

In [17]:
df_agg_sum.head(5)

Unnamed: 0,ord_date,seller_np,total_orders,total_canceled_orders,canc_metrices,completed_orders,sum_missing_cols
0,2024-05-01,agg.dominos.co.in,1595,20,9,1575,42
1,2024-05-01,agrimart.api.agrevolution.in,1,0,0,1,0
2,2024-05-01,api-ondc.dlyb.in,32,0,0,32,30
3,2024-05-01,api.eksecond.in,432,109,0,323,366
4,2024-05-01,api.esamudaay.com/ondc/sdk/bpp/retail/bzspr,6,0,0,6,0


In [18]:
df_agg_sum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8451 entries, 0 to 8450
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ord_date               8451 non-null   object
 1   seller_np              8451 non-null   object
 2   total_orders           8451 non-null   int64 
 3   total_canceled_orders  8451 non-null   int64 
 4   canc_metrices          8451 non-null   int64 
 5   completed_orders       8451 non-null   int64 
 6   sum_missing_cols       8451 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 462.3+ KB


In [19]:
df_agg_sum["total_orders"].dtype == 'int64'

True

In [21]:
# Load the summary frame into data_sanity.aggregated_sum.
# NOTE: if_exists='append' adds to whatever the table already holds, so running
# this cell twice loads the same rows twice.
df_agg_sum.to_sql(
    "aggregated_sum",
    conn_str,
    schema="data_sanity",
    if_exists='append',
    index=False,
)

-1

### Aggregated Order Status

In [28]:
# Number of non-null order_status rows per (order_date, order_status, seller_np).
df_agg_ord_stats = (
    df_ord_status
    .groupby(['order_date', 'order_status', 'seller_np'])
    .agg(count=('order_status', 'count'))
    .reset_index()
)

In [29]:
# Newest order_date first, sellers alphabetical within a day, fresh 0..n-1 index;
# then convert order_date to datetime.date and cast the count column to integer.
df_agg_ord_stats = (
    df_agg_ord_stats
    .sort_values(
        by=['order_date', 'seller_np'],
        ascending=[False, True],
        ignore_index=True,
    )
    .pipe(convertDateColumns)
)

In [30]:
df_agg_ord_stats.head(5)

Unnamed: 0,order_date,order_status,seller_np,count
0,2024-10-21,Accepted,agg.dominos.co.in,1
1,2024-10-21,Cancelled,agg.dominos.co.in,2
2,2024-10-21,Completed,agg.dominos.co.in,1
3,2024-10-21,Created,agg.dominos.co.in,1
4,2024-10-21,In-progress,agg.dominos.co.in,1


In [31]:
# Load the order-status counts into data_sanity.agg_order_stats.
# NOTE: if_exists='append' adds to whatever the table already holds, so running
# this cell twice loads the same rows twice.
df_agg_ord_stats.to_sql(
    "agg_order_stats",
    conn_str,
    schema="data_sanity",
    if_exists='append',
    index=False,
)

-1

### Column Sum

In [34]:
df_agg_view

Unnamed: 0,ord_date,seller_np,total_orders,total_canceled_orders,null_fulfilment_id,null_net_tran_id,null_qty,null_itm_fulfilment_id,null_del_pc,null_created_date_time,...,null_ord_stats,null_fulfil_status,null_itm_cat,null_cat_cons,null_sell_pincode,null_prov_id,null_itm_id,null_sell_np,null_net_ord_id,null_sell_cty
0,2024-05-01,agg.dominos.co.in,1595,20,0,0,0,0,0,0,...,0,0,12,12,1,0,0,0,0,1
1,2024-05-01,agrimart.api.agrevolution.in,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2024-05-01,api-ondc.dlyb.in,32,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2024-05-01,api.eksecond.in,432,109,0,0,0,0,0,0,...,0,0,47,47,0,0,0,0,0,0
4,2024-05-01,api.esamudaay.com/ondc/sdk/bpp/retail/bzspr,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8446,2024-10-21,seller.tipplr.in,1616,68,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8447,2024-10-21,sellerappapi.ninjacart.in,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8448,2024-10-21,shikhar-ondc.hulcd.com,94,6,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8449,2024-10-21,store1.samhita.org,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Re-aggregate df_agg_view per (ord_date, seller_np); the tuple below fixes the
# output column order (null_del_cty comes last here).
# NOTE(review): null_created_date_time is aggregated for df_agg_view but is not
# part of this frame -- confirm the omission is intentional.
df_agg_col_sum = (
    df_agg_view
    .groupby(['ord_date', 'seller_np'])
    .agg(**{
        metric: (metric, 'sum')
        for metric in (
            'total_orders',
            'total_canceled_orders',
            'null_fulfilment_id',
            'null_net_tran_id',
            'null_qty',
            'null_itm_fulfilment_id',
            'null_del_pc',
            'null_cans_code',
            'null_cans_dt_time',
            'null_ord_stats',
            'null_fulfil_status',
            'null_itm_cat',
            'null_cat_cons',
            'null_sell_pincode',
            'null_prov_id',
            'null_itm_id',
            'null_sell_np',
            'null_net_ord_id',
            'null_sell_cty',
            'null_del_cty',
        )
    })
    .reset_index()
)

In [36]:
df_agg_col_sum

Unnamed: 0,ord_date,seller_np,total_orders,total_canceled_orders,null_fulfilment_id,null_net_tran_id,null_qty,null_itm_fulfilment_id,null_del_pc,null_cans_code,...,null_fulfil_status,null_itm_cat,null_cat_cons,null_sell_pincode,null_prov_id,null_itm_id,null_sell_np,null_net_ord_id,null_sell_cty,null_del_cty
0,2024-05-01,agg.dominos.co.in,1595,20,0,0,0,0,0,9,...,0,12,12,1,0,0,0,0,1,16
1,2024-05-01,agrimart.api.agrevolution.in,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2024-05-01,api-ondc.dlyb.in,32,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30
3,2024-05-01,api.eksecond.in,432,109,0,0,0,0,0,0,...,0,47,47,0,0,0,0,0,0,272
4,2024-05-01,api.esamudaay.com/ondc/sdk/bpp/retail/bzspr,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8446,2024-10-21,seller.tipplr.in,1616,68,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
8447,2024-10-21,sellerappapi.ninjacart.in,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
8448,2024-10-21,shikhar-ondc.hulcd.com,94,6,0,0,0,0,0,6,...,0,0,0,0,0,0,0,0,0,79
8449,2024-10-21,store1.samhita.org,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Newest ord_date first, sellers alphabetical within a day (index labels are kept);
# then convert ord_date to datetime.date and cast the counter columns to integers.
df_agg_col_sum = (
    df_agg_col_sum
    .sort_values(by=['ord_date', 'seller_np'], ascending=[False, True])
    .pipe(convertDateColumns)
)


In [39]:
# Load the per-column sums into data_sanity.col_sum.
# NOTE: if_exists='append' adds to whatever the table already holds, so running
# this cell twice loads the same rows twice.
df_agg_col_sum.to_sql(
    "col_sum",
    conn_str,
    schema="data_sanity",
    if_exists='append',
    index=False,
)

-1