In [None]:
from dotenv import load_dotenv
import os
from glob import glob
import pandas as pd

In [None]:
import env_defs as ed

In [None]:
import utils

In [None]:
# Catalogue the files under ed.dump_loc, one list per table-name pattern.
# The generator is consumed in order, so the glob calls run in the same
# sequence as four separate assignments would.
b2b_files, logistic_files, retail_files, voucher_files = (
    glob(f'{ed.dump_loc}/*{table_name}*')
    for table_name in (
        'shared_open_data_b2b_order',
        'shared_open_data_logistics_order',
        'nhm_order_fulfillment_subset_v1',
        'shared_open_data_gift_voucher_order',
    )
)

In [None]:
# pc_tbl = pd.read_parquet(ed.script_loc+"pc.parquet")

In [None]:
# import ETL_Loader

In [None]:
# await ETL_Loader.insert_into_pincode(ETL_Loader.pg_url)

### Catalogue the Files

In [None]:
# One file list per table-name pattern, all searched under ed.raw_files.
logistics_files, b2b_files, b2c_files, voucher_files = [
    glob(f'{ed.raw_files}/*{stem}*')
    for stem in (
        'shared_open_data_logistics_order',
        'shared_open_data_b2b_order',
        'nhm_order_fulfillment_subset_v1',
        'shared_open_data_gift_voucher_order',
    )
]

### Check Transform Data

In [None]:
import asyncio
import os
import pandas as pd
from glob import glob
from dotenv import load_dotenv
import sys
import env_defs as ed
import utils

In [None]:
# Load pc.parquet (the pincode table that process_logistics merges against).
# The file name is appended directly to ed.script_loc, so that value has to
# end with a path separator.
pc_tbl = pd.read_parquet(f"{ed.script_loc}pc.parquet")

In [None]:
# Sanity check: (rows, columns) of the pincode table that was just loaded.
pc_tbl.shape

In [None]:
# Map each category label to the files found under ed.raw_files whose names
# contain that category's table name.
val_dict = {
    category: glob(f'{ed.raw_files}/*{table_name}*')
    for category, table_name in (
        ("logistics", 'shared_open_data_logistics_order'),
        ("retail_b2b", 'shared_open_data_b2b_order'),
        ("retail_b2c", 'nhm_order_fulfillment_subset_v1'),
        ("voucher", 'shared_open_data_gift_voucher_order'),
    )
}

# Per-category names; each is the very same list object stored in val_dict.
logistics_files = val_dict["logistics"]
b2b_files = val_dict["retail_b2b"]
b2c_files = val_dict["retail_b2c"]
voucher_files = val_dict["voucher"]

In [None]:
def _normalise_pincode(raw_codes: pd.Series) -> pd.Series:
    """Return pincodes as whole-number strings, e.g. ``"560001.0"`` -> ``"560001"``.

    Values that cannot be read as a finite number (missing, blank, free
    text, inf) come back as NaN, so they simply find no match in the pincode
    lookup instead of aborting the whole file.
    """
    numeric = pd.to_numeric(raw_codes, errors="coerce")
    usable = numeric.notna() & ~numeric.isin([float("inf"), float("-inf")])
    # Unusable slots get a placeholder 0 so the integer cast cannot fail; the
    # final ``where`` blanks those slots out again.
    as_text = numeric.where(usable, 0).astype(int).astype(str)
    return as_text.where(usable)


def _attach_pincode_geo(orders: pd.DataFrame, lookup: pd.DataFrame,
                        pincode_col: str, prefix: str) -> pd.DataFrame:
    """Left-join ``lookup`` on ``pincode_col`` and expose its state/district
    columns as ``<prefix>_state``, ``<prefix>_district`` and
    ``<prefix>_state_code``, replacing any such columns already present."""
    stale_cols = ["Pincode", f"{prefix}_district",
                  f"{prefix}_state", f"{prefix}_state_code"]
    return (
        orders.merge(lookup, left_on=pincode_col, right_on="Pincode", how="left")
        # errors="ignore": an input without the old geo columns is still fine.
        .drop(columns=stale_cols, errors="ignore")
        .rename(columns={"Statename": f"{prefix}_state",
                         "Districtname": f"{prefix}_district",
                         "Statecode": f"{prefix}_state_code"})
    )


def process_logistics(tgt_file: str, file_category: str) -> pd.DataFrame:
    """
    Enrich one logistics Parquet file with state/district details looked up
    from its delivery and pick-up pincodes, then write the result to Parquet.

    tgt_file: Path of the Parquet file to process. The text between
        "query_result_" and the next "_" is used as the date part of the
        output file name.
    file_category: Label appended to the output file name.

    The pincode lookup is read from the global ``pc_tbl`` (columns used:
    Pincode, Statename, Districtname, Statecode). Output goes to
    f"{ed.processed_files}_{dt_val}_{file_category}.parquet".
    NOTE(review): ``ed.processed_files`` is used as a file-name prefix, not
    joined as a directory - confirm that is the intended layout.

    Returns the enriched DataFrame when it was written. Returns an empty
    DataFrame when the file is unreadable, has an unexpected name, is empty,
    or the merge changed the row count (nothing is written in those cases).
    """
    try:
        tgt_df = pd.read_parquet(tgt_file)
    except Exception as exc:
        # Report the real cause; a bare except would also trap Ctrl-C.
        print(f"Parquet files only. Could not read {tgt_file}: {exc}")
        return pd.DataFrame()

    name_parts = tgt_file.split("query_result_")
    if len(name_parts) < 2:
        print(f"Cannot find 'query_result_<date>_' in {tgt_file}; skipping.")
        return pd.DataFrame()
    dt_val = name_parts[1].split("_")[0]
    row_count = tgt_df.shape[0]

    if row_count < 1:
        print(f"Empty Dataframe {tgt_file}")
        return pd.DataFrame()
    print(f"Proceeding with {tgt_file}")

    print("Truncating the Pincode columns")
    tgt_df["pick_up_pincode"] = _normalise_pincode(tgt_df["pick_up_pincode"])
    tgt_df["delivery_pincode"] = _normalise_pincode(tgt_df["delivery_pincode"])

    # Null keys on both sides of a merge are matched with each other, so keep
    # only lookup rows that carry a pincode; otherwise unparseable order
    # pincodes would pick up whatever a null-keyed lookup row says.
    # NOTE(review): assumes pc_tbl["Pincode"] holds strings like "560001".
    pincode_lookup = pc_tbl[pc_tbl["Pincode"].notna()]

    print("Populating Delivery Stats.")
    final_df = _attach_pincode_geo(tgt_df, pincode_lookup,
                                   "delivery_pincode", "delivery")
    print("Populating Seller Stats.")
    final_df = _attach_pincode_geo(final_df, pincode_lookup,
                                   "pick_up_pincode", "seller")

    # A left join only grows when the lookup has duplicate pincodes; refuse
    # to write a file whose row count no longer matches the source.
    print(final_df.shape[0], row_count)
    if final_df.shape[0] != row_count:
        print("Row Mismatch.")
        return pd.DataFrame()

    print("Rows Match, proceeding.\n")
    final_df.to_parquet(f"{ed.processed_files}_{dt_val}_{file_category}.parquet")
    return final_df

In [None]:
# Load the first catalogued logistics file for ad-hoc inspection.
# Raises IndexError when the glob above matched no files.
tgt_df = pd.read_parquet(logistics_files[0])


In [None]:
# Show the identifier, status, domain and shipment-type columns side by side.
tgt_df.loc[:, [
    "transaction_id",
    "fulfillment_status",
    "domain",
    "network_retail_order_id",
    "shipment_type",
]]

In [None]:
# Show just the two order identifiers next to each other.
tgt_df.loc[:, ["transaction_id", "network_retail_order_id"]]