In [None]:
#load packages
import pymc3 as pm
from datetime import datetime, timedelta, time, date
from scipy import stats
import json
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.io import gbq
from google.cloud import bigquery
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import scipy.stats as stats
#import geopandas
from shapely import wkt
import scipy
from scipy import signal
%matplotlib inline
from IPython.core.pylabtools import figsize
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive # to allow colab save file in my drive
drive.mount('/content/drive')
# Authenticate the current Colab user.
# NOTE(review): presumably these credentials are what the BigQuery client uses later — confirm.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')
# Record the current working directory.
cwd=os.getcwd()
# NOTE(review): this bare expression is not the last statement of the cell, so it displays nothing.
cwd
%unload_ext google.colab.data_table
print(f"Running on PyMC3 v{pm.__version__}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Authenticated
The google.colab.data_table extension is not loaded.
Running on PyMC3 v3.7


# Load data

In [None]:
# Project the BigQuery client is created for.
project_id = "logistics-data-staging-flat"
client = bigquery.Client(project = project_id)

# The query returns one row per distinct
# (entity_id, order_code_google, central_commission, reg_commission):
#   * central_commission is read from cl_central_dwh.orders, joined on entity and order code.
#   * reg_commission is order_commissions.commission_local (rows with billing_tool 'backend'
#     are excluded from that join); when it is NULL the value falls back to
#     commission_base_local * commission_percentage_combined for Thailand, Philippines and
#     Hong Kong, and to fct_orders.commission_local for every other country.
# Only billable orders with date_utc between 2021-01-01 and 2021-01-07 are selected.
# All joins are LEFT joins, so an order with no match in the central table keeps a NULL
# central_commission.
query = """
with countries as (
select rdbms_id, entity_id,country_iso 
from `dhh---analytics-apac.pandata.dim_countries`
group by 1,2,3
order by 3
)

select
  entity_id
  , order_code_google
  , cen_order.value.commission_local as central_commission
  , IFNULL(order_commissions.commission_local,
    CASE WHEN fct_orders.country_name in ('Thailand', 'Philippines', 'Hong Kong')
    THEN commission_base_local * commission_percentage_combined
    ELSE fct_orders.commission_local END) as reg_commission,
from `dhh---analytics-apac.pandata.fct_orders` as fct_orders
left join countries co on fct_orders.rdbms_id = co.rdbms_id
left join `dhh---analytics-apac.pandata_report.order_commissions` as order_commissions
         on order_commissions.rdbms_id = fct_orders.rdbms_id
        and order_commissions.order_code = fct_orders.code
        and order_commissions.billing_tool != 'backend'
left join `fulfillment-dwh-production.cl_central_dwh.orders` cen_order on co.entity_id = cen_order.global_entity_id and cen_order.order_id = fct_orders.order_code_google
where is_billable
      AND fct_orders.date_utc between "2021-01-01" and "2021-01-07"
      AND fct_orders.created_date_local < current_date()
group by 1,2,3,4
    """

# Run the query and load the complete result set into a pandas DataFrame.
df = client.query(query).to_dataframe()

In [None]:
# Preview the first rows of the loaded order data.
df.head()

Unnamed: 0,entity_id,order_code_google,central_commission,reg_commission
0,FP_JP,l8jr-ao7i,412,412.0
1,FP_JP,a9qz-85uv,660,660.0
2,FP_JP,g8fl-flqj,2188,2188.0
3,FP_JP,a5vd-z32v,695,695.0
4,FP_JP,b3gk-br9a,234,234.0


## Check missing values

In [None]:
# Count, for each commission source, the orders whose commission value is missing
missing_checks = (
    ("central data", "central_commission"),
    ("regional data", "reg_commission"),
)
for label, column in missing_checks:
    print(label, df[column].isna().sum())

central data 220
regional data 0


In [None]:
# Discard every row that contains a missing value, then store the central
# commission as float so it can be compared with the regional commission.
df = df.dropna().astype({"central_commission": float})

In [None]:
# Summary statistics of the numeric columns (the two commission values).
df.describe()

Unnamed: 0,central_commission,reg_commission
count,13529940.0,13529940.0
mean,133.4658,134.4991
std,1144.202,1144.126
min,-2322.0,-2322.0
25%,9.73,11.44
50%,29.0,30.0
75%,56.07,57.53571
max,763200.0,763200.0


## Check orders with unequal commissions

In [None]:
# Flag each order by whether both sources report the same commission.
# The comparison is exact, so amounts that differ only by rounding count as unequal.
df['equal'] = df['central_commission'].eq(df['reg_commission'])

In [None]:
# Count orders per entity, split by whether the two commission sources agree.
equal = (
    df.pivot_table("order_code_google", index="entity_id", columns="equal", aggfunc="count")
    # Guarantee both outcome columns exist: when every order matches (or none does) the
    # pivot yields a single column and equal[False] / equal[True] would raise KeyError.
    .reindex(columns=[False, True])
    # An entity with no orders for one outcome gets a count of 0 instead of NaN.
    .fillna(0)
    .reset_index()
)
equal["total"] = equal[False] + equal[True]
# Share of orders whose commissions differ; despite the column name this is a 0-1 fraction.
equal["false_percentage"] = equal[False] / equal["total"]

In [None]:
# Bar chart of the share of orders, per entity, whose central and regional commissions differ.
# A title and readable axis labels are set so the figure can be understood on its own.
fig = px.bar(
    equal,
    x='entity_id',
    y='false_percentage',
    title='Share of orders with mismatching commission, by entity',
    labels={
        'entity_id': 'Entity',
        'false_percentage': 'Share of orders with mismatching commission',
    },
)
fig.show()

In [None]:
# Overall number of orders with matching (True) and mismatching (False) commission.
df['equal'].value_counts()

True     9840425
False    3689510
Name: equal, dtype: int64

## Check descriptive data

In [None]:
# Reshape to long format: one row per order and commission source, with the
# source column name in "source" and its value in "commission".
df_melt = df.melt(
    id_vars=["entity_id", "order_code_google"],
    value_vars=["central_commission", "reg_commission"],
    var_name="source",
    value_name="commission",
)

In [None]:
# Preview the first rows of the long-format frame.
df_melt.head()

Unnamed: 0,entity_id,order_code_google,source,commission
0,FP_JP,l8jr-ao7i,central_commission,412.0
1,FP_JP,a9qz-85uv,central_commission,660.0
2,FP_JP,g8fl-flqj,central_commission,2188.0
3,FP_JP,a5vd-z32v,central_commission,695.0
4,FP_JP,b3gk-br9a,central_commission,234.0


In [None]:
# Distinct entity ids and commission sources present in the long-format frame
entities, sources = (
    df_melt[column].unique().tolist() for column in ("entity_id", "source")
)

In [None]:
# Descriptive statistics of the commission for every entity / source combination.
#
# A single groupby pass replaces the nested loop that re-filtered the whole frame for each
# combination. It also no longer binds results to the names `sum`, `min` and `max`, which
# shadowed the Python builtins for every cell executed afterwards in the same kernel.

def quantile25(values):
    """Return the 25th percentile of a commission Series."""
    return values.quantile(.25)


def quantile75(values):
    """Return the 75th percentile of a commission Series."""
    return values.quantile(.75)


result = (
    df_melt.groupby(["entity_id", "source"])["commission"]
    # Callables are labelled by their function name, which gives the quantile columns.
    .agg(["sum", "mean", "std", "min", quantile25, "median", quantile75, "max"])
    # Keep the previous row order: entities in `entities` order, every source per entity.
    # A combination with no rows shows up as an all-NaN row.
    .reindex(pd.MultiIndex.from_product([entities, sources]))
    .rename_axis(["entity", "source"])
    .reset_index()
)
result

Unnamed: 0,entity,source,sum,mean,std,min,quantile25,median,quantile75,max
0,FP_JP,central_commission,14918620.0,454.351241,294.22863,0.0,273.0,383.0,546.0,6583.0
1,FP_JP,reg_commission,14918620.0,454.351241,294.22863,0.0,273.0,383.0,546.0,6583.0
2,FP_BD,central_commission,16726890.0,53.392601,75.251184,0.0,12.1,36.4,64.4,4510.0
3,FP_BD,reg_commission,16721210.0,53.374478,75.234645,0.0,12.1,36.4,64.4,4510.0
4,FP_HK,central_commission,22355940.0,39.906864,49.974692,0.0,19.2,30.24,47.52,3358.32
5,FP_HK,reg_commission,22355940.0,39.906864,49.974692,0.0,19.2,30.24,47.52,3358.32
6,FP_KH,central_commission,150770.8,1.106818,0.944394,-0.08,0.53,0.83,1.35,31.54
7,FP_KH,reg_commission,150770.8,1.106818,0.944394,-0.08,0.53,0.83,1.35,31.54
8,FP_LA,central_commission,989517700.0,9854.674199,8672.365659,0.0,4500.0,7500.0,12000.0,763200.0
9,FP_LA,reg_commission,989517700.0,9854.674199,8672.365659,0.0,4500.0,7500.0,12000.0,763200.0
