In [1]:
import dask
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client

In [2]:
def is_dask_df(df):
    """
    Report whether ``df`` is a dask collection.

    The check is delegated to ``dask.is_dask_collection``, so despite the
    function name it is not specific to DataFrames.

    :param df: object to inspect
    :return: whatever ``dask.is_dask_collection(df)`` returns
    """
    verdict = dask.is_dask_collection(df)
    return verdict


def to_dd(df, chunksize=5000):
    """
    Return ``df`` as a dask DataFrame.

    :param df: input frame; returned unchanged when ``is_dask_df(df)`` is true
    :param chunksize: forwarded to ``dd.from_pandas`` when a conversion happens
    :return: ``df`` itself, or the result of ``dd.from_pandas(df, chunksize=chunksize)``
    """
    already_lazy = is_dask_df(df)
    return df if already_lazy else dd.from_pandas(df, chunksize=chunksize)


def to_df(df, **kwargs):
    """
    Return a computed (non-lazy) version of ``df``.

    :param df: input frame
    :param kwargs: forwarded to ``df.compute`` when ``df`` is a dask collection;
        ignored otherwise
    :return: ``df.compute(**kwargs)`` for dask collections, else ``df`` unchanged
    """
    if not is_dask_df(df):
        # Nothing to compute: hand the object back untouched.
        return df
    return df.compute(**kwargs)


def dd_read_parquet(paths, optimize_batch_read=True, **kwargs):
    """
    Read parquet data with dask, optionally reading a list of paths one by one.

    When ``paths`` is a ``list`` and ``optimize_batch_read`` is truthy, every
    entry is read with its own ``dd.read_parquet`` call and the resulting
    frames are combined with ``dd.concat(..., interleave_partitions=True)``.
    In every other case ``paths`` is handed to a single ``dd.read_parquet``
    call as-is.

    :param paths: a single path, or a list of paths
    :type paths: list, or any other value accepted by ``dd.read_parquet``
    :param optimize_batch_read: read each list entry separately and concatenate;
        has no effect unless ``paths`` is a list
    :type optimize_batch_read: bool
    :param kwargs: forwarded unchanged to every ``dd.read_parquet`` call
    :type kwargs: dict
    :return: the frame returned by ``dd.read_parquet`` or by ``dd.concat``
    :rtype: presumably a dask DataFrame -- determined by ``dd``
    """
    read_individually = isinstance(paths, list) and optimize_batch_read
    if not read_individually:
        return dd.read_parquet(paths, **kwargs)

    per_path_frames = [
        dd.read_parquet(single_path, **kwargs) for single_path in paths
    ]
    return dd.concat(per_path_frames, interleave_partitions=True)


In [3]:
# Load the parquet dataset through the dd_read_parquet helper.
# `paths` is a plain string here, so the helper hands it straight to
# dd.read_parquet; optimize_batch_read only matters for list inputs.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
ddf = dd_read_parquet('/Users/qhkjit/Desktop/mt_day_4.dataset', engine='fastparquet', optimize_batch_read=False)
# Bare last expression: display the frame's repr as the cell output.
ddf

Unnamed: 0_level_0,订单ID,商家名称,订单来源,服务费折扣,实付金额,收入小计,基础邮资,时段补贴,冬季补贴,附加补贴,距离补贴,计重补贴,品类补贴,准时补贴,大额单补贴,难度补贴,服务包,配送费,取餐前退款金额,海葵品类,预订单,导航距离(米),重量(kg),区域难度,骑手名称,订单类型,订单状态,品类时段补贴,顾客修改地址费,外卖品类,订单送达/取消时间,品牌补贴,杯数补贴,品牌,杯数,基础邮资调减,meta_month,meta_day
npartitions=35,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
,object,object,object,object,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,object,object,object,object,object,float64,object,object,object,object,object,object,object,object,object,object,object,object,float64,int64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [10]:
# Select the first partition of `ddf` and display its repr as the cell output.
ddf.partitions[0]

Unnamed: 0_level_0,订单ID,商家名称,订单来源,服务费折扣,实付金额,收入小计,基础邮资,时段补贴,冬季补贴,附加补贴,距离补贴,计重补贴,品类补贴,准时补贴,大额单补贴,难度补贴,服务包,配送费,取餐前退款金额,海葵品类,预订单,导航距离(米),重量(kg),区域难度,骑手名称,订单类型,订单状态,品类时段补贴,顾客修改地址费,外卖品类,订单送达/取消时间,品牌补贴,杯数补贴,品牌,杯数,基础邮资调减,meta_month,meta_day
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1
,object,object,object,object,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,object,object,object,object,object,float64,object,object,object,object,object,object,object,object,object,object,object,object,float64,int64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [8]:
# Compute the frame loaded earlier as `ddf`. The previous name `dask_df` is not
# defined anywhere in this notebook, so the cell raised NameError on a fresh
# kernel (Restart & Run All).
df = to_df(ddf)
# df.to_excel('/Users/qhkjit/Desktop/mt_day_4.xlsx')
# Show only the first rows rather than dumping the whole frame.
df.head()

Unnamed: 0,订单ID,商家名称,订单来源,服务费折扣,实付金额,收入小计,基础邮资,时段补贴,冬季补贴,附加补贴,...,顾客修改地址费,外卖品类,订单送达/取消时间,品牌补贴,杯数补贴,品牌,杯数,基础邮资调减,meta_month,meta_day
0,1200658813083534442,奈雪的茶（太原北美新天地店）,平台单,,20.0,8.28,5.65,0.0,0.0,0.0,...,,奶茶,2023-08-01 12:19:48,2.15,,奈雪的茶,1,0.0,202308,20230823
1,1200658782341160919,奈雪的茶（太原北美新天地店）,平台单,,17.1,8.28,5.65,0.0,0.0,0.0,...,,奶茶,2023-08-01 12:16:05,2.15,,奈雪的茶,1,0.0,202308,20230823
2,1200658812049291734,奈雪的茶（太原北美新天地店）,平台单,,26.8,8.28,5.65,0.0,0.0,0.0,...,,奶茶,2023-08-01 12:14:57,2.15,,奈雪的茶,2,0.0,202308,20230823
3,1200658823252349585,奈雪的茶（太原北美新天地店）,平台单,,41.0,8.28,5.65,0.0,0.0,0.0,...,,奶茶,2023-08-01 12:33:12,2.15,,奈雪的茶,3,0.0,202308,20230823
4,1200658824127912577,奈雪的茶（太原北美新天地店）,平台单,,39.1,8.28,5.65,0.0,0.0,0.0,...,,奶茶,2023-08-01 12:34:54,2.15,,奈雪的茶,2,0.0,202308,20230823


In [4]:
# Create a dask.distributed Client with default arguments.
# NOTE(review): this cell sits after the compute cell in the file; if the client
# is meant to run that computation it presumably belongs before it -- confirm.
client = Client()
# Bare last expression: display the client's repr as the cell output.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61763,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:61778,Total threads: 2
Dashboard: http://127.0.0.1:61781/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:61766,
Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-pwlhdq2a,Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-pwlhdq2a

0,1
Comm: tcp://127.0.0.1:61775,Total threads: 2
Dashboard: http://127.0.0.1:61780/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:61767,
Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-inouox8e,Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-inouox8e

0,1
Comm: tcp://127.0.0.1:61776,Total threads: 2
Dashboard: http://127.0.0.1:61779/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:61768,
Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-_e7x7dnb,Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-_e7x7dnb

0,1
Comm: tcp://127.0.0.1:61777,Total threads: 2
Dashboard: http://127.0.0.1:61782/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:61769,
Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-z_fzzqu0,Local directory: /var/folders/jg/g_sml8hs01z_489v147wbhkr0000gn/T/dask-scratch-space/worker-z_fzzqu0
