Copyright (c) 2020, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

#### ~~Tweet_id is encoded using approximate hashing which caused hashing collisions. Exact encoding of tweet_id is blocked by a bug in cudf. All other columns are using exact encoding.~~ 
Fixed: `tweet_id` is now encoded exactly, like all other columns.

In [1]:
import os, time
# Restrict this process to GPUs 0-3. This runs before the GPU libraries are imported in the following cells.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
# Wall-clock reference points; `very_start` is read by the last cell to report the total runtime.
start = time.time()
very_start = time.time()

In [2]:
#import pandas as pd, 
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.max_rows', 500)
import cudf, cupy, time, rmm
cudf.__version__

'0.14.0'

In [3]:
import dask as dask, dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
import subprocess

In [4]:
# Start a local Dask-CUDA cluster that communicates over UCX (TCP-over-UCX and NVLink enabled),
# giving each worker a 31GB RMM memory pool, then attach a client to it.
# The scheduler address used to be fixed to one machine; DASK_SCHEDULER_IP overrides it,
# and the default keeps the original address so existing runs behave the same.
scheduler_ip = os.environ.get("DASK_SCHEDULER_IP", "10.2.61.36")
cluster = LocalCUDACluster(ip=scheduler_ip, protocol="ucx",
                           rmm_pool_size="31GB",
                           enable_tcp_over_ucx=True, enable_nvlink=True)
#cluster = LocalCUDACluster()
client = Client(cluster)
# Last expression of the cell: renders the client / cluster summary.
client

0,1
Client  Scheduler: ucx://10.2.61.36:38271  Dashboard: http://10.2.61.36:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 270.39 GB


In [5]:
%%time

NP = 16
path = '/raid/data/recsys'
df = dask_cudf.read_csv(f'{path}/input/training.tsv', sep='\x01', header=None)#, dtype=DTYPES)
df = df.repartition(npartitions=NP)
df, = dask.persist(df)
_ = wait(df)
print('number of rows:',len(df))

number of rows: 121386431
CPU times: user 2.73 s, sys: 513 ms, total: 3.25 s
Wall time: 12.4 s


In [6]:
%%time
features = [
    'text_tokens',    ###############
    'hashtags',       #Tweet Features
    'tweet_id',       #
    'media',          #
    'links',          #
    'domains',        #
    'tweet_type',     #
    'language',       #
    'timestamp',      ###############
    'a_user_id',              ###########################
    'a_follower_count',       #Engaged With User Features
    'a_following_count',      #
    'a_is_verified',          #
    'a_account_creation',     ###########################
    'b_user_id',              #######################
    'b_follower_count',       #Engaging User Features
    'b_following_count',      #
    'b_is_verified',          #
    'b_account_creation',     #######################
    'b_follows_a',    #################### Engagement Features
    'reply',          #Target Reply
    'retweet',        #Target Retweet    
    'retweet_comment',#Target Retweet with comment
    'like',           #Target Like
                      ####################
]
df.columns = features

df = df.drop('text_tokens', axis=1)
df, = dask.persist(df)
_ = wait(df)
#df.head()

CPU times: user 416 ms, sys: 39.2 ms, total: 455 ms
Wall time: 2.85 s


In [7]:
# Show the dtypes read_csv produced for the training columns (before any casting).
df.dtypes

hashtags               object
tweet_id               object
media                  object
links                  object
domains                object
tweet_type             object
language               object
timestamp               int64
a_user_id              object
a_follower_count        int64
a_following_count       int64
a_is_verified            bool
a_account_creation      int64
b_user_id              object
b_follower_count        int64
b_following_count       int64
b_is_verified            bool
b_account_creation      int64
b_follows_a              bool
reply                 float64
retweet               float64
retweet_comment       float64
like                  float64
dtype: object

In [8]:
%%time
df['id']   = 1
df['id']   = df['id'].cumsum()
df['id'] = df['id'].astype('int32')

df['reply']   = df['reply'].fillna(0)
df['retweet'] = df['retweet'].fillna(0)
df['retweet_comment'] = df['retweet_comment'].fillna(0)
df['like']    = df['like'].fillna(0)

df['reply']   = df['reply'].astype('int32')
df['retweet'] = df['retweet'].astype('int32')
df['retweet_comment'] = df['retweet_comment'].astype('int32')
df['like']    = df['like'].astype('int32')
df, = dask.persist(df)
_ = wait(df)

CPU times: user 2.76 s, sys: 3.54 s, total: 6.3 s
Wall time: 3.47 s


In [9]:
%%time

df['timestamp']         = df['timestamp'].astype( np.int32 )
df['a_follower_count']  = df['a_follower_count'].astype( np.int32 )
df['a_following_count'] = df['a_following_count'].astype( np.int32 )
df['a_account_creation']= df['a_account_creation'].astype( np.int32 )
df['b_follower_count']  = df['b_follower_count'].astype( np.int32 )
df['b_following_count'] = df['b_following_count'].astype( np.int32 )
df['b_account_creation']= df['b_account_creation'].astype( np.int32 )

df, = dask.persist(df)
_ = wait(df)
df.head()

CPU times: user 708 ms, sys: 58 ms, total: 766 ms
Wall time: 2.08 s


Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,...,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id
0,,E7D6C5094767223F6F8789A87A1937AB,,,,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1581262691,D557B03872EF8986F7F4426AE094B2FE,986,...,94,648,False,1478011810,False,0,0,0,0,1
1,83D6C79F5FCEC8D1CAD9E82C2C261611\tFFAD2DCF664C...,129F4A868712BA2B98D31AF98C3066E4,,,,Retweet,22C448FF81263D4BAF2A176145EE9EAD,1581497241,424822AC982CE0E8965506C63B44EC12,1225,...,1139,46,False,1540395738,True,0,1581497559,0,1581497622,2
2,,04C6C2175852CDBBC23B2446C7E7C22D,,DDFFB4C01DB85921C3580F614575AA6D,BE4539C53C53FFABCFD232DB100C792B,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1580978528,1EC14E26417AA926095530AC591BA9CE,3016,...,780,440,False,1432084055,True,0,0,0,1581060554,3
3,,168157826315514C120494D4DF8E6216,,,,Retweet,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581321849,9B9595B6FEB8948BDDF0D222F27E0118,2121,...,1,45,False,1534313747,False,0,0,0,1581328518,4
4,,B3E3673782A69D9D8A45D3B222F0B073,Photo,,,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1580956787,525DC99B7CB8F1AC4AD3E66C53FA38E0,813505,...,171,388,False,1490166885,False,0,0,0,1580957807,5


In [10]:
# Confirm the dtypes after the fillna / int32 casts above.
df.dtypes

hashtags              object
tweet_id              object
media                 object
links                 object
domains               object
tweet_type            object
language              object
timestamp              int32
a_user_id             object
a_follower_count       int32
a_following_count      int32
a_is_verified           bool
a_account_creation     int32
b_user_id             object
b_follower_count       int32
b_following_count      int32
b_is_verified           bool
b_account_creation     int32
b_follows_a             bool
reply                  int32
retweet                int32
retweet_comment        int32
like                   int32
id                     int32
dtype: object

In [11]:
%%time
dv = dask_cudf.read_csv(f'{path}/input/val.tsv', sep='\x01', header=None)
dv = dv.repartition(npartitions=NP)
dv, = dask.persist(dv)
_ = wait(dv)
print('number of rows:',len(dv))

number of rows: 12434735
CPU times: user 309 ms, sys: 49.9 ms, total: 359 ms
Wall time: 1.26 s


In [12]:
%%time
features = [
    'text_tokens',    ###############
    'hashtags',       #Tweet Features
    'tweet_id',       #
    'media',          #
    'links',          #
    'domains',        #
    'tweet_type',     #
    'language',       #
    'timestamp',      ###############
    'a_user_id',              ###########################
    'a_follower_count',       #Engaged With User Features
    'a_following_count',      #
    'a_is_verified',          #
    'a_account_creation',     ###########################
    'b_user_id',              #######################
    'b_follower_count',       #Engaging User Features
    'b_following_count',      #
    'b_is_verified',          #
    'b_account_creation',     #######################
    'b_follows_a',    #################### Engagement Features
    #'reply',          #Target Reply
    #'retweet',        #Target Retweet    
    #'retweet_comment',#Target Retweet with comment
    #'like',           #Target Like
                      ####################
]
dv.columns = features
dv = dv.drop('text_tokens', axis=1)
dv, = dask.persist(dv)
_ = wait(dv)

CPU times: user 126 ms, sys: 8.06 ms, total: 134 ms
Wall time: 166 ms


In [13]:
%%time

dv['reply']           = 0
dv['retweet']         = 0
dv['retweet_comment'] = 0
dv['like']            = 0

dv['id']   = 1
dv['id']   = dv['id'].cumsum()
dv['id'] = dv['id'] + len(df)
dv['id'] = dv['id'].astype('int32')

dv['reply']           = dv['reply'].astype( np.int32 )
dv['retweet']         = dv['retweet'].astype( np.int32 )
dv['retweet_comment'] = dv['retweet_comment'].astype( np.int32 )
dv['like']            = dv['like'].astype( np.int32 )

dv['timestamp']         = dv['timestamp'].astype( np.int32 )
dv['a_follower_count']  = dv['a_follower_count'].astype( np.int32 )
dv['a_following_count'] = dv['a_following_count'].astype( np.int32 )
dv['a_account_creation']= dv['a_account_creation'].astype( np.int32 )
dv['b_follower_count']  = dv['b_follower_count'].astype( np.int32 )
dv['b_following_count'] = dv['b_following_count'].astype( np.int32 )
dv['b_account_creation']= dv['b_account_creation'].astype( np.int32 )
dv, = dask.persist(dv)
_ = wait(dv)

CPU times: user 1.28 s, sys: 54.6 ms, total: 1.33 s
Wall time: 1.69 s


In [14]:
%%time
dt = dask_cudf.read_csv(f'{path}/input/competition_test.tsv', sep='\x01', header=None)
dt = dt.repartition(npartitions=NP)
dt, = dask.persist(dt)
_ = wait(dt)

CPU times: user 223 ms, sys: 29.9 ms, total: 252 ms
Wall time: 1.19 s


In [15]:
%%time
dt.columns = features
dt = dt.drop('text_tokens', axis=1)
dt, = dask.persist(dt)
_ = wait(dt)
print('number of rows:',len(dt))

number of rows: 12434838
CPU times: user 164 ms, sys: 9.26 ms, total: 173 ms
Wall time: 224 ms


In [16]:
%%time
dt['reply']           = 0
dt['retweet']         = 0
dt['retweet_comment'] = 0
dt['like']            = 0

dt['id']   = 1
dt['id']   = dt['id'].cumsum()
dt['id']   = dt['id']+len(df)+len(dv)
dt['id']   = dt['id'].astype('int32')

dt['reply']           = dt['reply'].astype( np.int32 )
dt['retweet']         = dt['retweet'].astype( np.int32 )
dt['retweet_comment'] = dt['retweet_comment'].astype( np.int32 )
dt['like']            = dt['like'].astype( np.int32 )

dt['timestamp']         = dt['timestamp'].astype( np.int32 )
dt['a_follower_count']  = dt['a_follower_count'].astype( np.int32 )
dt['a_following_count'] = dt['a_following_count'].astype( np.int32 )
dt['a_account_creation']= dt['a_account_creation'].astype( np.int32 )
dt['b_follower_count']  = dt['b_follower_count'].astype( np.int32 )
dt['b_following_count'] = dt['b_following_count'].astype( np.int32 )
dt['b_account_creation']= dt['b_account_creation'].astype( np.int32 )

dt, = dask.persist(dt)
_ = wait(dt)
print(df.shape,dv.shape,dt.shape)
dt.head()

(Delayed('int-5a2f7cf3-929e-499d-a027-b14667a91d84'), 24) (Delayed('int-8adc3760-4900-4e29-8570-6569817bb49c'), 24) (Delayed('int-d3a42868-850b-44bd-92b0-2b9dac4d9342'), 24)
CPU times: user 1.41 s, sys: 57.5 ms, total: 1.47 s
Wall time: 2.26 s


Unnamed: 0,hashtags,tweet_id,media,links,domains,tweet_type,language,timestamp,a_user_id,a_follower_count,...,b_follower_count,b_following_count,b_is_verified,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id
0,,04746004AA1F5498834CE7A4C6343D1A,,,,TopLevel,22C448FF81263D4BAF2A176145EE9EAD,1581759640,6720CC7830F94CB7465CA283300DB010,119,...,111,673,False,1478011810,True,0,0,0,0,133821167
1,024FE90EC2C01B3CDC46A5A90D66B020\t1B78BDD9C7FF...,B5C4CBE185831F3E5A58A4D81118D4C7,,,,Retweet,22C448FF81263D4BAF2A176145EE9EAD,1581668217,7DDC67265CFB6E0B4820E0BD0E33A8D3,189,...,111,673,False,1478011810,True,0,0,0,0,133821168
2,,6B6836351BFAA6D1CC1EB0386BCB8C6A,Video,,,Retweet,D3164C7FBCF2565DDF915B1B3AEFB1DC,1582046459,5456A10C7E4F7A415948EA88BE6845D6,4312,...,1150,48,False,1540395738,True,0,0,0,0,133821169
3,2D09C59493DAC82D8054E79343DFE76A,0DCF558E40500F22F84F98C4E7C38EDC,Photo,,,Retweet,125C57F4FA6D4E110983FB11B52EFD4E,1582083666,9D421C234C7B59A0EDC8D85C847D4569,272,...,781,442,False,1432084055,True,0,0,0,0,133821170
4,A0AD2EB95B7C918A311D9432E9A8FF7A,F13AA57F12DD6107D9D8544A27BDE9EC,Photo,,,Retweet,D3164C7FBCF2565DDF915B1B3AEFB1DC,1581779241,F63ECD1C7827E767E7C44E9A717056AC,1020,...,15,123,False,1385502405,False,0,0,0,0,133821171


In [17]:
# Row counts of the three inputs, taken before they are concatenated into one frame.
train_size, test0_size, test1_size = len(df), len(dv), len(dt)
print(train_size,test0_size,test1_size)

121386431 12434735 12434838


In [18]:
%%time
# Stack training, validation and test rows into one frame so the encodings below
# are computed over all three together.
df = dask_cudf.concat( [df,dv,dt] )
df, = dask.persist(df)
wait(df)
# The separate frames are no longer needed once the combined frame is persisted.
del dv, dt

CPU times: user 180 ms, sys: 3.85 ms, total: 184 ms
Wall time: 178 ms


In [19]:
# Peek at the raw (still string-valued) language column.
df.head()['language']

0    22C448FF81263D4BAF2A176145EE9EAD
1    22C448FF81263D4BAF2A176145EE9EAD
2    22C448FF81263D4BAF2A176145EE9EAD
3    D3164C7FBCF2565DDF915B1B3AEFB1DC
4    22C448FF81263D4BAF2A176145EE9EAD
Name: language, dtype: object

In [20]:
# Summary of the combined frame: column count and dtype breakdown.
df.info()

<class 'dask_cudf.core.DataFrame'>
Columns: 24 entries, hashtags to id
dtypes: object(9), bool(3), int32(12)

In [21]:
# Partition count and total row count of the combined frame.
print(df.npartitions,len(df))

48 146256004


In [22]:
"""
%%time
df['tweet_id'] = df['tweet_id'].map_partitions(lambda cudf:cudf.hash_encode(stop=1_000_000_000))
df['tweet_id'] = df['tweet_id'].astype( np.int32 )
#df['tweet_id'] = df['tweet_id'].map_partitions(lambda cudf:cudf.hash_values()%1_000_000_000)
df, = dask.persist(df)
_ = wait(df)
df.head()
"""

"\n%%time\ndf['tweet_id'] = df['tweet_id'].map_partitions(lambda cudf:cudf.hash_encode(stop=1_000_000_000))\ndf['tweet_id'] = df['tweet_id'].astype( np.int32 )\n#df['tweet_id'] = df['tweet_id'].map_partitions(lambda cudf:cudf.hash_values()%1_000_000_000)\ndf, = dask.persist(df)\n_ = wait(df)\ndf.head()\n"

In [23]:
%%time
# Replace missing media values with '' before the column is split and re-joined below.
df['media'] = df['media'].fillna( '' )
def split_join(ds,sep):
    """Collapse each string to its first two `sep`-separated tokens joined by '_'.

    Parameters
    ----------
    ds : string Series (anything exposing `.str.split(..., expand=True)`)
    sep : str
        Separator to split on.

    Returns
    -------
    Series of strings of the form '<token0>_<token1>'; a missing token is
    rendered as '' (so 'Photo' becomes 'Photo_' and '' becomes '_').
    Tokens after the second one are ignored.
    """
    # Nothing to split in an empty input (an empty split result may have no columns at all).
    if len(ds) == 0:
        return ds
    # Ask for the one-column-per-token layout explicitly instead of relying on the default.
    parts = ds.str.split(sep, expand=True)
    first = parts[0].fillna('')
    # Column 1 only exists if at least one value contained the separator;
    # indexing it unconditionally raised KeyError on inputs without any.
    if 1 in parts.columns:
        return first + '_' + parts[1].fillna('')
    return first + '_'

# Run split_join on every partition of the media column, splitting on tab.
# NOTE(review): ('O') is just the string 'O', not a tuple — presumably meant as an
# object-dtype meta hint; confirm against the dask `meta` conventions.
df['media'] = df['media'].map_partitions( lambda x:  split_join(x,'\t'), meta=('O'))

# Materialise the rewritten column and wait for completion.
df, = dask.persist(df)
_ = wait(df)

CPU times: user 516 ms, sys: 34.1 ms, total: 551 ms
Wall time: 929 ms


In [24]:
def factorize_small_cardinality(df,col):
    """Replace the values of column `col` with integer codes.

    The distinct values of `col` are computed and collected, each one is paired
    with the number `reset_index()` assigns to it, and that lookup table is
    left-joined back onto `df`. The original column is then dropped and the code
    column takes over its name. The whole lookup table is materialised at once,
    so this is meant for columns with few distinct values.

    Parameters
    ----------
    df : dask dataframe holding column `col`
    col : str
        Name of the column to encode.

    Returns
    -------
    (df, head) : the re-encoded, persisted frame, and a `head()` sample taken
        after the join while both the original and the code column are present.
    """
    tmp_col = f'{col}_encode'
    lookup = df[col].unique().compute().to_frame().reset_index()
    # Rename via `columns=`: a bare `mapper=` with no axis addresses index labels
    # under pandas semantics, which would leave the 'index' column unrenamed.
    lookup = lookup.rename(columns={'index': tmp_col})
    df = df.merge(lookup, on=col, how='left')
    df, = dask.persist(df)
    wait(df)
    head = df.head()
    # Release the lookup table before the second persist.
    del lookup
    df = df.drop(col, axis=1)
    df, = dask.persist(df)
    wait(df)
    # Give the code column the name of the column it replaces.
    df.columns = [col if name == tmp_col else name for name in df.columns]
    return df, head

In [25]:
%%time
for col in ['language','tweet_type','media']:
    df,_ = factorize_small_cardinality(df,col)
    df[col] = df[col].astype('int8')

CPU times: user 2.94 s, sys: 210 ms, total: 3.15 s
Wall time: 5.71 s


In [26]:
# Inspect the frame after language / tweet_type / media were replaced by integer codes.
df.head()

Unnamed: 0,hashtags,tweet_id,links,domains,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,...,b_account_creation,b_follows_a,reply,retweet,retweet_comment,like,id,language,tweet_type,media
0,,0A679DD46B17082ECB54EB8D98258FB8,,,1580986075,0133906BF22993CDEE417D41BB5A0FAC,9993,8570,False,1481293579,...,1525806228,True,0,0,0,0,18593,63,0,12
1,,F1CDA01EC4815A2D974DB296A609059A,,,1581474056,E3A820CAF4B0044C737C8ED8C4F57828,31707,30367,False,1249459733,...,1486151643,True,0,0,0,0,18594,54,1,12
2,6012776AECA5D0EC9EBF4B8D72FF555A,E15954621FFA57271636473E104C2088,,,1581198029,84C77F8066337B94F3CA3DD07E2C0F3A,2582264,1043,True,1288129925,...,1557711616,False,0,0,0,0,18595,54,2,4
3,,2D944D13F92A3652BF3D431D28F9B931,,,1580967091,15ED5A140B41D20C1E6FF654FB02591F,251,418,False,1443844675,...,1540838549,True,0,0,0,1580981213,18596,54,0,12
4,,8BE2354E4F03DCF2F34B7A1917254531,ADCE39195E117002CD6DD0C2B5C0C31A,BBA1D3359A23689C6783652EA73B12BA,1581536516,5480E8F28BFC097044990E17C91A6292,510762,2118,True,1262371833,...,1534792958,False,0,0,0,0,18597,59,2,4


In [27]:
%%time
tweet = df[['tweet_id']]
tweet = tweet.drop_duplicates(split_out=16)
tweet['tweet_encode'] = 1
tweet['tweet_encode'] = tweet['tweet_encode'].cumsum()
tweet, = dask.persist(tweet)
_ = wait(tweet)
tweet.head()

CPU times: user 2.08 s, sys: 161 ms, total: 2.25 s
Wall time: 6.38 s


Unnamed: 0,tweet_id,tweet_encode
1109137,0000012429A02D1B5C871FBA53A0C4DD,1
2999311,0000043B5500353E778A6B78498EE7CD,2
7068428,000004B4208284C156C06BCFAB500ACC,3
4666149,00000BF2119CD5F74998D3D407F15DB8,4
107413,00000D200443B9FE776CA28D56ECAD3C,5


In [28]:
%%time
df = df.merge(tweet,on='tweet_id',how='left')
df = df.drop('tweet_id',axis=1)
df.columns = [i if i!='tweet_encode' else 'tweet_id' for i in df.columns]
df, = dask.persist(df)
wait(df)
del tweet
df.head()

CPU times: user 6.45 s, sys: 517 ms, total: 6.97 s
Wall time: 27.9 s


Unnamed: 0,hashtags,links,domains,timestamp,a_user_id,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_user_id,...,b_follows_a,reply,retweet,retweet_comment,like,id,language,tweet_type,media,tweet_id
0,,,,1581525142,46019E3576BD47D19BC83E3A8B04EC35,7717,7688,False,1245066162,4992CD23C7A65F9D6CB1F6067D94D09C,...,False,0,0,0,0,1516532,54,1,12,2725610
1,,,,1581252945,185A653D674A8BDA4C351617C84E5311,781,498,False,1476364476,5FAA46A494872C5210B49C5CFCFAD331,...,True,0,1581255876,0,0,1971993,0,1,12,4294073
2,,,,1581550694,BF36F03D098B39CE46FAC236FD95B880,804689,2437,True,1277762928,13B5FC6FFA16C89544196B61E65DEA51,...,False,0,0,0,0,407133,54,1,12,3677744
3,,,,1581278198,1C024872A6C1EE11014644B375FEEFB8,303,370,False,1498074109,5540A2D31464BA89D5749F31399F5322,...,True,0,0,0,1581278784,1756945,54,2,6,3582362
4,,65D3B775D8D7AE83D5D9839C1FE75CDD,D0120C5A771484D1765BBA60461666B9,1580986022,F4DCB14DEA2FB8721C75846B549AFE2B,411495,12,True,1246733300,65834DED849800C03887F514B3885DD4,...,False,0,0,0,0,2092582,38,2,4,2408726


In [29]:
#%%time
#df = df.repartition(npartitions=1024)
#df, = dask.persist(df)
#_ = wait(df)

In [30]:
%%time
# Distinct user ids seen on the "a" side and on the "b" side, each persisted separately.
user_a = df[['a_user_id']].drop_duplicates(split_out=16)
user_a, = dask.persist(user_a)
_ = wait(user_a)
user_b = df[['b_user_id']].drop_duplicates(split_out=16)
user_b, = dask.persist(user_b)
wait(user_b)
print(len(user_a),len(user_b),len(df))

# Give both tables the same key name and outer-merge them so `user_a` ends up with
# every id present on either side. `dummy` is a throw-away column, dropped right after.
user_a.columns = ['user_id']
user_b.columns = ['user_id']
user_b['dummy'] = 1
user_a = user_a.merge(user_b,on='user_id',how='outer')
user_a = user_a.drop('dummy',axis=1)
user_a, = dask.persist(user_a)
wait(user_a)
print(len(user_a),len(user_b),len(df))
del user_b

# Number the combined user ids from 1 with a running sum of ones.
user_a['user_encode'] = 1
user_a['user_encode'] = user_a['user_encode'].cumsum()
user_a, = dask.persist(user_a)
_ = wait(user_a)

13774339 25315553 146256004
30870475 25315553 146256004
CPU times: user 5.19 s, sys: 347 ms, total: 5.54 s
Wall time: 12.7 s


In [31]:
%%time
df = df.merge(user_a,left_on='a_user_id',right_on='user_id',how='left')
df = df.drop(['a_user_id','user_id'],axis=1)
df.columns = [i if i!='user_encode' else 'a_user_id' for i in df.columns]
df, = dask.persist(df)
_ = wait(df)

CPU times: user 3.82 s, sys: 275 ms, total: 4.09 s
Wall time: 12.4 s


In [32]:
%%time
df = df.merge(user_a,left_on='b_user_id',right_on='user_id',how='left')
df = df.drop(['b_user_id','user_id'],axis=1)
df.columns = [i if i!='user_encode' else 'b_user_id' for i in df.columns]
df, = dask.persist(df)
wait(df)
del user_a
df.head()

CPU times: user 3.78 s, sys: 315 ms, total: 4.1 s
Wall time: 12.2 s


Unnamed: 0,hashtags,links,domains,timestamp,a_follower_count,a_following_count,a_is_verified,a_account_creation,b_follower_count,b_following_count,...,retweet,retweet_comment,like,id,language,tweet_type,media,tweet_id,a_user_id,b_user_id
0,,,,1581167611,38884,145,False,1537006624,12,252,...,0,0,1581170593,6635832,33,2,8,72414311,1493523,289386
1,,,,1581031485,54153,0,False,1564526796,176,265,...,0,0,1581049466,59662884,47,2,8,37610756,1332557,1176575
2,,,,1581115827,476,614,False,1370644514,492,218,...,0,0,0,17001753,3,1,12,72363440,1118690,319351
3,,,,1581289083,254,12,False,1507166362,29,105,...,0,0,0,97962265,4,2,4,15680803,1248545,684765
4,,,,1581367022,1009,506,False,1240020633,543,450,...,0,0,1581384002,38541399,54,2,12,4639756,1680120,433350


In [33]:
%%time
df = df.repartition(npartitions=NP)
df, = dask.persist(df)
_ = wait(df)

CPU times: user 103 ms, sys: 10.5 ms, total: 114 ms
Wall time: 254 ms


In [34]:
%%time
# Write the encoded frame as a parquet dataset under the data root, without the index.
df.to_parquet(f'{path}/dask_input/step1_output',write_index=False)

CPU times: user 1.52 s, sys: 88.6 ms, total: 1.61 s
Wall time: 11.1 s


In [35]:
# Total wall-clock time since `very_start` was set in the first cell, in minutes.
print('This notebook took %.1f minutes'%((time.time()-very_start)/60.))

This notebook took 2.1 minutes
