In [1]:
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import LocalCluster, Client
from multiprocessing import Pool
import os
import time
from dask.distributed import progress

import json
import dask.bag as db

In [2]:
client = Client(n_workers=4, threads_per_worker=1, memory_limit='3GB')

### Using 4 CPU cores parallelization

In [3]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:41585  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 11.18 GiB


In [4]:
chunk = 100000

### Read json files into dataframe

In [5]:
print('')
mybag = db.read_text('*.json.gz').map(json.loads).repartition(chunk)

df = mybag.to_dataframe()
df.head()




Unnamed: 0,id,type,actor,repo,payload,public,created_at
0,2489368070,PushEvent,"{'id': 9152315, 'login': 'davidjhulse', 'grava...","{'id': 28635890, 'name': 'davidjhulse/davesbin...","{'push_id': 536740396, 'size': 1, 'distinct_si...",True,2015-01-01T00:00:00Z
1,2489368072,PushEvent,"{'id': 5581438, 'login': 'jmoon018', 'gravatar...","{'id': 26392647, 'name': 'jmoon018/rshell-unit...","{'push_id': 536740397, 'size': 1, 'distinct_si...",True,2015-01-01T00:00:00Z
2,2489368089,CreateEvent,"{'id': 6352424, 'login': 'christoferpeterson',...","{'id': 28677542, 'name': 'christoferpeterson/V...","{'ref': 'master', 'ref_type': 'branch', 'maste...",True,2015-01-01T00:00:01Z
3,2489368095,PushEvent,"{'id': 66577, 'login': 'JakeWharton', 'gravata...","{'id': 5152285, 'name': 'square/okhttp', 'url'...","{'push_id': 536740405, 'size': 8, 'distinct_si...",True,2015-01-01T00:00:01Z
4,2489368104,PushEvent,"{'id': 9221683, 'login': 'git4ruby', 'gravatar...","{'id': 28520835, 'name': 'git4ruby/movie_revie...","{'push_id': 536740413, 'size': 1, 'distinct_si...",True,2015-01-01T00:00:03Z


#### dataframe is not loaded into memory

In [6]:
df

Unnamed: 0_level_0,id,type,actor,repo,payload,public,created_at
npartitions=100000,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,object,object,object,object,object,bool,object
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


### Unpacking actor,repo,payload tables into distinct dataframes

In [7]:
# extracting table actor
bag_actor = df['actor'].to_bag()
df_actor = bag_actor.to_dataframe()

df = df.drop('actor', axis=1)

# extracting table repo
bag_repo = df['repo'].to_bag()
df_repo = bag_repo.to_dataframe()

df = df.drop('repo', axis=1)


# extracting table payload
bag_payload = df['payload'].to_bag()
df_payload = bag_repo.to_dataframe()

df = df.drop('payload', axis=1)



In [8]:
df_actor.head()

Unnamed: 0,id,login,gravatar_id,url,avatar_url
0,9152315,davidjhulse,,https://api.github.com/users/davidjhulse,https://avatars.githubusercontent.com/u/9152315?
1,5581438,jmoon018,,https://api.github.com/users/jmoon018,https://avatars.githubusercontent.com/u/5581438?
2,6352424,christoferpeterson,,https://api.github.com/users/christoferpeterson,https://avatars.githubusercontent.com/u/6352424?
3,66577,JakeWharton,,https://api.github.com/users/JakeWharton,https://avatars.githubusercontent.com/u/66577?
4,9221683,git4ruby,,https://api.github.com/users/git4ruby,https://avatars.githubusercontent.com/u/9221683?


In [9]:
df_repo.head()

Unnamed: 0,id,name,url
0,28635890,davidjhulse/davesbingrewardsbot,https://api.github.com/repos/davidjhulse/daves...
1,26392647,jmoon018/rshell-unit-tester,https://api.github.com/repos/jmoon018/rshell-u...
2,28677542,christoferpeterson/Vadek,https://api.github.com/repos/christoferpeterso...
3,5152285,square/okhttp,https://api.github.com/repos/square/okhttp
4,28520835,git4ruby/movie_review1,https://api.github.com/repos/git4ruby/movie_re...


In [10]:
df_payload.head()

Unnamed: 0,id,name,url
0,28635890,davidjhulse/davesbingrewardsbot,https://api.github.com/repos/davidjhulse/daves...
1,26392647,jmoon018/rshell-unit-tester,https://api.github.com/repos/jmoon018/rshell-u...
2,28677542,christoferpeterson/Vadek,https://api.github.com/repos/christoferpeterso...
3,5152285,square/okhttp,https://api.github.com/repos/square/okhttp
4,28520835,git4ruby/movie_review1,https://api.github.com/repos/git4ruby/movie_re...


In [11]:
df.head()

Unnamed: 0,id,type,public,created_at
0,2489368070,PushEvent,True,2015-01-01T00:00:00Z
1,2489368072,PushEvent,True,2015-01-01T00:00:00Z
2,2489368089,CreateEvent,True,2015-01-01T00:00:01Z
3,2489368095,PushEvent,True,2015-01-01T00:00:01Z
4,2489368104,PushEvent,True,2015-01-01T00:00:03Z
