# Reading .json files using DASK

I created this notebook to show how I sped up reading the .json files. The methods I found in other people's notebooks took around 13 minutes to read all 14k train files. A few months ago I came across a notebook that loaded many files at once using dask, so I tried something similar here: with dask, the read time dropped to about 4 minutes (on my 8-core machine). The upside of this approach is that the same code runs on a personal laptop or on a many-core server; you only need to adjust the number of workers.

Dask can run CPU-bound tasks in parallel, so the same approach can be adapted to any other CPU-intensive processing you might need.

In [None]:
import re
import json
import dask
import numpy as np
import pandas as pd
from tqdm import tqdm
from dask.distributed import Client, wait, LocalCluster

In [None]:
# Root folder of the competition dataset.
BASE = '../input/coleridgeinitiative-show-us-the-data'

# Paths used by the cells below, all derived from BASE.
PARAMS = {
    # Plain string: writing `{BASE}` here would build a one-element set instead.
    'base_path': BASE,
    'labels_file': f'{BASE}/train.csv',
    'train_folder': f'{BASE}/train',
    # Disabled alternative entries pointing at the sample submission / test folder:
#     'labels_file': f'{BASE}/sample_submission.csv',
#     'publications_path': f'{BASE}/test',
}

In [None]:
def clean_text(input_words):
    """Normalise text to lowercase alphanumerics, square brackets and single spaces.

    The input is converted with ``str()`` and lowercased; every run of
    characters other than ASCII letters, digits, ``[`` and ``]`` is replaced by
    one space, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    input_words : Any
        Value to clean; non-string values are stringified first
        (so ``None`` becomes ``'none'``).

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    return re.sub(r'[^A-Za-z0-9\[\]]+', ' ', str(input_words).lower()).strip()

In [None]:
# Load the labels table; its path comes from the PARAMS config.
labels_path = PARAMS['labels_file']
train_df = pd.read_csv(labels_path)
display(train_df)

In [None]:
# Each distinct Id names one JSON file to read; duplicates in the labels are collapsed.
train_files = train_df['Id'].unique()
n_unique_files = len(train_files)
print('unique files: {}'.format(n_unique_files))

# Without dask

In [None]:
%%time
# Sequential baseline: read every publication one after another.
# NOTE: despite its name, `futures` holds plain DataFrames here (one per file);
# the name is kept because the next cell concatenates `futures`.
futures = []
for file in tqdm(train_files):
    # JSON is UTF-8 by specification, so do not rely on the locale default.
    with open(PARAMS['train_folder'] + '/' + file + '.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)

    # One record per section; a section without a 'text' key yields None.
    records = [
        {'file': file, 'orig_sentence': section.get('text')}
        for section in paper
    ]

    # Build the frame once per file instead of concatenating row by row.
    # Explicit columns keep the schema stable for a paper with no sections,
    # which previously reused the frame left over from the preceding file.
    file_df = pd.DataFrame(records, columns=['file', 'orig_sentence'])
    futures.append(file_df)

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 index.
sentences_df = pd.concat(futures, ignore_index=True)
display(sentences_df)

## Start a local cluster on the machine

Setting `n_workers=4` starts four worker processes, so the work is spread across four CPU cores. Set it to the number of cores available on your machine.

In [None]:
# Start the local dask cluster and connect a client to it.
# Tune N_WORKERS to match the number of CPU cores on this machine.
N_WORKERS = 4
THREADS_PER_WORKER = 4
client = Client(n_workers=N_WORKERS, threads_per_worker=THREADS_PER_WORKER)
client

In [None]:
def extract_text_from_pub(file):
    """Read one publication's JSON file and return its sections as a DataFrame.

    The file is looked up as ``<PARAMS['train_folder']>/<file>.json`` and is
    expected to contain a list of section objects (dicts).

    Parameters
    ----------
    file : str
        Publication id, i.e. the JSON file name without the ``.json`` suffix.

    Returns
    -------
    pandas.DataFrame
        One row per section with columns ``file`` (the id passed in) and
        ``orig_sentence`` (the section's ``'text'`` value, or ``None`` when the
        section has no ``'text'`` key). A publication with no sections yields
        an empty frame with the same two columns.

    Raises
    ------
    FileNotFoundError
        If the JSON file does not exist.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    path = PARAMS['train_folder'] + '/' + file + '.json'
    # JSON is UTF-8 by specification, so do not rely on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        paper = json.load(f)

    # Collect plain records and build the frame once; concatenating one-row
    # frames inside the loop is quadratic and left the result undefined for
    # a paper with no sections.
    records = [
        {'file': file, 'orig_sentence': section.get('text')}
        for section in paper
    ]
    return pd.DataFrame(records, columns=['file', 'orig_sentence'])

NOTE: the tqdm bar is misleading here. It only tracks how long it takes to submit the tasks to dask, so it reaches 100% long before the cell finishes running.

In [None]:
%%time
# While this cell runs, the kernel's CPU usage should sit close to 400%.

# Submit one task per file id and hold on to the returned futures:
# dask only keeps work alive while its future is still referenced.
futures = [
    client.submit(extract_text_from_pub, file)
    for file in tqdm(train_files)
]
# Block until every submitted task has finished.
_ = wait(futures)

In [None]:
%%time
# Pull each task's DataFrame back from the cluster, in submission order.
paragraphs = [future.result() for future in tqdm(futures)]

In [None]:
# Combine the per-file results into one table with a fresh 0..n-1 index.
sentences_df = pd.concat(paragraphs, ignore_index=True)
display(sentences_df)