## Method 7: Dask

All code below is adapted from `Lecture_1_2.ipynb`, provided by Gittu George for DSCI 525.

In [1]:
# import packages

import dask.dataframe as dd
import re
import os
import glob
import sys
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
from os import listdir
from functools import reduce

In [2]:
%load_ext memory_profiler

In [3]:
# Necessary metadata
# These four names are read by the request, download and CSV-listing cells below.
article_id = 14096681  # this is the unique identifier of the article on figshare
# figshare API endpoint for the article above; the response is parsed as JSON and its "files" entry is used
url = f"https://api.figshare.com/v2/articles/{article_id}"
# headers sent along with the GET request
headers = {"Content-Type": "application/json"}
# directory the archive is downloaded into and the CSVs are globbed from
# NOTE(review): the name mentions "airline" while the column aggregated later is rain (mm/day) --
# presumably a leftover from another example; confirm before renaming.
output_directory = "figshareairline/"

In [4]:
# Ask the figshare API for the article's metadata.
# A timeout keeps the cell from hanging indefinitely if the API does not answer.
response = requests.request("GET", url, headers=headers, timeout=30)
# Fail here on an HTTP error status rather than with a KeyError on "files" below.
response.raise_for_status()
data = response.json()            # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want

Next, we download the data:

In [5]:
# make directory if missing
os.makedirs(output_directory, exist_ok=True)

# download missing files, then unpack them
files_to_dl = ["data.zip"]
for item in filter(lambda x: x['name'] in files_to_dl, files):
    filename = os.path.join(output_directory, item["name"])
    if not os.path.isfile(filename):
        urlretrieve(item["download_url"], filename)

    # Unpack the archive so the '*.csv' glob in the next cell has files to
    # find on a fresh run. Only members not already on disk are extracted,
    # which keeps re-running this cell cheap.
    # NOTE(review): assumes the CSVs sit at the root of the archive, since
    # the glob only looks directly inside output_directory -- confirm.
    with zipfile.ZipFile(filename) as archive:
        missing_members = [
            member for member in archive.namelist()
            if not os.path.exists(os.path.join(output_directory, member))
        ]
        if missing_members:
            archive.extractall(output_directory, members=missing_members)

In [6]:
# Collect every CSV in the output directory, skipping any path that contains
# "observed": as per Tom's guidance, that CSV is formatted differently from
# the others and is excluded.
csvs = [
    csv_path
    for csv_path in glob.glob(output_directory + '*.csv')
    if "observed" not in csv_path
]

In [7]:
%%time
%%memit

# define a parser to extract the model name
def parser(path):
    """Return the model name encoded in a CSV path.

    The model name is the part of the file name (directories stripped)
    that precedes the first '_daily'. If the file name contains no
    '_daily', the whole file name is returned.
    """
    basename = os.path.basename(path)
    model_name, _, _ = basename.partition('_daily')
    return model_name

# Read all CSVs into one Dask frame. Each row's source path is stored in a
# "model" column, and `parser` converts that path into the model name.
dat = dd.read_csv(
    csvs,
    include_path_column="model",
    converters={"model": parser},
)

peak memory: 107.03 MiB, increment: 6.53 MiB
CPU times: user 83.8 ms, sys: 38.2 ms, total: 122 ms
Wall time: 1.34 s


In [8]:
%%time
%%memit

# Peek at the first rows of the Dask frame built from the CSVs.
dat.head()

peak memory: 374.64 MiB, increment: 267.60 MiB
CPU times: user 659 ms, sys: 153 ms, total: 813 ms
Wall time: 1.31 s


In [9]:
%%time
%%memit

# Total number of rows across all the CSVs that were read in.
len(dat.index)

peak memory: 3197.69 MiB, increment: 2823.05 MiB
CPU times: user 1min 13s, sys: 13.3 s, total: 1min 27s
Wall time: 25.5 s


In [10]:
%%time
%%memit

# Per-model mean and standard deviation of daily rainfall, ordered by mean.
rain_by_model = dat.groupby('model')['rain (mm/day)']
rain_stats = rain_by_model.agg(["mean", "std"])
rain_stats_sorted = rain_stats.sort_values("mean").reset_index()

# Nothing is materialised until compute(); the result is bound to `res`.
res = rain_stats_sorted.compute()
print(res)

               model      mean       std
0      MPI-ESM1-2-HR  0.995569  4.083814
1      MPI-ESM1-2-LR  1.074308  3.911700
2          KIOST-ESM  1.102353  3.852051
3         MRI-ESM2-0  1.368030  4.517987
4           GFDL-CM4  1.414485  5.024926
5   EC-Earth3-Veg-LR  1.516258  4.714335
6    MPI-ESM-1-2-HAM  1.610720  4.885519
7              NESM3  1.621936  4.971972
8        FGOALS-f3-L  1.627373  5.747396
9         ACCESS-CM2  1.787025  5.914188
10          BCC-ESM1  1.811032  5.358361
11           CanESM5  1.894328  5.835787
12       BCC-CSM2-MR  1.951832  6.200969
13    AWI-ESM-1-1-LR  2.026071  5.321889
14         FGOALS-g3  2.156419  6.015488
15       SAM0-UNICON  2.169676  6.383241
16     ACCESS-ESM1-5  2.217501  6.422397
17           TaiESM1  2.224576  5.886578
18        NorESM2-LM  2.230799  5.681562
19        NorESM2-MM  2.232966  6.151688
20         CMCC-ESM2  2.266125  5.538429
21      CMCC-CM2-HR4  2.279350  5.629965
22            MIROC6  2.301662  6.393745
23      CMCC-CM2