## Method 5: Only Load Wanted Columns

All code below is referenced from Lecture_1_2.ipynb provided by Gittu George for DSCI 525

In [1]:
# import packages

import dask.dataframe as dd
import re
import os
import glob
import sys
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
from os import listdir
from functools import reduce

In [2]:
%load_ext memory_profiler

In [3]:
# Necessary metadata
article_id = 14096681  # this is the unique identifier of the article on figshare
url = f"https://api.figshare.com/v2/articles/{article_id}"
headers = {"Content-Type": "application/json"}
output_directory = "figshareairline/"

In [4]:
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want

Next, we download the data:

In [5]:
# make directory if missing
os.makedirs(output_directory, exist_ok=True)

# download missing files
files_to_dl = ["data.zip"]
for item in filter(lambda x: x['name'] in files_to_dl, files):
    filename = os.path.join(output_directory, item["name"])
    if not os.path.isfile(filename):
        urlretrieve(item["download_url"], filename)

In [6]:
# list all CSVs
csvs = glob.glob(output_directory + '*.csv')

# As per Tom's guidance, we can exclude the annoying CSV that is formatted differently
csvs = [x for x in csvs if "observed" not in x]

In [7]:
# define the dtypes
colspec = {"time": "str",
           "lat_min": np.float32,
           "lat_max": np.float32,
           "lon_min": np.float32,
           "lon_max": np.float32,
           "rain (mm/day)": np.float32}

In [8]:
usecols = ["time", "rain (mm/day)"]

In [9]:
%%time
%%memit

# create a dictionary of dataframes
dat = {x.split('_daily')[0]: pd.read_csv(x, dtype=colspec, parse_dates=["time"], usecols=usecols) for x in csvs}

# reshape to one big dataframe
dat = pd.concat(dat, names=["model", "row"])

peak memory: 4793.13 MiB, increment: 4693.91 MiB
CPU times: user 46.3 s, sys: 4.73 s, total: 51 s
Wall time: 51.8 s


In [10]:
dat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,time,rain (mm/day)
model,row,Unnamed: 2_level_1,Unnamed: 3_level_1
figshareairline/MPI-ESM-1-2-HAM,0,1889-01-01 12:00:00,4.244226e-13
figshareairline/MPI-ESM-1-2-HAM,1,1889-01-02 12:00:00,4.217326e-13
figshareairline/MPI-ESM-1-2-HAM,2,1889-01-03 12:00:00,4.498125e-13
figshareairline/MPI-ESM-1-2-HAM,3,1889-01-04 12:00:00,4.251282e-13
figshareairline/MPI-ESM-1-2-HAM,4,1889-01-05 12:00:00,4.270161e-13


In [None]:
assert dat.shape[0] == 62467843