## Method 1: Pandas from ZIP Directly

All code below is adapted from `Lecture_1_2.ipynb`, provided by Gittu George for DSCI 525.

In [1]:
# import packages

import dask.dataframe as dd
import re
import os
import glob
import sys
import zipfile
import requests
from urllib.request import urlretrieve
import json
import pandas as pd
import numpy as np
from memory_profiler import memory_usage
from os import listdir

In [2]:
%load_ext memory_profiler

In [3]:
# Figshare metadata and local paths used by the cells below
article_id = 14096681  # unique identifier of the article on figshare
output_directory = "figshareairline/"  # local folder the downloaded archive is stored in
headers = {
    "Content-Type": "application/json",
}
url = f"https://api.figshare.com/v2/articles/{article_id}"  # API endpoint for this article

In [4]:
# Ask the figshare API for the article's metadata.
# The timeout (seconds) stops the cell from hanging forever on a stalled
# connection, and raise_for_status() reports an HTTP error right here instead
# of letting it surface later as a confusing KeyError on data["files"].
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()  # the full article record returned by the API
files = data["files"]   # only the per-file entries, which is what the download step needs

Next, we download the data:

In [5]:
# make directory if missing
os.makedirs(output_directory, exist_ok=True)

# download missing files
files_to_dl = ["data.zip"]
for item in files:
    if item["name"] not in files_to_dl:
        continue
    filename = os.path.join(output_directory, item["name"])
    if os.path.isfile(filename):
        continue  # already present from an earlier run
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file under the final name
    # that the isfile check above would accept as complete on the next run.
    partial = filename + ".part"
    urlretrieve(item["download_url"], partial)
    os.replace(partial, filename)

In [6]:
# start with an empty dictionary (not used in the cells visible here —
# presumably filled in later; TODO confirm)
mem = {}

In [7]:
# open the first zip archive found in the output directory, read-only
zip_paths = glob.glob(output_directory + "*.zip")
zfile = zipfile.ZipFile(zip_paths[0], "r")

# keep the archive members whose name neither starts with "__"
# nor contains "observed"
z_csvs = [
    member
    for member in zfile.namelist()
    if not member.startswith("__") and "observed" not in member
]

In [8]:
%%time
%%memit

# Read every listed archive member into its own dataframe, keyed by the part
# of the member name before "_daily", and stack them into one big dataframe.
# Passing the dict to pd.concat turns its keys into the outer "model" index
# level; each frame's original row index becomes the inner "row" level.
dat = pd.concat(
    {
        member.split('_daily')[0]: pd.read_csv(zfile.open(member))
        for member in z_csvs
    },
    names=["model", "row"],
)

peak memory: 10923.11 MiB, increment: 10824.04 MiB
CPU times: user 1min 6s, sys: 6.19 s, total: 1min 12s
Wall time: 1min 14s


In [14]:
%%time
%%memit

target = output_directory + "combo_data.csv"
if not os.path.isfile(target):
    # Build the combined file under a temporary name and rename it at the end,
    # so an interrupted run cannot leave a partial file that the isfile check
    # above would treat as finished on the next run.
    partial = target + ".part"

    # write an empty dataframe to CSV (this produces the header row only)
    # NOTE: the last coordinate column must be "lon_max"; listing "lon_min"
    # twice duplicates that column and drops "lon_max" from the output.
    cols = ["model", "time", "lat_min", "lat_max", "lon_min", "lon_max", "rain (mm/day)"]
    pd.DataFrame(columns=cols).to_csv(partial)

    # populate the combo file with ZIP contents, one archive member at a time
    # so only a single member's dataframe is held in memory
    for csv in z_csvs:
        # close the member's file handle as soon as it has been parsed
        with zfile.open(csv) as member:
            df = pd.read_csv(member)
        df["model"] = csv.split('_daily')[0]
        df[cols].to_csv(partial, mode='a', header=False)
        del df

    # publish the finished file under its final name
    os.replace(partial, target)

Exception ignored in: <function tqdm.__del__ at 0x18f10fa60>
Traceback (most recent call last):
  File "/Users/raf/opt/miniconda3/envs/525/lib/python3.9/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/Users/raf/opt/miniconda3/envs/525/lib/python3.9/site-packages/tqdm/notebook.py", line 278, in close
    self.disp(bar_style='success')
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


peak memory: 1963.16 MiB, increment: 1881.55 MiB
CPU times: user 7min 52s, sys: 22.1 s, total: 8min 14s
Wall time: 8min 30s


In [9]:
# sanity check: the combined dataframe must contain exactly 62,467,843 rows
assert len(dat) == 62_467_843

In [10]:
# preview the first five rows of the combined dataframe
dat.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day)
model,row,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MPI-ESM-1-2-HAM,0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13
MPI-ESM-1-2-HAM,1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13
MPI-ESM-1-2-HAM,2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13
MPI-ESM-1-2-HAM,3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13
MPI-ESM-1-2-HAM,4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13
