* Import
by Rick Bischoff 7/18/2020

The files came in an ordered directory hierarchy in NetCDF4 format.  The code below uses this hierarchy to traverse the directory structure and rewrite the files in a format that is easily parsed by JavaScript.

Specifically, the raw data is in the format (in directory ../raw/):

    <cropName>/yield_<year>.nc4
    
and the data inside the NetCDF file is in the format

    var[lat, lon] = yield
    
We will convert all of these files into one large CSV file with the following columns (pandas also writes an unnamed index column in front of them):

    crops,years,measure,lat,lon,value

Missing values are removed in the process.

In [1]:
import re
from itertools import repeat

import numpy as np
import pandas as pd
from netCDF4 import Dataset

In [2]:
# One hard-coded sample path; the next cell rebinds `files` from a directory walk.
files = [
    "../raw/wheat_winter/yield_2016.nc4",
]

In [3]:
import os
import sys

walk_dir = "../raw/"


def find_nc4_files(root_dir):
    """Return the sorted paths of every ``*.nc4`` file beneath ``root_dir``.

    Only files whose name ends in ``.nc4`` are kept, so notes or backups that
    merely contain "nc4" somewhere in their name are ignored.  The result is
    sorted because ``os.walk`` yields entries in filesystem order, which would
    otherwise make the row order of the generated CSV differ between machines.

    A ``root_dir`` that does not exist yields an empty list.
    """
    found = []
    for root, _subdirs, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith(".nc4"):
                found.append(os.path.join(root, filename))
    return sorted(found)


files = find_nc4_files(walk_dir)

print("{0} files found.".format(len(files)))

360 files found.


In [4]:
# Expected layout: ../raw/<crop>/<measure>_<year>.nc4
fileparse = re.compile(r"\.\./raw/(\w+)/([A-Za-z]+)_(\d+)\.nc4")


def load_valid_cells(path):
    """Read one NetCDF file and return its non-missing grid cells.

    The file must contain ``lat`` and ``lon`` coordinate variables and a
    two-dimensional ``var`` variable indexed as ``var[lat, lon]``.

    Returns a tuple ``(lats, lons, values, n_skipped)`` where the first three
    are equally long lists of plain Python numbers, one entry per non-missing
    cell in row-major (lat outer, lon inner) order, and ``n_skipped`` is the
    number of missing cells that were dropped.
    """
    # The context manager closes the file handle even if a read fails;
    # slicing with [:] copies the data into memory before the file is closed.
    with Dataset(path, "r") as rootgrp:
        lat_array = np.ma.getdata(rootgrp.variables["lat"][:])
        lon_array = np.ma.getdata(rootgrp.variables["lon"][:])
        grid = rootgrp.variables["var"][:]

    # Use the mask netCDF4 attaches to the array instead of comparing against
    # fill_value: an equality test never matches a NaN fill value and misses
    # cells that were masked for any other reason.
    present = ~np.ma.getmaskarray(grid)
    lat_idx, lon_idx = np.nonzero(present)  # row-major, same order as a nested loop
    cell_values = np.ma.getdata(grid)[lat_idx, lon_idx]
    n_skipped = int(present.size - lat_idx.size)

    # .tolist() yields native Python numbers, so pandas infers the column
    # dtypes from plain scalars rather than from the file's storage dtype.
    return (
        lat_array[lat_idx].tolist(),
        lon_array[lon_idx].tolist(),
        cell_values.tolist(),
        n_skipped,
    )


crops = []
years = []
variables = []
lats = []
lons = []
values = []

for f in files:
    # os.path.join uses "\" on Windows; normalise so the pattern still applies.
    parsed = fileparse.match(f.replace("\\", "/"))
    if parsed is None:
        raise ValueError(
            "Unexpected file path {0!r}: expected ../raw/<crop>/<measure>_<year>.nc4".format(f)
        )
    (crop, var, year) = parsed.groups()

    file_lats, file_lons, file_values, rows_skip = load_valid_cells(f)
    rows_added = len(file_values)

    lats.extend(file_lats)
    lons.extend(file_lons)
    values.extend(file_values)

    print("{0}\trows {1}\tskip {2}".format((crop,var,year), rows_added, rows_skip))
    crops.extend(repeat(crop, rows_added))
    years.extend(repeat(year, rows_added))
    variables.extend(repeat(var, rows_added))

data = pd.DataFrame({
    "crops": crops,
    "years": years,
    "measure": variables,
    "lat": lats,
    "lon": lons,
    "value": values
})
data.to_csv("../data.csv")

('maize', 'yield', '1981')	rows 5228	skip 253972
('maize', 'yield', '1982')	rows 15032	skip 244168
('maize', 'yield', '1983')	rows 15034	skip 244166
('maize', 'yield', '1984')	rows 15027	skip 244173
('maize', 'yield', '1985')	rows 15032	skip 244168
('maize', 'yield', '1986')	rows 15040	skip 244160
('maize', 'yield', '1987')	rows 15075	skip 244125
('maize', 'yield', '1988')	rows 15087	skip 244113
('maize', 'yield', '1989')	rows 15072	skip 244128
('maize', 'yield', '1990')	rows 15078	skip 244122
('maize', 'yield', '1991')	rows 15087	skip 244113
('maize', 'yield', '1992')	rows 15090	skip 244110
('maize', 'yield', '1993')	rows 14828	skip 244372
('maize', 'yield', '1994')	rows 15080	skip 244120
('maize', 'yield', '1995')	rows 15081	skip 244119
('maize', 'yield', '1996')	rows 15077	skip 244123
('maize', 'yield', '1997')	rows 15080	skip 244120
('maize', 'yield', '1998')	rows 15099	skip 244101
('maize', 'yield', '1999')	rows 15083	skip 244117
('maize', 'yield', '2000')	rows 15079	skip 244121
(