<a id='dd'></a>
# data download

[download](#dd-download)  
[catalog](#dd-catalog)  
[paths](#dd-paths)   

In [1]:
# Selects which project this notebook prepares data for.
# 'gta' enables the global temperature anomaly download cell below;
# 'ea' points raw_data_dir at the earthpy downloads folder instead.
project_id = 'gta'
#project_id = 'ea'

## libraries

In [2]:
import os
import shutil
import glob
import pooch
import earthpy as et
import pandas as pd
import xarray as xr

In [3]:
import warnings # to ignore warnings
from functools import partial # for shorter traceback error messages
from IPython.core.interactiveshell import InteractiveShell # to automtically show cell output

### parameters

In [4]:
# Install a blanket 'ignore' filter so no Python warnings are shown in cell output.
warnings.filterwarnings('ignore')

In [5]:
# Shorter traceback error messages: rebind the shell's showtraceback with
# exception_only=True pre-applied through functools.partial.
get_ipython().showtraceback = partial(get_ipython().showtraceback,exception_only=True)

In [6]:
# Display the value of a cell's last expression or assignment without needing print.
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

## directories

In [7]:
# Project layout: <home>/<project_id>/data
home_dir = os.path.expanduser("~")
project_dir = os.path.join(home_dir, project_id)
data_dir = os.path.join(home_dir, project_id, 'data')

'/Users/rkforest/gta/data'

In [8]:
# Raw downloads go to the earthpy downloads folder for the 'ea' project and
# to a 'raw-data' folder inside this project's data directory otherwise.
if project_id == 'ea':
    raw_data_dir = os.path.join(home_dir, 'earth-analytics', 'data', 'earthpy-downloads')
else:
    # data_dir is already built from project_dir, so it must not be joined
    # with project_dir again: os.path.join drops earlier components when a
    # later one is absolute, and would duplicate them when it is relative.
    raw_data_dir = os.path.join(data_dir, 'raw-data')
raw_data_dir

'/Users/rkforest/gta/data/raw-data'

In [9]:
# Cache directory that pooch derives from the project id; passed to
# pooch.create below as the location of downloaded files.
pooch_cache_dir = pooch.os_cache(project_id)

PosixPath('/Users/rkforest/Library/Caches/gta')

In [10]:
def create_directory(directory_name):
    """Create ``directory_name`` if nothing exists at that path yet.

    Missing parent directories are created as well, so the function can be
    called with a nested path in a single step. A message is printed only
    after the directory has actually been created.

    Parameters
    ----------
    directory_name : str or os.PathLike
        Path of the directory to create.

    Returns
    -------
    None
    """
    if os.path.exists(directory_name):
        return
    # exist_ok guards against the directory appearing between the check and the call.
    os.makedirs(directory_name, exist_ok=True)
    print('Created new directory: ', directory_name)

In [11]:
# Parents first, so each directory's parent exists before it is created.
for required_dir in (project_dir, data_dir, raw_data_dir):
    create_directory(required_dir)

In [12]:
# Make the project directory the working directory, then display it to confirm.
os.chdir(project_dir)
os.getcwd()

'/Users/rkforest/gta'

<a id='dd-download'></a>
## download

[Return to Start of Notebook](#dd)  

### global temperature anomaly data

In [13]:
if project_id == 'gta':

    # GISTEMP v4 products: one gzipped netCDF file plus four CSV tables.
    file_names_gta = ["gistemp1200_GHCNv4_ERSSTv5.nc.gz",
                      "GLB.Ts+dSST.csv",
                      "NH.Ts+dSST.csv",
                      "SH.Ts+dSST.csv",
                      "ZonAnn.Ts+dSST.csv"]

    # A registry hash of None tells pooch to skip checksum verification.
    # Hashing the files already in the cache (as was done before) raises on a
    # fresh machine where nothing has been downloaded yet, and verifies a file
    # only against itself.
    hash_dict = {file_name: None for file_name in file_names_gta}

    # The CSV tables are resolved against base_url. The netCDF file needs a
    # full file URL of its own: an entry in `urls` must point at the file, not
    # at a directory.
    # NOTE(review): confirm this URL against the GISTEMP download page.
    gta_pooch = pooch.create(
        path=pooch_cache_dir,
        base_url="https://data.giss.nasa.gov/gistemp/tabledata_v4/",
        registry=hash_dict,
        urls={file_names_gta[0]: "https://data.giss.nasa.gov/pub/gistemp/" + file_names_gta[0]})

    gz_suffix = '.gz'
    for file_name in file_names_gta:
        if file_name.endswith(gz_suffix):
            # Decompress in the cache and save under the name without '.gz'.
            # Slicing removes exactly the suffix; str.rstrip('.gz') strips any
            # trailing run of the characters '.', 'g' and 'z' instead.
            cache_file_path = gta_pooch.fetch(file_name, processor=pooch.Decompress())
            save_file_path = os.path.join(raw_data_dir, file_name[:-len(gz_suffix)])
        else:
            cache_file_path = gta_pooch.fetch(file_name)
            save_file_path = os.path.join(raw_data_dir, file_name)

        # Copy into raw_data_dir only when the copy is missing or differs
        # from the cached file.
        if (not os.path.exists(save_file_path)) or\
           (pooch.file_hash(cache_file_path) != pooch.file_hash(save_file_path)):
            shutil.copy2(cache_file_path, save_file_path)


<a id='dd-paths'></a>
## paths

[Return to Start of Notebook](#dd)  

In [14]:
# Sorted list of every netCDF file found in the raw data directory.
nc_pattern = os.path.join(raw_data_dir, '*.nc*')
raw_data_paths_nc = glob.glob(nc_pattern)
raw_data_paths_nc.sort()
raw_data_paths_nc

['/Users/rkforest/gta/data/raw-data/gistemp1200_GHCNv4_ERSSTv5.nc']

In [15]:
# Sorted list of every '*dSST.csv' table found in the raw data directory.
csv_pattern = os.path.join(raw_data_dir, '*dSST.csv')
raw_data_paths_csv = glob.glob(csv_pattern)
raw_data_paths_csv.sort()
raw_data_paths_csv

['/Users/rkforest/gta/data/raw-data/GLB.Ts+dSST.csv',
 '/Users/rkforest/gta/data/raw-data/NH.Ts+dSST.csv',
 '/Users/rkforest/gta/data/raw-data/SH.Ts+dSST.csv',
 '/Users/rkforest/gta/data/raw-data/ZonAnn.Ts+dSST.csv']

<a id='dd-catalog'></a>
## catalog

[Return to Start of Notebook](#dd)  

In [16]:
# Catalog of the netCDF files: one row per file, listing the dimension names
# of its 'tempanomaly' variable.
files = []
dims = []
for path in raw_data_paths_nc:
    files.append(os.path.basename(path))
    # Only the dimension names are needed, so open the dataset in a context
    # manager and release the file handle as soon as they have been read.
    with xr.open_dataset(path) as ds:
        dims.append(ds['tempanomaly'].dims)

df = pd.DataFrame(index=files,
                  data={'dimensions': dims}
                 )

df.index.name = 'file'
df

Unnamed: 0_level_0,dimensions
file,Unnamed: 1_level_1
gistemp1200_GHCNv4_ERSSTv5.nc,"(time, lat, lon)"


In [17]:
# Catalog of the CSV tables: one row per file with its row and column counts.
files = []
rows = []
cols = []

for path in raw_data_paths_csv:
    file_name = os.path.basename(path)
    files.append(file_name)
    # The ZonAnn table is read from its first line; the other tables have one
    # leading line skipped (presumably a title row - confirm against the files).
    # Keying on the file name rather than on a position in the sorted list
    # keeps this correct if files are added to or removed from the directory.
    skiprows = 0 if file_name.startswith('ZonAnn') else 1
    table = pd.read_csv(path, skiprows=skiprows)
    rows.append(table.shape[0])
    cols.append(table.shape[1])

df = pd.DataFrame(index=files,
                  data={'rows': rows, 'cols': cols}
                 )
df.index.name = 'file'
df

Unnamed: 0_level_0,rows,cols
file,Unnamed: 1_level_1,Unnamed: 2_level_1
GLB.Ts+dSST.csv,144,19
NH.Ts+dSST.csv,144,19
SH.Ts+dSST.csv,144,19
ZonAnn.Ts+dSST.csv,143,15
