<a id='dd'></a>
# data download

[libraries](#dd-libraries)  
[parameters](#dd-parameters)  
[directories](#dd-directories)  
[download](#dd-download)  
[catalog](#dd-catalog)  
[paths](#dd-paths)  
[read](#dd-read)   

<a id='dd-libraries'></a>
## libraries

[Return to Start of Notebook](#dd)  

In [1]:
import os
import glob
import gzip
import pandas as pd
import xarray as xr

In [2]:
import warnings 
from functools import partial 
from IPython.core.interactiveshell import InteractiveShell 

In [3]:
from data_download import download_csv
from data_download import download_nc
from data_download import directory_catalog
from data_download import read_raw_data

<a id='dd-parameters'></a>
## parameters

[Return to Start of Notebook](#dd)  

In [4]:
# suppress all warnings for the rest of the session
warnings.filterwarnings(action='ignore')

In [5]:
# shorter traceback error messages: pre-bind exception_only=True on the
# running shell's showtraceback so each call gets it by default
_ipython_shell = get_ipython()
_ipython_shell.showtraceback = partial(_ipython_shell.showtraceback, exception_only=True)

In [6]:
# shows result of cell without needing print:
# with "last_expr_or_assign" a cell whose last statement is a bare expression
# or an assignment to a plain name has that value echoed
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

<a id='dd-directories'></a>
## directories

[Return to Start of Notebook](#dd)  

In [7]:
# Layout: ~/<project_id>/project/data/raw-data, each level derived from its parent.
project_id = 'kr'
home_dir = os.path.expanduser('~')
project_dir = os.path.join(home_dir, project_id, 'project')
data_dir = os.path.join(project_dir, 'data')
# kept as the last statement so the cell echoes the resolved raw-data path
raw_data_dir = os.path.join(data_dir, 'raw-data')

'/Users/rkforest/kr/project/data/raw-data'

In [8]:
def create_directory(directory_name):
    """Create ``directory_name`` unless something already exists at that path.

    Missing parent directories are created as well, so the result does not
    depend on the order in which nested directories are requested.

    Parameters
    ----------
    directory_name : str or path-like
        Directory to create.

    Returns
    -------
    None
        A message is printed only when a directory was actually created.

    Raises
    ------
    OSError
        If the directory cannot be created (for example, permission denied).
    """
    if os.path.exists(directory_name):
        return
    # exist_ok=True covers the gap between the check above and the creation:
    # the directory appearing in between is not treated as an error.
    os.makedirs(directory_name, exist_ok=True)
    # Report after the fact so the message is never printed for a failed creation.
    print('Created new directory: ', directory_name)

In [9]:
# parents first: project -> data -> raw-data
for directory in (project_dir, data_dir, raw_data_dir):
    create_directory(directory)

In [10]:
# make the project directory the current working directory
os.chdir(project_dir)
# trailing expression: echoes the working directory to confirm the switch
os.getcwd()

'/Users/rkforest/kr/project'

<a id='dd-download'></a>
## download 

[Return to Start of Notebook](#dd)  

#### download csv

In [11]:
# one download per table prefix, in the same order as before
for table_prefix in ('GLB', 'NH', 'SH', 'ZonAnn'):
    download_csv(raw_data_dir, table_prefix)

current file is latest 2023-04-12
current file is latest 2023-04-12
current file is latest 2023-04-12
current file is latest 2023-04-12


#### download netcdf

In [12]:
# download_nc is imported from the local data_download module and is given the
# raw-data directory; presumably it fetches the gridded netCDF file there -- confirm in data_download
download_nc(raw_data_dir)

current file is latest 2023-04-12


<a id='dd-catalog'></a>
## catalog

[Return to Start of Notebook](#dd)  

In [13]:
# catalog of the csv files in raw-data; the assignment is echoed as the cell output
# NOTE(review): `df` is reassigned by the nc catalog cell further down
df = directory_catalog(raw_data_dir, 'csv')

Unnamed: 0,directory,file,"rows, cols"
0,raw-data,GLB.Ts+dSST.csv,"(144, 19)"
1,raw-data,NH.Ts+dSST.csv,"(144, 19)"
2,raw-data,SH.Ts+dSST.csv,"(144, 19)"
3,raw-data,ZonAnn.Ts+dSST.csv,"(142, 15)"


In [14]:
# catalog of the nc files in raw-data; the assignment is echoed as the cell output
# NOTE(review): this overwrites the `df` produced by the csv catalog cell
df = directory_catalog(raw_data_dir, 'nc')

Unnamed: 0,directory,file,dimensions
0,raw-data,gistemp1200_GHCNv4_ERSSTv5.nc,"(time, lat, lon)"


<a id='dd-paths'></a>
## paths

[Return to Start of Notebook](#dd)  

In [15]:
# csv files in raw-data, sorted so the listing order is stable between runs
file_paths_csv = sorted(glob.glob(os.path.join(raw_data_dir, '*.csv')))
# plain loop: printing is a side effect, so no throwaway list is built
for path in file_paths_csv:
    print(path)

/Users/rkforest/kr/project/data/raw-data/GLB.Ts+dSST.csv
/Users/rkforest/kr/project/data/raw-data/NH.Ts+dSST.csv
/Users/rkforest/kr/project/data/raw-data/SH.Ts+dSST.csv
/Users/rkforest/kr/project/data/raw-data/ZonAnn.Ts+dSST.csv


In [16]:
# netCDF files in raw-data, sorted so the listing order is stable between runs
file_paths_nc = sorted(glob.glob(os.path.join(raw_data_dir, '*.nc')))
# plain loop: printing is a side effect, so no throwaway list is built
for path in file_paths_nc:
    print(path)

/Users/rkforest/kr/project/data/raw-data/gistemp1200_GHCNv4_ERSSTv5.nc


<a id='dd-read'></a>
## read

[Return to Start of Notebook](#dd)  

#### csv

In [17]:
# Look each table up by its file-name prefix (the part before the first '.')
# instead of by list position: an extra or missing csv in raw-data then raises
# KeyError rather than silently loading the wrong file.
csv_path_by_prefix = {os.path.basename(path).split('.')[0]: path for path in file_paths_csv}

# GLB / NH / SH are read with skiprows=1; ZonAnn is read without it.
# Trailing semicolons keep the assignments from being echoed.
dfg = read_raw_data(csv_path_by_prefix['GLB'], skiprows=1);
dfn = read_raw_data(csv_path_by_prefix['NH'], skiprows=1);
dfs = read_raw_data(csv_path_by_prefix['SH'], skiprows=1);
dfz = read_raw_data(csv_path_by_prefix['ZonAnn']);

In [18]:
# first row of dfg (read from the first csv path, GLB.Ts+dSST.csv in the listing above)
dfg.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,-0.18,-0.24,-0.08,-0.16,-0.1,-0.21,-0.18,-0.1,-0.14,-0.23,-0.21,-0.17,-0.17,***,***,-0.11,-0.16,-0.2


In [19]:
# first row of dfn (read from the second csv path, NH.Ts+dSST.csv in the listing above)
dfn.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,-0.36,-0.51,-0.23,-0.3,-0.06,-0.16,-0.19,-0.27,-0.23,-0.32,-0.43,-0.4,-0.29,***,***,-0.2,-0.21,-0.33


In [20]:
# first row of dfs (read from the third csv path, SH.Ts+dSST.csv in the listing above)
dfs.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,0.01,0.04,0.06,-0.01,-0.12,-0.25,-0.17,0.07,-0.04,-0.14,0.0,0.06,-0.04,***,***,-0.02,-0.11,-0.06


In [21]:
# first row of dfz (read from the fourth csv path, ZonAnn.Ts+dSST.csv in the listing above)
dfz.head(1)

Unnamed: 0,Year,Glob,NHem,SHem,24N-90N,24S-24N,90S-24S,64N-90N,44N-64N,24N-44N,EQU-24N,24S-EQU,44S-24S,64S-44S,90S-64S
0,1880,-0.17,-0.29,-0.04,-0.37,-0.12,-0.02,-0.8,-0.48,-0.29,-0.16,-0.09,-0.04,0.05,0.66


#### netcdf

In [23]:
# Open the gzip stream in a `with` block so it is always closed.
# xr.load_dataset reads the whole dataset into memory before returning, so
# `ds` stays usable after the stream is closed (open_dataset would leave it
# lazily tied to the open handle).
with gzip.open(file_paths_nc[0], 'rb') as gz:
    ds = xr.load_dataset(gz)
# trailing expression so the dataset summary is still echoed by the cell
ds