<a id='dt'></a>
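# data transformation

Reads the raw anomaly CSV files (global, northern-hemisphere, southern-hemisphere and zonal), reshapes them into long format, and exports the results as CSV files to the `transformed-data` directory.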

[libraries](#dt-libraries)  
[parameters](#dt-parameters)  
[directories](#dt-directories)  
[paths](#dt-paths)  
[read](#dt-read)  
[concatenate](#dt-concatenate)  
[reset index](#dt-resetindex)  
[reshape](#dt-reshape)  
[columns](#dt-columns)  
[export](#dt-export)  

<a id='dt-libraries'></a>
## libraries

[Return to Start of Notebook](#dt)  

In [1]:
import os
import glob
#import gzip
import pandas as pd
#import xarray as xr

In [2]:
import warnings # to ignore warnings
from functools import partial # for shorter traceback error messages
from IPython.core.interactiveshell import InteractiveShell # to automatically show cell output

In [3]:
#from data_transformation import read_csv
#from data_transformation import reshape_raw_data

<a id='dt-parameters'></a>
## parameters

[Return to Start of Notebook](#dt)  

In [4]:
# Install a global 'ignore' filter: every warning raised after this cell is silenced.
# NOTE(review): this also hides library deprecation warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

In [5]:
# Shorter error messages: rebind the shell's showtraceback with exception_only=True
# pre-applied, so only the exception itself is reported instead of the full stack.
# get_ipython() is not imported here; it is supplied by the IPython kernel at runtime.
get_ipython().showtraceback = partial(get_ipython().showtraceback,exception_only=True)

In [6]:
# Display the value of a cell's final statement when it is a bare expression
# or an assignment, so results show up without an explicit print().
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

<a id='dt-directories'></a>
## directories

[Return to Start of Notebook](#dt)  

In [7]:
# Project layout, rooted in the current user's home directory:
#   ~/<project_id>/project/data/raw-data          input files
#   ~/<project_id>/project/data/transformed-data  files written by this notebook
project_id = 'kr'
home_dir = os.path.expanduser("~")
project_dir = os.path.join(home_dir, project_id,  'project')
data_dir = os.path.join(project_dir, 'data')
raw_data_dir = os.path.join(data_dir, 'raw-data')
# Last assignment in the cell, so this is the value the cell displays.
transformed_data_dir = os.path.join(data_dir, 'transformed-data')

'/Users/rkforest/kr/project/data/transformed-data'

In [8]:
def create_directory(directory_name):
    """Create ``directory_name`` if nothing exists at that path yet.

    Missing parent directories are created as well. When the path already
    exists the function does nothing and prints nothing.

    Parameters
    ----------
    directory_name : str
        Path of the directory to create.

    Returns
    -------
    None
    """
    if os.path.exists(directory_name):
        return
    # makedirs also builds missing parents; exist_ok=True avoids a failure if
    # the directory appears between the check above and this call.
    os.makedirs(directory_name, exist_ok=True)
    # Report only after the directory has actually been created.
    print('Created new directory: ', directory_name)

In [9]:
# Make sure the output directory exists; the export cells write their CSV files into it.
create_directory(transformed_data_dir)

In [10]:
# Switch the working directory to the project root; the trailing getcwd()
# expression displays the new working directory as the cell output.
os.chdir(project_dir)
os.getcwd()

'/Users/rkforest/kr/project'

<a id='dt-paths'></a>
## paths

[Return to Start of Notebook](#dt)  

In [11]:
# Every CSV file in the raw-data directory. glob returns paths in arbitrary
# order, so sort them: later cells pick files from this list by position.
file_paths_csv = sorted(glob.glob(os.path.join(raw_data_dir, '*.csv')))
# Plain loop instead of a list comprehension: only the printing is wanted,
# not a throwaway list of None values.
for path in file_paths_csv:
    print(path)

/Users/rkforest/kr/project/data/raw-data/GLB.Ts+dSST.csv
/Users/rkforest/kr/project/data/raw-data/NH.Ts+dSST.csv
/Users/rkforest/kr/project/data/raw-data/SH.Ts+dSST.csv
/Users/rkforest/kr/project/data/raw-data/ZonAnn.Ts+dSST.csv


In [12]:
# Every NetCDF (*.nc) file in the raw-data directory, sorted by path.
# NOTE(review): this list is not referenced again in the visible cells.
file_paths_nc = sorted(glob.glob(os.path.join(raw_data_dir, '*.nc')))
# Plain loop instead of a list comprehension: only the printing is wanted,
# not a throwaway list of None values.
for path in file_paths_nc:
    print(path)

/Users/rkforest/kr/project/data/raw-data/gistemp1200_GHCNv4_ERSSTv5.nc


<a id='dt-read'></a>
## read

[Return to Start of Notebook](#dt)  

In [13]:
def read_csv_all(file_id, file_path):
    """Load one anomaly CSV file and tag every row with an area label.

    The first line of the file is skipped, so the second line is used as the
    header. The column at position 14 is left out; every other column is
    imported. ``'***'`` entries are read as missing values and the ``Year``
    column is parsed as a date.

    Parameters
    ----------
    file_id : str
        Label stored in the added ``area`` column (e.g. ``'GLB'``).
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The imported columns in file order, followed by ``area``.
    """
    # Header-only read (nrows=0): only the column names are needed here, so
    # there is no point in parsing the whole file twice.
    header_cols = list(pd.read_csv(file_path, skiprows=1, nrows=0).columns)

    # Positions 0-13: Year, the twelve months and the column right after Dec.
    cols_to_import = header_cols[:14]
    # Position 14 is skipped (presumably the D-N column of the source layout —
    # TODO confirm); everything from position 15 on (the seasons) is kept.
    cols_to_import.extend(header_cols[15:])

    df = pd.read_csv(
        file_path,
        usecols=cols_to_import,
        parse_dates=['Year'],
        skiprows=1,
        na_values=['***'],
    )

    df['area'] = file_id

    return df

In [14]:
# Positional lookups rely on the sorted order of file_paths_csv
# (0 = GLB, 1 = NH, 2 = SH in the listing printed in the paths section).
# NOTE(review): an additional CSV in raw-data would shift these indices — verify before re-running.
dfg = read_csv_all('GLB', file_paths_csv[0])
dfn = read_csv_all('NH', file_paths_csv[1])
dfs = read_csv_all('SH', file_paths_csv[2])
dfg.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,DJF,MAM,JJA,SON,area
0,1880-01-01,-0.18,-0.24,-0.08,-0.16,-0.1,-0.21,-0.18,-0.1,-0.14,-0.23,-0.21,-0.17,-0.17,,-0.11,-0.16,-0.2,GLB


In [15]:
# Zonal file (position 3 in the sorted path list), read with pandas defaults:
# no line is skipped and Year is not parsed as a date.
# NOTE(review): unlike read_csv_all, no na_values=['***'] is passed here —
# confirm the zonal file contains no '***' placeholders.
dfz = pd.read_csv(file_paths_csv[3])
dfz.head(1)

Unnamed: 0,Year,Glob,NHem,SHem,24N-90N,24S-24N,90S-24S,64N-90N,44N-64N,24N-44N,EQU-24N,24S-EQU,44S-24S,64S-44S,90S-64S
0,1880,-0.17,-0.29,-0.04,-0.37,-0.12,-0.02,-0.8,-0.48,-0.29,-0.16,-0.09,-0.04,0.05,0.66


<a id='dt-concatenate'></a>
## concatenate

[Return to Start of Notebook](#dt)  

In [16]:
# Stack the GLB, NH and SH frames row-wise into a single frame.
# concat keeps each frame's own row labels, so index values repeat in the result.
dfg_concat = pd.concat([dfg,dfn,dfs])
dfg_concat.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,DJF,MAM,JJA,SON,area
0,1880-01-01,-0.18,-0.24,-0.08,-0.16,-0.1,-0.21,-0.18,-0.1,-0.14,-0.23,-0.21,-0.17,-0.17,,-0.11,-0.16,-0.2,GLB


<a id='dt-resetindex'></a>
## reset index

[Return to Start of Notebook](#dt)  

In [17]:
# Replace the row labels with a fresh 0..n-1 index; drop=True discards the
# old labels instead of keeping them as a column.
dfg_concat = dfg_concat.reset_index(drop=True)
dfg_concat.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,DJF,MAM,JJA,SON,area
0,1880-01-01,-0.18,-0.24,-0.08,-0.16,-0.1,-0.21,-0.18,-0.1,-0.14,-0.23,-0.21,-0.17,-0.17,,-0.11,-0.16,-0.2,GLB


<a id='dt-reshape'></a>
## reshape

[Return to Start of Notebook](#dt)  

In [18]:
# Wide -> long: 'area' and 'Year' identify each row; every other column
# becomes one (period, anomaly) pair.
dfg_transform = dfg_concat.melt(
    id_vars=['area', 'Year'],
    var_name='period',
    value_name='anomaly',
)
dfg_transform.head(1)

Unnamed: 0,area,Year,period,anomaly
0,GLB,1880-01-01,Jan,-0.18


In [19]:
# Wide -> long: 'Year' identifies each row; every other column name goes into
# 'area' and its value into 'anomaly'.
dfz_transform = dfz.melt(
    id_vars=['Year'],
    var_name='area',
    value_name='anomaly',
)
dfz_transform.head(1)

Unnamed: 0,Year,area,anomaly
0,1880,Glob,-0.17


<a id='dt-columns'></a>
## columns

[Return to Start of Notebook](#dt)  

#### add

In [20]:
# 'Year' holds datetimes (parse_dates in read_csv_all); store just the
# calendar year in a new lowercase 'year' column.
# NOTE(review): this cell cannot be re-run once the 'Year' column has been dropped.
dfg_transform['year'] = dfg_transform['Year'].dt.year   

dfg_transform.head(1)

Unnamed: 0,area,Year,period,anomaly,year
0,GLB,1880-01-01,Jan,-0.18,1880


#### drop

In [21]:
# Remove the datetime 'Year' column. Reassigning (rather than inplace=True)
# and ignoring an already-missing column keeps this cell safe to re-run.
dfg_transform = dfg_transform.drop(columns=['Year'], errors='ignore')
dfg_transform.head(1)

Unnamed: 0,area,period,anomaly,year
0,GLB,Jan,-0.18,1880


#### reorder

In [22]:
# Arrange the columns in the order used for the exported file.
dfg_transform = dfg_transform.reindex(
    columns=['year', 'area', 'period', 'anomaly'],
)
dfg_transform.head(1)

Unnamed: 0,year,area,period,anomaly
0,1880,GLB,Jan,-0.18


#### rename

In [23]:
# Old name -> new name; gives the zonal frame a lowercase 'year' column.
col_names = {'Year': 'year'}
dfz_transform = dfz_transform.rename(columns=col_names)
dfz_transform.head(1)

Unnamed: 0,year,area,anomaly
0,1880,Glob,-0.17


<a id='dt-export'></a>
## export

[Return to Start of Notebook](#dt)  

In [24]:
def export_file(file_id, df):
    """Write ``df`` as ``<file_id>.csv`` inside ``transformed_data_dir``.

    The file is comma-separated with a header row and no index column;
    missing values are written as the literal text ``NULL``.

    Parameters
    ----------
    file_id : str
        File name without the ``.csv`` extension.
    df : pandas.DataFrame
        Frame to export.
    """
    csv_name = file_id + '.csv'
    destination = os.path.join(transformed_data_dir, csv_name)
    df.to_csv(destination, sep=',', header=True, index=False, na_rep='NULL')

In [25]:
# Long-format GLB/NH/SH data -> global_transformed_data.csv
export_file('global_transformed_data',dfg_transform)

In [26]:
# Long-format zonal data -> zonal_transformed_data.csv
export_file('zonal_transformed_data',dfz_transform)