<a id='pd'></a>  
# Pandas DataFrames

[Import](#pd-import)  
[Restructure](#pd-restructure)  
[Export](#pd-export)  

### libraries

In [1]:
import os
import calendar
import warnings
import requests
import pandas as pd 
from pandas.api.types import CategoricalDtype
#pd.__version__

In [2]:
# shorter traceback error messages
# Rebinds the shell's showtraceback to a partial that always passes
# exception_only=True.
# NOTE(review): re-running this cell wraps the already-wrapped callable
# again -- presumably harmless, but confirm.
from functools import partial
get_ipython().showtraceback = partial(get_ipython().showtraceback,exception_only=True)

### directories

In [3]:
# Directory layout, all rooted in the user's home directory:
#   work_dir -> ~/kr/df        (becomes the working directory)
#   data_dir -> ~/kr/df/data   (downloaded source files)
#   save_dir -> ~/kr/df/save   (CSV files written for the database import)
home_dir = os.path.expanduser("~")
work_dir = os.path.join(home_dir, 'kr', 'df')
# Derive the sub-directories from work_dir so the layout is defined in one place.
data_dir = os.path.join(work_dir, 'data')
save_dir = os.path.join(work_dir, 'save')

In [4]:
def create_directory(directory_name):
    """Create ``directory_name`` if it does not already exist.

    Missing parent directories are created as well, so a nested path such
    as ``~/kr/df`` works even when ``~/kr`` is absent. Calling the function
    for a directory that already exists is a no-op.

    Parameters
    ----------
    directory_name : str or path-like
        Path of the directory to create.

    Raises
    ------
    FileExistsError
        If the path exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists() + os.mkdir() and also creates intermediate directories.
    os.makedirs(directory_name, exist_ok=True)

In [5]:
# Make sure the working, data and save directories exist.
# work_dir comes first because the other two live inside it.
for directory in (work_dir, data_dir, save_dir):
    create_directory(directory)

In [6]:

# Make work_dir the current working directory; the trailing expression
# shows the resulting cwd as the cell output.
os.chdir(work_dir)
os.getcwd()

'/Users/rkforest/kr/df'

### parameters

In [7]:
# Silence every warning for the remainder of the session.
# NOTE(review): this also hides deprecation notices; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [8]:
# shows result of cell without needing print
# With "last_expr_or_assign" a cell that ends in a plain assignment also
# echoes the assigned value; presumably this is why some later cells end an
# assignment with ';' to suppress that output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

### constants

<a id='pd-import'></a>
## Import

[Return to Start of Notebook](#pd)  

- skiprows  
- sep  
- usecols  
- parse_dates  
- index_col (column name or zero-based integer position)
- na_values  
- nrows  

### url

In [9]:
# Base URL for the downloads; each file name below is appended to it.
giss_url = "https://data.giss.nasa.gov/gistemp/tabledata_v4/"

'https://data.giss.nasa.gov/gistemp/tabledata_v4/'

### global monthly

In [10]:
# Global file: derive the file name, its download URL and its local path.
file_id = "GLB"
file_name = f"{file_id}.Ts+dSST.csv"
file_url = f"{giss_url}{file_name}"
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/GLB.Ts+dSST.csv'

In [11]:
# NOTE(review): recomputes file_path from the same data_dir and file_name as
# the previous cell, so the value is unchanged; this cell looks redundant.
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/GLB.Ts+dSST.csv'

In [12]:
# download file_url and save to file_path
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail loudly instead of saving an HTTP error page as CSV
# The context manager guarantees the file is flushed and closed.
with open(file_path, 'wb') as f:
    bytes_written = f.write(r.content)
bytes_written  # cell output: number of bytes written

12594

In [13]:
# First pass: read every column so the header can be inspected.
# skiprows=1 drops the file's first line; the next line supplies the column names.
dfg = pd.read_csv(file_path, skiprows=1) # initial read for columns
dfg.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,-0.18,-0.24,-0.09,-0.16,-0.1,-0.21,-0.18,-0.1,-0.15,-0.23,-0.22,-0.17,-0.17,***,***,-0.11,-0.16,-0.2


In [14]:
# Second pass: keep only the first 13 columns (Year plus Jan..Dec).
cols_to_import = dfg.columns[:13]
dfg = (
    pd.read_csv(
        file_path,
        skiprows=1,              # same header offset as the initial read
        sep=',',
        usecols=cols_to_import,
        parse_dates=['Year'],    # the year is parsed into a datetime column
        na_values=['***'],       # '***' marks a missing value in the file
        nrows=150,               # read at most 150 data rows
    )
    .rename(columns={'Year': 'period'})
    .assign(zone=1)              # will become Global in DB
)
dfg.head(1)

Unnamed: 0,period,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,zone
0,1880-01-01,-0.18,-0.24,-0.09,-0.16,-0.1,-0.21,-0.18,-0.1,-0.15,-0.23,-0.22,-0.17,1


### northern hemisphere monthly

In [15]:
# Northern-hemisphere file: derive the file name, its download URL and its local path.
file_id = "NH"
file_name = f"{file_id}.Ts+dSST.csv"
file_url = f"{giss_url}{file_name}"
file_path = os.path.join(data_dir, file_name)
file_path


'/Users/rkforest/kr/df/data/NH.Ts+dSST.csv'

In [16]:
# download file_url and save to file_path
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail loudly instead of saving an HTTP error page as CSV
# The context manager guarantees the file is flushed and closed.
with open(file_path, 'wb') as f:
    bytes_written = f.write(r.content)
bytes_written  # cell output: number of bytes written

12633

In [17]:
# First pass: read every column so the header can be inspected.
# skiprows=1 drops the file's first line; the next line supplies the column names.
dfn = pd.read_csv(file_path, skiprows=1) # initial read for columns
dfn.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,-0.35,-0.51,-0.22,-0.29,-0.06,-0.16,-0.18,-0.26,-0.23,-0.32,-0.43,-0.4,-0.28,***,***,-0.19,-0.2,-0.33


In [18]:
# Second pass: keep only the first 13 columns (Year plus Jan..Dec).
cols_to_import = dfn.columns[:13]
dfn = (
    pd.read_csv(
        file_path,
        skiprows=1,              # same header offset as the initial read
        sep=',',
        usecols=cols_to_import,
        parse_dates=['Year'],    # the year is parsed into a datetime column
        na_values=['***'],       # '***' marks a missing value in the file
        nrows=150,               # read at most 150 data rows
    )
    .rename(columns={'Year': 'period'})
    .assign(zone=2)              # will become NH in DB
)
dfn.head(1)

Unnamed: 0,period,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,zone
0,1880-01-01,-0.35,-0.51,-0.22,-0.29,-0.06,-0.16,-0.18,-0.26,-0.23,-0.32,-0.43,-0.4,2


### southern hemisphere monthly

In [19]:
# Southern-hemisphere file: derive the file name, its download URL and its local path.
file_id = "SH"
file_name = f"{file_id}.Ts+dSST.csv"
file_url = f"{giss_url}{file_name}"
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/SH.Ts+dSST.csv'

In [20]:
# download file_url and save to file_path
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail loudly instead of saving an HTTP error page as CSV
# The context manager guarantees the file is flushed and closed.
with open(file_path, 'wb') as f:
    bytes_written = f.write(r.content)
bytes_written  # cell output: number of bytes written

12630

In [21]:
# First pass: read every column so the header can be inspected.
# skiprows=1 drops the file's first line; the next line supplies the column names.
dfs = pd.read_csv(file_path, skiprows=1) # initial read for columns
dfs.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,0.0,0.03,0.05,-0.02,-0.13,-0.25,-0.18,0.06,-0.05,-0.15,-0.01,0.05,-0.05,***,***,-0.03,-0.12,-0.07


In [22]:

# Second pass: keep only the first 13 columns (Year plus Jan..Dec).
cols_to_import = dfs.columns[:13]
dfs = (
    pd.read_csv(
        file_path,
        skiprows=1,              # same header offset as the initial read
        sep=',',
        usecols=cols_to_import,
        parse_dates=['Year'],    # the year is parsed into a datetime column
        na_values=['***'],       # '***' marks a missing value in the file
        nrows=150,               # read at most 150 data rows
    )
    .rename(columns={'Year': 'period'})
    .assign(zone=3)              # will become SH in DB
)
dfs.head(1)

Unnamed: 0,period,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,zone
0,1880-01-01,0.0,0.03,0.05,-0.02,-0.13,-0.25,-0.18,0.06,-0.05,-0.15,-0.01,0.05,3


### zonal annual

In [23]:
# Zonal annual file: the name is given in full here (no file_id prefix is used).
file_name = 'ZonAnn.Ts+dSST.csv'
file_url = giss_url + file_name
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/ZonAnn.Ts+dSST.csv'

In [24]:
# download file_url and save to file_path
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail loudly instead of saving an HTTP error page as CSV
# The context manager guarantees the file is flushed and closed.
with open(file_path, 'wb') as f:
    bytes_written = f.write(r.content)
bytes_written  # cell output: number of bytes written

9986

In [25]:
# First pass over the zonal file: no rows are skipped (the header is the
# first line), so read_csv's defaults are enough to inspect the columns.
dfz = pd.read_csv(file_path)
dfz.head(1)

Unnamed: 0,Year,Glob,NHem,SHem,24N-90N,24S-24N,90S-24S,64N-90N,44N-64N,24N-44N,EQU-24N,24S-EQU,44S-24S,64S-44S,90S-64S
0,1880,-0.17,-0.28,-0.05,-0.37,-0.13,-0.02,-0.81,-0.46,-0.28,-0.16,-0.11,-0.05,0.05,0.66


In [26]:
# Keep the year plus the eight narrow latitude bands; the Glob/NHem/SHem
# columns and the three wider bands are not imported.
cols_to_import = ['Year']
cols_to_import += ['EQU-24N', '24N-44N', '44N-64N', '64N-90N']  # northern bands
cols_to_import += ['24S-EQU', '44S-24S', '64S-44S', '90S-64S']  # southern bands
dfz = pd.read_csv(file_path, usecols=cols_to_import)
dfz.head(1)

Unnamed: 0,Year,64N-90N,44N-64N,24N-44N,EQU-24N,24S-EQU,44S-24S,64S-44S,90S-64S
0,1880,-0.81,-0.46,-0.28,-0.16,-0.11,-0.05,0.05,0.66


<a id='pd-restructure'></a>
## Restructure

[Return to Start of Notebook](#pd)  

### .melt

In [27]:
# Peek at the first two rows of the wide global frame before melting.
dfg.head(2)

Unnamed: 0,period,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,zone
0,1880-01-01,-0.18,-0.24,-0.09,-0.16,-0.1,-0.21,-0.18,-0.1,-0.15,-0.23,-0.22,-0.17,1
1,1881-01-01,-0.19,-0.14,0.03,0.05,0.06,-0.18,0.0,-0.03,-0.15,-0.22,-0.18,-0.07,1


In [28]:
# Reshape wide -> long: one row per (zone, period, month column).
# var_name is documented as a scalar; a list-like value is rejected by newer
# pandas for non-MultiIndex columns, so pass the plain string.
dfgt = pd.melt(dfg,
               id_vars=['zone', 'period'],
               var_name='mthcode',       # holds the former column names (Jan..Dec)
               value_name='deviation')   # holds the former cell values
dfgt.head(2)

Unnamed: 0,zone,period,mthcode,deviation
0,1,1880-01-01,Jan,-0.18
1,1,1881-01-01,Jan,-0.19


In [29]:
# Reshape wide -> long: one row per (zone, period, month column).
# var_name is documented as a scalar; a list-like value is rejected by newer
# pandas for non-MultiIndex columns, so pass the plain string.
dfnt = pd.melt(dfn,
               id_vars=['zone', 'period'],
               var_name='mthcode',       # holds the former column names (Jan..Dec)
               value_name='deviation')   # holds the former cell values
dfnt.head(1)

Unnamed: 0,zone,period,mthcode,deviation
0,2,1880-01-01,Jan,-0.35


In [30]:
# Reshape wide -> long: one row per (zone, period, month column).
# var_name is documented as a scalar; a list-like value is rejected by newer
# pandas for non-MultiIndex columns, so pass the plain string.
dfst = pd.melt(dfs,
               id_vars=['zone', 'period'],
               var_name='mthcode',       # holds the former column names (Jan..Dec)
               value_name='deviation')   # holds the former cell values
dfst.head(1)

Unnamed: 0,zone,period,mthcode,deviation
0,3,1880-01-01,Jan,0.0


In [31]:
# Reshape the zonal frame wide -> long: one row per (Year, latitude band).
# var_name is documented as a scalar; a list-like value is rejected by newer
# pandas for non-MultiIndex columns, so pass the plain string.
dfzt = pd.melt(dfz,
               id_vars=['Year'],
               var_name='zone',          # holds the former column names (latitude bands)
               value_name='deviation')   # holds the former cell values
dfzt.head(1)

Unnamed: 0,Year,zone,deviation
0,1880,64N-90N,-0.81


### create date with pd.to_datetime
- using year and month strings 

In [32]:
# Rebuild `period` as a month-level date for the global frame.
# Step 1: four-digit year text taken from the current (year-only) period.
dfgt['year_str'] = dfgt['period'].dt.strftime('%Y')
# Step 2: two-digit month text parsed from the abbreviated month name.
dfgt['month_str'] = (
    pd.to_datetime(dfgt['mthcode'], format='%b')
    .dt.strftime('%m')
)
# Step 3: concatenate to 'YYYYMM' and parse back into a datetime.
dfgt['period'] = pd.to_datetime(
    dfgt['year_str'] + dfgt['month_str'],
    format='%Y%m',
)

In [33]:
# Show the last two rows, including the helper columns year_str and month_str.
dfgt.tail(2)

Unnamed: 0,zone,period,mthcode,deviation,year_str,month_str
1726,1,2022-12-01,Dec,0.81,2022,12
1727,1,2023-12-01,Dec,,2023,12


In [34]:
# Rebuild `period` as a month-level date for the northern-hemisphere frame.
# Step 1: four-digit year text taken from the current (year-only) period.
dfnt['year_str'] = dfnt['period'].dt.strftime('%Y')
# Step 2: two-digit month text parsed from the abbreviated month name.
dfnt['month_str'] = (
    pd.to_datetime(dfnt['mthcode'], format='%b')
    .dt.strftime('%m')
)
# Step 3: concatenate to 'YYYYMM' and parse back into a datetime.
dfnt['period'] = pd.to_datetime(
    dfnt['year_str'] + dfnt['month_str'],
    format='%Y%m',
)

In [35]:
# Show the last two rows, including the helper columns year_str and month_str.
dfnt.tail(2)

Unnamed: 0,zone,period,mthcode,deviation,year_str,month_str
1726,2,2022-12-01,Dec,1.09,2022,12
1727,2,2023-12-01,Dec,,2023,12


In [36]:
# Rebuild `period` as a month-level date for the southern-hemisphere frame.
# Step 1: four-digit year text taken from the current (year-only) period.
dfst['year_str'] = dfst['period'].dt.strftime('%Y')
# Step 2: two-digit month text parsed from the abbreviated month name.
dfst['month_str'] = (
    pd.to_datetime(dfst['mthcode'], format='%b')
    .dt.strftime('%m')
)
# Step 3: concatenate to 'YYYYMM' and parse back into a datetime.
dfst['period'] = pd.to_datetime(
    dfst['year_str'] + dfst['month_str'],
    format='%Y%m',
)

In [37]:
# Show the last two rows, including the helper columns year_str and month_str.
dfst.tail(2)

Unnamed: 0,zone,period,mthcode,deviation,year_str,month_str
1726,3,2022-12-01,Dec,0.52,2022,12
1727,3,2023-12-01,Dec,,2023,12


### drop unneeded columns

In [38]:
# The month code and the two text helpers were only needed to build `period`;
# remove them from all three long frames (in place).
cols = ['mthcode', 'year_str', 'month_str']
for frame in (dfgt, dfnt, dfst):
    frame.drop(columns=cols, inplace=True)

### extract year and month from date using date accessor

In [39]:
# Add an integer `year` column taken from the `period` date, for each frame.
for frame in (dfgt, dfnt, dfst):
    frame['year'] = frame['period'].dt.year

In [40]:
# Add an integer `month` column taken from the `period` date, for each frame.
for frame in (dfgt, dfnt, dfst):
    frame['month'] = frame['period'].dt.month

### add scale identifier for db

In [41]:
# Every row gets scale 1, which will become 'celsius' in db.
for frame in (dfgt, dfnt, dfst):
    frame['scale'] = 1

### reorder columns

In [42]:
# Put the columns into the order used for the exported file.
column_order = ['period', 'zone', 'scale', 'year', 'month', 'deviation']
dfgt = dfgt.reindex(columns=column_order)
dfgt.columns

Index(['period', 'zone', 'scale', 'year', 'month', 'deviation'], dtype='object')

In [43]:
# Check the last two rows after reordering the columns.
dfgt.tail(2)

Unnamed: 0,period,zone,scale,year,month,deviation
1726,2022-12-01,1,1,2022,12,0.81
1727,2023-12-01,1,1,2023,12,


In [44]:
# Put the columns into the order used for the exported file.
column_order = ['period', 'zone', 'scale', 'year', 'month', 'deviation']
dfnt = dfnt.reindex(columns=column_order)
dfnt.columns

Index(['period', 'zone', 'scale', 'year', 'month', 'deviation'], dtype='object')

In [45]:
# Check the last two rows after reordering the columns.
dfnt.tail(2)

Unnamed: 0,period,zone,scale,year,month,deviation
1726,2022-12-01,2,1,2022,12,1.09
1727,2023-12-01,2,1,2023,12,


In [46]:
# Put the columns into the order used for the exported file.
column_order = ['period', 'zone', 'scale', 'year', 'month', 'deviation']
dfst = dfst.reindex(columns=column_order)
dfst.columns

Index(['period', 'zone', 'scale', 'year', 'month', 'deviation'], dtype='object')

In [47]:
# Check the last two rows after reordering the columns.
dfst.tail(2)

Unnamed: 0,period,zone,scale,year,month,deviation
1726,2022-12-01,3,1,2022,12,0.52
1727,2023-12-01,3,1,2023,12,


### sort rows

In [48]:
# Order the rows chronologically; the trailing ';' keeps the cell silent.
dfgt = dfgt.sort_values(by='period');

In [49]:
# First two rows in period order; the index still carries the pre-sort labels.
dfgt.head(2)

Unnamed: 0,period,zone,scale,year,month,deviation
0,1880-01-01,1,1,1880,1,-0.18
144,1880-02-01,1,1,1880,2,-0.24


In [50]:
# Order the rows chronologically; the trailing ';' keeps the cell silent.
dfnt = dfnt.sort_values(by='period');

In [51]:
# First two rows in period order; the index still carries the pre-sort labels.
dfnt.head(2)

Unnamed: 0,period,zone,scale,year,month,deviation
0,1880-01-01,2,1,1880,1,-0.35
144,1880-02-01,2,1,1880,2,-0.51


In [52]:
# Order the rows chronologically; the trailing ';' keeps the cell silent.
dfst = dfst.sort_values(by='period');

In [53]:
# First two rows in period order; the index still carries the pre-sort labels.
dfst.head(2)

Unnamed: 0,period,zone,scale,year,month,deviation
0,1880-01-01,3,1,1880,1,0.0
144,1880-02-01,3,1,1880,2,0.03


### reset index

In [54]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# labels instead of keeping them as a column. The ';' suppresses cell output.
dfgt = dfgt.reset_index(drop=True);

In [55]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# labels instead of keeping them as a column. The ';' suppresses cell output.
dfnt = dfnt.reset_index(drop=True);

In [56]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# labels instead of keeping them as a column. The ';' suppresses cell output.
dfst = dfst.reset_index(drop=True);

In [57]:
# Verify that the index is sequential again after reset_index.
dfgt.head(2)

Unnamed: 0,period,zone,scale,year,month,deviation
0,1880-01-01,1,1,1880,1,-0.18
1,1880-02-01,1,1,1880,2,-0.24


<a id='pd-export'></a>
## Export

[Return to Start of Notebook](#pd)   

### save df as csv for importing to database

In [58]:
# Write the global frame as a comma-separated file with a header row.
# The index is left out and missing deviations are written as the text NULL.
save_path = os.path.join(save_dir, 'glb_temperature_deviations.csv')
dfgt.to_csv(save_path, index=False, na_rep='NULL')

In [59]:
# Write the northern-hemisphere frame as a comma-separated file with a header row.
# The index is left out and missing deviations are written as the text NULL.
save_path = os.path.join(save_dir, 'nh_temperature_deviations.csv')
dfnt.to_csv(save_path, index=False, na_rep='NULL')

In [60]:
# Write the southern-hemisphere frame as a comma-separated file with a header row.
# The index is left out and missing deviations are written as the text NULL.
save_path = os.path.join(save_dir, 'sh_temperature_deviations.csv')
dfst.to_csv(save_path, index=False, na_rep='NULL')