<a id='pd'></a>  
# Pandas DataFrames

[Import](#pd-import)  
[Restructure](#pd-restructure)  
[Export](#pd-export)  

### libraries

In [1]:
import os
import calendar
import warnings
import requests
import pandas as pd 
from pandas.api.types import CategoricalDtype
#pd.__version__

In [2]:
# Shorter traceback error messages: re-bind the shell's showtraceback so that
# every call is made with exception_only=True.
# get_ipython() is supplied by the IPython/Jupyter kernel, so this cell only runs there.
from functools import partial
get_ipython().showtraceback = partial(get_ipython().showtraceback,exception_only=True)

### directories

In [3]:
# All notebook files live under ~/kr/df; data/ and save/ are its sub-folders.
home_dir = os.path.expanduser("~")
work_dir = os.path.join(home_dir, 'kr', 'df')
data_dir = os.path.join(work_dir, 'data')
save_dir = os.path.join(work_dir, 'save')

In [4]:
def create_directory(directory_name):
    """Create ``directory_name`` if nothing exists at that path yet.

    Missing parent directories are created as well, so the call succeeds even
    when an intermediate folder does not exist.

    Parameters
    ----------
    directory_name : str or path-like
        Directory to create.

    Returns
    -------
    None
    """
    if not os.path.exists(directory_name):
        # os.mkdir raises FileNotFoundError when a parent folder is missing;
        # makedirs builds the whole chain. exist_ok=True tolerates the directory
        # appearing between the check above and this call.
        os.makedirs(directory_name, exist_ok=True)

In [5]:
# Make sure the working folder and its data/ and save/ sub-folders exist
# (work_dir first, since the other two are inside it).
for needed_dir in (work_dir, data_dir, save_dir):
    create_directory(needed_dir)

In [6]:

# Switch the process's current working directory to work_dir,
# then display it to confirm the change.
os.chdir(work_dir)
os.getcwd()

'/Users/rkforest/kr/df'

### parameters

In [7]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/parsing warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

In [8]:
# Show a cell's result without print(): with "last_expr_or_assign" the value of the
# cell's final expression, or of a final assignment to a name, is displayed.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

### constants

<a id='pd-import'></a>
## Import

[Return to Start of Notebook](#pd)  

- skiprows  
- sep  
- usecols  
- parse_dates  
- index_col (accepts a column name or the column's integer position)
- na_values  
- nrows  

### url

In [9]:
# Base URL of the GISTEMP v4 table files; each file name below is appended to it.
giss_url = "https://data.giss.nasa.gov/gistemp/tabledata_v4/"

'https://data.giss.nasa.gov/gistemp/tabledata_v4/'

### global monthly

In [10]:
# Global series: build the remote URL and the local download path from the file id.
file_id = "GLB"
file_name = f"{file_id}.Ts+dSST.csv"
file_url = f"{giss_url}{file_name}"
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/GLB.Ts+dSST.csv'

In [11]:
# NOTE(review): recomputes the same file_path as the previous cell; this cell looks redundant.
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/GLB.Ts+dSST.csv'

In [12]:
# Download file_url and save it to file_path.
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail here rather than saving an HTTP error page as a .csv
with open(file_path, 'wb') as f:  # the context manager closes the file handle
    bytes_written = f.write(r.content)
bytes_written  # cell result: number of bytes saved

12594

In [13]:
# First pass: read the whole file only to discover its column names.
# skiprows=1 skips the file's first line, so the following line supplies the header.
dfg = pd.read_csv(file_path, skiprows=1) # initial read for columns
dfg.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,-0.18,-0.24,-0.09,-0.16,-0.1,-0.21,-0.18,-0.1,-0.15,-0.23,-0.22,-0.17,-0.17,***,***,-0.11,-0.16,-0.2


In [14]:
# Second pass: keep only the first 13 columns found by the initial read
# (slice end is exclusive, so positions 0-12: 'Year' plus 'Jan'..'Dec').
cols_to_import = dfg.columns[:13]
dfg = (
    pd.read_csv(
        file_path,
        skiprows=1,
        sep=',',
        usecols=cols_to_import,
        parse_dates=['Year'],  # index_col=0 or index_col='Year' could also be passed here
        na_values=['***'],     # '***' marks a missing value in the file
        nrows=150,
    )
    .rename(columns={'Year': 'Date'})
    .assign(Zone=file_id)      # tag every row with the series it came from
)
dfg.head(1)

Unnamed: 0,Date,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Zone
0,1880-01-01,-0.18,-0.24,-0.09,-0.16,-0.1,-0.21,-0.18,-0.1,-0.15,-0.23,-0.22,-0.17,GLB


### northern hemisphere monthly

In [15]:
# Northern-hemisphere series: build the remote URL and the local download path.
file_id = "NH"
file_name = f"{file_id}.Ts+dSST.csv"
file_url = f"{giss_url}{file_name}"
file_path = os.path.join(data_dir, file_name)
file_path


'/Users/rkforest/kr/df/data/NH.Ts+dSST.csv'

In [16]:
# Download file_url and save it to file_path.
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail here rather than saving an HTTP error page as a .csv
with open(file_path, 'wb') as f:  # the context manager closes the file handle
    bytes_written = f.write(r.content)
bytes_written  # cell result: number of bytes saved

12633

In [17]:
# First pass: read the whole file only to discover its column names.
# skiprows=1 skips the file's first line, so the following line supplies the header.
dfn = pd.read_csv(file_path, skiprows=1) # initial read for columns
dfn.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,-0.35,-0.51,-0.22,-0.29,-0.06,-0.16,-0.18,-0.26,-0.23,-0.32,-0.43,-0.4,-0.28,***,***,-0.19,-0.2,-0.33


In [18]:
# Second pass: keep only the first 13 columns found by the initial read
# (slice end is exclusive, so positions 0-12: 'Year' plus 'Jan'..'Dec').
cols_to_import = dfn.columns[:13]
dfn = (
    pd.read_csv(
        file_path,
        skiprows=1,
        sep=',',
        usecols=cols_to_import,
        parse_dates=['Year'],  # index_col=0 or index_col='Year' could also be passed here
        na_values=['***'],     # '***' marks a missing value in the file
        nrows=150,
    )
    .rename(columns={'Year': 'Date'})
    .assign(Zone=file_id)      # tag every row with the series it came from
)
dfn.head(1)

Unnamed: 0,Date,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Zone
0,1880-01-01,-0.35,-0.51,-0.22,-0.29,-0.06,-0.16,-0.18,-0.26,-0.23,-0.32,-0.43,-0.4,NH


### southern hemisphere monthly

In [19]:
# Southern-hemisphere series: build the remote URL and the local download path.
file_id = "SH"
file_name = f"{file_id}.Ts+dSST.csv"
file_url = f"{giss_url}{file_name}"
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/SH.Ts+dSST.csv'

In [20]:
# Download file_url and save it to file_path.
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail here rather than saving an HTTP error page as a .csv
with open(file_path, 'wb') as f:  # the context manager closes the file handle
    bytes_written = f.write(r.content)
bytes_written  # cell result: number of bytes saved

12630

In [21]:
# First pass: read the whole file only to discover its column names.
# skiprows=1 skips the file's first line, so the following line supplies the header.
dfs = pd.read_csv(file_path, skiprows=1) # initial read for columns
dfs.head(1)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
0,1880,0.0,0.03,0.05,-0.02,-0.13,-0.25,-0.18,0.06,-0.05,-0.15,-0.01,0.05,-0.05,***,***,-0.03,-0.12,-0.07


In [22]:

# Second pass: keep only the first 13 columns found by the initial read
# (slice end is exclusive, so positions 0-12: 'Year' plus 'Jan'..'Dec').
cols_to_import = dfs.columns[:13]
dfs = (
    pd.read_csv(
        file_path,
        skiprows=1,
        sep=',',
        usecols=cols_to_import,
        parse_dates=['Year'],  # index_col=0 or index_col='Year' could also be passed here
        na_values=['***'],     # '***' marks a missing value in the file
        nrows=150,
    )
    .rename(columns={'Year': 'Date'})
    .assign(Zone=file_id)      # tag every row with the series it came from
)
dfs.head(1)

Unnamed: 0,Date,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Zone
0,1880-01-01,0.0,0.03,0.05,-0.02,-0.13,-0.25,-0.18,0.06,-0.05,-0.15,-0.01,0.05,SH


### zonal annual

In [23]:
# Zonal annual series: build the remote URL and the local download path.
file_name = 'ZonAnn.Ts+dSST.csv'
file_url = f"{giss_url}{file_name}"
file_path = os.path.join(data_dir, file_name)
file_path

'/Users/rkforest/kr/df/data/ZonAnn.Ts+dSST.csv'

In [24]:
# Download file_url and save it to file_path.
r = requests.get(file_url, timeout=30)  # do not hang forever on a stalled connection
r.raise_for_status()  # fail here rather than saving an HTTP error page as a .csv
with open(file_path, 'wb') as f:  # the context manager closes the file handle
    bytes_written = f.write(r.content)
bytes_written  # cell result: number of bytes saved

9986

In [25]:
# First pass: read the whole file to see which columns it has.
# skiprows=0 skips nothing, so the file's first line is used as the header.
dfz = pd.read_csv(file_path, skiprows=0) # initial read for columns
dfz.head(1)

Unnamed: 0,Year,Glob,NHem,SHem,24N-90N,24S-24N,90S-24S,64N-90N,44N-64N,24N-44N,EQU-24N,24S-EQU,44S-24S,64S-44S,90S-64S
0,1880,-0.17,-0.28,-0.05,-0.37,-0.13,-0.02,-0.81,-0.46,-0.28,-0.16,-0.11,-0.05,0.05,0.66


In [26]:
# Re-read the file keeping only 'Year' and the eight zonal-band columns listed here.
# The columns of the result follow the file's own order, not the order of this list.
cols_to_import=['Year',
                'EQU-24N','24N-44N','44N-64N','64N-90N',
                '24S-EQU','44S-24S','64S-44S','90S-64S']
dfz = pd.read_csv(file_path, usecols=cols_to_import)
dfz.head(1)

Unnamed: 0,Year,64N-90N,44N-64N,24N-44N,EQU-24N,24S-EQU,44S-24S,64S-44S,90S-64S
0,1880,-0.81,-0.46,-0.28,-0.16,-0.11,-0.05,0.05,0.66


<a id='pd-restructure'></a>
## Restructure

[Return to Start of Notebook](#pd)  

### .melt

In [27]:
# Preview the wide layout (one column per month) before it is melted.
dfg.head(2)

Unnamed: 0,Date,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Zone
0,1880-01-01,-0.18,-0.24,-0.09,-0.16,-0.1,-0.21,-0.18,-0.1,-0.15,-0.23,-0.22,-0.17,GLB
1,1881-01-01,-0.19,-0.14,0.03,0.05,0.06,-0.18,0.0,-0.03,-0.15,-0.22,-0.18,-0.07,GLB


In [28]:
# Reshape wide -> long: every month column becomes a row keyed by Zone and Date,
# with the month label in 'Mth' and the cell value in 'Temp'.
# var_name is passed as a scalar: pandas documents it as such, and recent
# releases raise ValueError when given a list for non-MultiIndex columns.
dfgt = pd.melt(dfg,
               id_vars=['Zone','Date'],
               var_name='Mth',
               value_name='Temp')
dfgt.head(1)

Unnamed: 0,Zone,Date,Mth,Temp
0,GLB,1880-01-01,Jan,-0.18


In [29]:
# Reshape wide -> long: every month column becomes a row keyed by Zone and Date,
# with the month label in 'Mth' and the cell value in 'Temp'.
# var_name is passed as a scalar: pandas documents it as such, and recent
# releases raise ValueError when given a list for non-MultiIndex columns.
dfnt = pd.melt(dfn,
               id_vars=['Zone','Date'],
               var_name='Mth',
               value_name='Temp')
dfnt.head(1)

Unnamed: 0,Zone,Date,Mth,Temp
0,NH,1880-01-01,Jan,-0.35


In [30]:
# Reshape wide -> long: every month column becomes a row keyed by Zone and Date,
# with the month label in 'Mth' and the cell value in 'Temp'.
# var_name is passed as a scalar: pandas documents it as such, and recent
# releases raise ValueError when given a list for non-MultiIndex columns.
dfst = pd.melt(dfs,
               id_vars=['Zone','Date'],
               var_name='Mth',
               value_name='Temp')
dfst.head(1)

Unnamed: 0,Zone,Date,Mth,Temp
0,SH,1880-01-01,Jan,0.0


In [31]:
# Reshape wide -> long: every zonal-band column becomes a row keyed by Year,
# with the band label in 'Zone' and the cell value in 'Temp'.
# var_name is passed as a scalar: pandas documents it as such, and recent
# releases raise ValueError when given a list for non-MultiIndex columns.
dfzt = pd.melt(dfz,
               id_vars=['Year'],
               var_name='Zone',
               value_name='Temp')
dfzt.head(1)

Unnamed: 0,Year,Zone,Temp
0,1880,64N-90N,-0.81


### create date with pd.to_datetime
- using year and month strings 

In [32]:
# Rebuild 'Date' from a year string and a month string.
dfgt = dfgt.assign(
    year_str=dfgt['Date'].dt.strftime('%Y'),                               # four-digit year text
    month_str=pd.to_datetime(dfgt['Mth'], format='%b').dt.strftime('%m'),  # 'Jan' -> '01'
)
# Concatenate the two strings ('YYYY' + 'MM') and parse them back into a date.
dfgt['Date'] = pd.to_datetime(dfgt['year_str'] + dfgt['month_str'], format='%Y%m')

In [33]:
# Rebuild 'Date' from a year string and a month string.
dfnt = dfnt.assign(
    year_str=dfnt['Date'].dt.strftime('%Y'),                               # four-digit year text
    month_str=pd.to_datetime(dfnt['Mth'], format='%b').dt.strftime('%m'),  # 'Jan' -> '01'
)
# Concatenate the two strings ('YYYY' + 'MM') and parse them back into a date.
dfnt['Date'] = pd.to_datetime(dfnt['year_str'] + dfnt['month_str'], format='%Y%m')

In [34]:
# Rebuild 'Date' from a year string and a month string.
dfst = dfst.assign(
    year_str=dfst['Date'].dt.strftime('%Y'),                               # four-digit year text
    month_str=pd.to_datetime(dfst['Mth'], format='%b').dt.strftime('%m'),  # 'Jan' -> '01'
)
# Concatenate the two strings ('YYYY' + 'MM') and parse them back into a date.
dfst['Date'] = pd.to_datetime(dfst['year_str'] + dfst['month_str'], format='%Y%m')

### drop unneeded columns

In [35]:
# Drop the helper string columns now that 'Date' has been rebuilt.
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError for columns that are already gone. Reassigning the names (instead of
# inplace=True) leaves the previous frame objects untouched.
cols = ['year_str', 'month_str']
dfgt, dfnt, dfst = (
    frame.drop(columns=cols, errors='ignore') for frame in (dfgt, dfnt, dfst)
)

### extract year and month from date using date accessor

In [36]:
# Add a 'Year' column to each long-format frame, read from 'Date' via the .dt accessor.
for frame in (dfgt, dfnt, dfst):
    frame['Year'] = frame['Date'].dt.year

In [37]:
# Add a 'Month' column to each long-format frame, read from 'Date' via the .dt accessor.
for frame in (dfgt, dfnt, dfst):
    frame['Month'] = frame['Date'].dt.month

### reorder columns

In [38]:
# Reorder the columns: date fields first, then zone and temperature.
column_order = ['Date', 'Year', 'Month', 'Mth', 'Zone', 'Temp']
dfgt = dfgt.reindex(columns=column_order)
dfgt.columns

Index(['Date', 'Year', 'Month', 'Mth', 'Zone', 'Temp'], dtype='object')

In [39]:
# Reorder the columns: date fields first, then zone and temperature.
column_order = ['Date', 'Year', 'Month', 'Mth', 'Zone', 'Temp']
dfnt = dfnt.reindex(columns=column_order)
dfnt.columns

Index(['Date', 'Year', 'Month', 'Mth', 'Zone', 'Temp'], dtype='object')

In [40]:
# Reorder the columns: date fields first, then zone and temperature.
column_order = ['Date', 'Year', 'Month', 'Mth', 'Zone', 'Temp']
dfst = dfst.reindex(columns=column_order)
dfst.columns

Index(['Date', 'Year', 'Month', 'Mth', 'Zone', 'Temp'], dtype='object')

In [41]:
# Inspect the last row of the reshaped frame.
# NOTE(review): Temp is empty here — presumably a month that was '***' in the source file.
dfgt.tail(1)

Unnamed: 0,Date,Year,Month,Mth,Zone,Temp
1727,2023-12-01,2023,12,Dec,GLB,


<a id='pd-export'></a>
## Export

[Return to Start of Notebook](#pd)   

### save df as csv for importing to database

In [43]:
# Write the global long-format frame to save_dir as a comma-separated file
# with a header row and without the index column.
save_path = os.path.join(save_dir, 'global_temperature_anomalies.csv')
dfgt.to_csv(save_path, header=True, index=False, sep=',')

In [44]:
# Write the northern-hemisphere long-format frame to save_dir as a comma-separated file
# with a header row and without the index column.
save_path = os.path.join(save_dir, 'northern_temperature_anomalies.csv')
dfnt.to_csv(save_path, header=True, index=False, sep=',')

In [45]:
# Write the southern-hemisphere long-format frame to save_dir as a comma-separated file
# with a header row and without the index column.
save_path = os.path.join(save_dir, 'southern_temperature_anomalies.csv')
dfst.to_csv(save_path, header=True, index=False, sep=',')