# Chapter 33 - Exporting Data

## Imports

In [33]:
import pandas as pd
import sqlite3
import sqlalchemy as sa

## 33.1 Dirty Devil Data

In [3]:
# Location of the raw Dirty Devil text file that the next cell reads.
url = 'https://github.com/mattharrison/datasets/raw/master/data/dirtydevil.txt'

In [5]:
# Load the tab-separated file. The callable skips every line numbered below
# 34 and also line 35, so line 34 is the first line kept and becomes the header.
# low_memory=False parses each column in a single pass, which avoids the
# chunk-by-chunk type guessing behind pandas' mixed-dtype warning.
df = pd.read_csv(url, skiprows=lambda num: num < 34 or num == 35,
                 sep='\t', low_memory=False)

  df = pd.read_csv(url, skiprows=lambda num: num <34 or num==35, sep='\t')


In [7]:
def to_denver_time(df_, time_col, tz_col):
    """Return ``time_col`` as timezone-aware values in America/Denver.

    Rows are grouped by the zone label found in ``tz_col`` (with 'MDT'
    relabelled to 'MST7MDT'). Each group is parsed with ``pd.to_datetime``,
    localized to its own zone label, and converted to 'America/Denver'.

    Parameters
    ----------
    df_ : DataFrame holding both columns; it is not modified.
    time_col : name of the column with the values to parse as datetimes.
    tz_col : name of the column with the zone label for each row.

    Returns
    -------
    Series aligned with ``df_``'s index.
    """
    def _localize_group(times):
        # groupby.transform names each group's Series after its group key,
        # which here is the zone label to localize with.
        parsed = pd.to_datetime(times)
        # ambiguous=True resolves repeated wall-clock times as DST.
        localized = parsed.dt.tz_localize(times.name, ambiguous=True)
        return localized.dt.tz_convert('America/Denver')

    zone_labels = df_[tz_col].replace('MDT', 'MST7MDT')
    relabelled = df_.assign(**{tz_col: zone_labels})
    return relabelled.groupby(tz_col)[time_col].transform(_localize_group)

In [8]:
def tweak_river(df_):
    """Return a cleaned copy of the raw river frame.

    The ``datetime`` column is replaced by Denver-localized timestamps
    (built from ``datetime`` and ``tz_cd`` via ``to_denver_time``), the two
    coded measurement columns get readable names, and ``datetime`` becomes
    the index. ``df_`` itself is left untouched.
    """
    readable_names = {'144166_00060': 'cfs',
                      '144167_00065': 'gage_height'}
    denver_times = to_denver_time(df_, 'datetime', 'tz_cd')
    return (df_
            .assign(datetime=denver_times)
            .rename(columns=readable_names)
            .set_index('datetime'))

In [9]:
# Clean the raw frame; `dd` is the frame the export examples below work from.
dd = tweak_river(df)

In [10]:
# Show the cleaned frame.
dd

Unnamed: 0_level_0,agency_cd,site_no,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-05-07 01:00:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:15:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:30:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:45:00-06:00,USGS,9333500,MDT,70.00,A:[91],,
2001-05-07 02:00:00-06:00,USGS,9333500,MDT,70.00,A:[91],,
...,...,...,...,...,...,...,...
2020-09-28 08:30:00-06:00,USGS,9333500,MDT,9.53,P,6.16,P
2020-09-28 08:45:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 09:00:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 09:15:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P


## 33.2 Reading and Writing

## 33.3 Creating CSV Files

In [11]:
# With no path argument, to_csv returns the CSV text, so printing it
# previews what would be written for the first five rows.
print(dd.head(5).to_csv())

datetime,agency_cd,site_no,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
2001-05-07 01:00:00-06:00,USGS,9333500,MDT,71.0,A:[91],,
2001-05-07 01:15:00-06:00,USGS,9333500,MDT,71.0,A:[91],,
2001-05-07 01:30:00-06:00,USGS,9333500,MDT,71.0,A:[91],,
2001-05-07 01:45:00-06:00,USGS,9333500,MDT,70.0,A:[91],,
2001-05-07 02:00:00-06:00,USGS,9333500,MDT,70.0,A:[91],,



In [12]:
# Write the whole frame, index included, to a CSV file under tmp/.
dd.to_csv('tmp/dd.csv')

In [14]:
# Read the CSV back with the `datetime` column as the index.
# No parse_dates is requested, so the index labels are not converted to
# timestamps here.
# low_memory=False parses each column in a single pass, which avoids the
# chunk-by-chunk type guessing behind pandas' mixed-dtype warning.
dd2 = pd.read_csv('tmp/dd.csv', index_col='datetime', low_memory=False)

  dd2 = pd.read_csv('tmp/dd.csv', index_col='datetime')


In [15]:
# Show the frame that was read back from the CSV file.
dd2

Unnamed: 0_level_0,agency_cd,site_no,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-05-07 01:00:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:15:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:30:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:45:00-06:00,USGS,9333500,MDT,70.00,A:[91],,
2001-05-07 02:00:00-06:00,USGS,9333500,MDT,70.00,A:[91],,
...,...,...,...,...,...,...,...
2020-09-28 08:30:00-06:00,USGS,9333500,MDT,9.53,P,6.16,P
2020-09-28 08:45:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 09:00:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 09:15:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P


## 33.4 Exporting to Excel

In [16]:
# Excel cannot store timezone-aware datetimes, so this export is expected to
# fail. Catch the error and print it so the notebook still runs top to bottom.
try:
    dd.to_excel('tmp/dd.xlsx')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: Excel does not support datetimes with timezones. Please ensure that datetimes are timezone unaware before writing to Excel.

In [17]:
# Strip the timezone from the index before exporting: tz_convert(None)
# converts the DatetimeIndex to UTC and then drops the timezone information,
# which is what Excel requires.
(dd
 .tz_convert(None)
 .to_excel('tmp/dd.xlsx')
 )

In [18]:
# Open a single workbook so that several sheets can be written into it.
writer = pd.ExcelWriter('tmp/dd2.xlsx')

In [19]:
# Timezone-naive copy of dd (UTC wall-clock times) for the Excel sheets.
# Note that this rebinds the name dd2.
dd2 = (dd
       .reset_index()
       .assign(datetime=lambda frame: frame['datetime'].dt.tz_convert(tz=None))
       .set_index('datetime')
       )

In [20]:
# Slice the rows from the start of 2010 through 2010-12-31 and write them
# to a sheet named '2010' in the open workbook.
(dd2
 .loc['2010':'2010-12-31']
 .to_excel(writer, sheet_name='2010')
 )

In [21]:
# Slice the rows from the start of 2011 through 2011-12-31 and write them
# to a sheet named '2011' in the same workbook.
(dd2
 .loc['2011':'2011-12-31']
 .to_excel(writer, sheet_name='2011')
 )

In [22]:
# close() writes the workbook to disk and releases the file handle.
# ExcelWriter.save() is deprecated and was removed in pandas 2.0.
writer.close()

  writer.save()


## 33.5 Feather

In [24]:
# Feather cannot serialize a DatetimeIndex, so this export is expected to
# fail. Catch the error and print it so the notebook still runs top to bottom.
try:
    dd.to_feather('tmp/dd.fea')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: feather does not support serializing <class 'pandas.core.indexes.datetimes.DatetimeIndex'> for the index; you can .reset_index() to make the index into column(s)

In [25]:
# Move the index into an ordinary column first, since Feather rejects
# the DatetimeIndex, then write the file.
(dd
 .reset_index()
 .to_feather('tmp/dd.fea')
 )

In [26]:
# Read the Feather file back; `datetime` comes back as a regular column.
dd2 = pd.read_feather('tmp/dd.fea')

In [27]:
# Restore the index and check that the Feather round trip reproduces dd.
dd2.set_index('datetime').equals(dd)

True

## 33.6 SQL

In [29]:
# Open the SQLite database file dd.db; sqlite3 creates it if it is missing.
con = sqlite3.connect('dd.db')

In [43]:
# Reminder of what dd looks like before it is written to the database.
dd.head()

Unnamed: 0_level_0,agency_cd,site_no,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-05-07 01:00:00-06:00,USGS,9333500,MDT,71.0,A:[91],,
2001-05-07 01:15:00-06:00,USGS,9333500,MDT,71.0,A:[91],,
2001-05-07 01:30:00-06:00,USGS,9333500,MDT,71.0,A:[91],,
2001-05-07 01:45:00-06:00,USGS,9333500,MDT,70.0,A:[91],,
2001-05-07 02:00:00-06:00,USGS,9333500,MDT,70.0,A:[91],,


In [44]:
# Store the frame in the `dd` table, replacing any earlier copy.
# The index is first turned back into a column and normalised to UTC.
(dd
 .reset_index()
 .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'], utc=True))
 .to_sql('dd', con, if_exists='replace')
 )

539305

In [45]:
# Read the `dd` table back through SQLAlchemy, using `datetime` as the index.
eng = sa.create_engine('sqlite:///dd.db')
# The with-block closes the connection once the table has been read.
with eng.connect() as sa_con:
    dd2 = pd.read_sql('dd', sa_con, index_col='datetime')
# Compare the raw read-back with the original frame.
dd2.equals(dd)

False

In [63]:
# Undo the changes made for SQL storage, then compare with the original.
# dd2 here is the frame read back from the `dd` table: its index is already
# `datetime`, and it also carries the integer `index` column that to_sql
# wrote. Renaming that column to `datetime` would create a duplicate column
# name, so it is dropped instead.
(dd2
 .drop(columns='index', errors='ignore')
 .reset_index()
 # to_datetime(utc=True) accepts both naive and tz-aware values, so this
 # works whichever form the database layer hands back.
 .assign(datetime=lambda df_: pd.to_datetime(df_['datetime'], utc=True)
         .dt.tz_convert('America/Denver'))
 .set_index('datetime')
 .round(3)
 .equals(dd)
 )

True

## 33.7 JSON

In [51]:
# Convert the frame to a plain Python dict using to_dict's default layout.
obj = dd.to_dict()

In [52]:
# Rebuild a frame from the dict and check that it matches the original.
dd2 = pd.DataFrame.from_dict(obj)
dd.equals(dd2)

True

In [53]:
# Write the frame as JSON; with pandas' default compression='infer',
# the .gz suffix selects gzip compression.
dd.to_json('tmp/dd.json.gz')

In [54]:
# Read the compressed JSON file back into a frame.
dd2 = pd.read_json('tmp/dd.json.gz')

In [55]:
# Show the JSON read-back: the index comes back unnamed and without an offset.
dd2

Unnamed: 0,agency_cd,site_no,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
2001-05-07 07:00:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 07:15:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 07:30:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 07:45:00,USGS,9333500,MDT,70.00,A:[91],,
2001-05-07 08:00:00,USGS,9333500,MDT,70.00,A:[91],,
...,...,...,...,...,...,...,...
2020-09-28 14:30:00,USGS,9333500,MDT,9.53,P,6.16,P
2020-09-28 14:45:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 15:00:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 15:15:00,USGS,9333500,MDT,9.20,P,6.15,P


In [56]:
# Compare the raw JSON read-back with the original frame.
dd2.equals(dd)

False

In [57]:
# Rebuild the original index after the JSON round trip: name the restored
# index column `datetime`, mark its naive values as UTC, shift them to
# Denver time, and make the column the index again.
dd3 = (dd2
       .reset_index()
       .rename(columns={'index': 'datetime'})
       .assign(datetime=lambda frame: (frame['datetime']
                                       .dt.tz_localize(tz='UTC')
                                       .dt.tz_convert('America/Denver')))
       .set_index('datetime')
       )

In [58]:
# Show the frame after its index has been restored.
dd3

Unnamed: 0_level_0,agency_cd,site_no,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2001-05-07 01:00:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:15:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:30:00-06:00,USGS,9333500,MDT,71.00,A:[91],,
2001-05-07 01:45:00-06:00,USGS,9333500,MDT,70.00,A:[91],,
2001-05-07 02:00:00-06:00,USGS,9333500,MDT,70.00,A:[91],,
...,...,...,...,...,...,...,...
2020-09-28 08:30:00-06:00,USGS,9333500,MDT,9.53,P,6.16,P
2020-09-28 08:45:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 09:00:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P
2020-09-28 09:15:00-06:00,USGS,9333500,MDT,9.20,P,6.15,P


In [59]:
# Compare the restored frame with the original, with no rounding applied.
dd3.equals(dd)

False

In [60]:
# Repeat the comparison after rounding the restored frame to 3 decimals.
dd3.round(3).equals(dd)

True