In [4]:
import pandas as pd

# Cleaning up PG&E electrical and gas data

In [140]:
# manipulating electrical data
# Builds `edf`: one row per calendar day with the summed electrical USAGE and COST.
edf = pd.read_csv("./data/electrical.csv")                          # read the data

# pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the unit-less dtype string
edf['DATE'] = pd.to_datetime(edf['DATE'])                           # set the date type
# regex=False strips the literal '$'; read as a regex, '$' is the end-of-string anchor
edf['COST'] = edf['COST'].str.replace('$', '', regex=False)         # remove unnecessary symbols
edf['COST'] = edf['COST'].astype('float64')                         # set the right type
# select the two numeric measures explicitly so any text columns in the CSV are never aggregated
edf = edf.set_index('DATE').resample('D')[['USAGE', 'COST']].sum()  # usage and cost total per day
edf = edf.reset_index()                                             # DATE back to a regular column for merging
edf.head()

Unnamed: 0,DATE,USAGE,COST
0,2014-07-30,0.48,0.0
1,2014-07-31,0.48,0.0
2,2014-08-01,0.48,0.0
3,2014-08-02,0.48,0.0
4,2014-08-03,0.48,0.0


In [14]:
# manipulating gas data
# Builds `gdf` with columns DATE, GAS_COST, GAS_USAGE from the raw gas export.
tmp = pd.read_csv("./data/gas.csv").drop(columns=['TYPE','UNITS','NOTES'])
# regex=False strips the literal '$'; read as a regex, '$' is the end-of-string anchor
tmp['COST'] = tmp['COST'].str.replace('$', '', regex=False)

# Build the frame in one step; the GAS_ prefix keeps these columns distinct from
# the electrical frame's COST/USAGE columns.
gdf = pd.DataFrame({
    # pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the unit-less dtype string
    'DATE': pd.to_datetime(tmp['DATE']),
    'GAS_COST': tmp['COST'].astype('float64'),
    'GAS_USAGE': tmp['USAGE'],
})

gdf.head()

Unnamed: 0,DATE,GAS_COST,GAS_USAGE
0,2014-07-18,0.0,0.0
1,2014-07-19,0.0,0.0
2,2014-07-20,0.0,0.0
3,2014-07-21,0.0,0.0
4,2014-07-22,0.0,0.0


# Cleaning Weather Data

In [165]:
# Builds `wdf`: weather rows for every station except Merced Municipal Airport,
# reduced to the DATE, PRCP, TMAX and TMIN columns.
wdf = pd.read_csv("data/weather.csv")
# .copy() makes the filtered selection an independent frame, so the column
# assignment below cannot trigger SettingWithCopyWarning
wdf = wdf.loc[wdf.NAME != 'MERCED MUNICIPAL AIRPORT, CA US', ['DATE', 'PRCP', 'TMAX', 'TMIN']].copy()
# pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the unit-less dtype string
wdf['DATE'] = pd.to_datetime(wdf['DATE'])

wdf.head()

Unnamed: 0,DATE,PRCP,TMAX,TMIN
0,2014-07-01,0.0,102.0,57.0
1,2014-07-02,0.0,100.0,63.0
2,2014-07-03,0.0,98.0,63.0
3,2014-07-04,0.0,98.0,63.0
4,2014-07-05,0.0,102.0,63.0


# Joining and Saving data

In [168]:
# merging the dataframes
# Left joins keep every row of the electrical frame; gas and weather values are
# attached where the DATE matches. The key is spelled out so the join can never
# silently pick up additional shared column names.
# NOTE(review): a left join repeats electrical rows if gdf or wdf hold several
# rows for the same DATE - confirm both are unique per day.
tmp_pge = pd.merge(edf, gdf, how="left", on="DATE")
pge_temp_data = pd.merge(tmp_pge, wdf, how="left", on="DATE")

# renaming columns
# Rename by name rather than by position so a change in column order cannot
# mislabel the data; errors="raise" fails loudly if an expected column is missing.
pge_temp_data = pge_temp_data.rename(
    columns={
        'DATE': 'date',
        'USAGE': 'electrical_usage',
        'COST': 'electrical_cost',
        'GAS_COST': 'gas_cost',
        'GAS_USAGE': 'gas_usage',
        'PRCP': 'precipitation',
        'TMAX': 'max_temp',
        'TMIN': 'min_temp',
    },
    errors="raise",
)
pge_temp_data.head()

Unnamed: 0,date,electrical_usage,electrical_cost,gas_cost,gas_usage,precipitation,max_temp,min_temp
0,2014-07-30,0.48,0.0,0.0,0.0,0.0,102.0,68.0
1,2014-07-31,0.48,0.0,0.0,0.0,0.0,103.0,69.0
2,2014-08-01,0.48,0.0,0.0,0.0,0.0,102.0,66.0
3,2014-08-02,0.48,0.0,0.0,0.0,0.0,105.0,66.0
4,2014-08-03,0.48,0.0,0.0,0.0,0.0,105.0,65.0


In [170]:
# saving data to disk
# Writes the merged frame as CSV. `index` is left at its default, so the row
# index is written as an unnamed first column.
pge_temp_data.to_csv(path_or_buf="data/pge_and_temperature.csv")  # save data