In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

%matplotlib inline
sns.set()

""" 
DSC672
Team: John Matune, Mike Yacobucci, Steve Rummel
Project: Energy Consumption and Renewable Energy
Purpose: 

Import solar and wind data from the relevant source files, 
convert string-format date and time into valid datetime values,
remove unneeded columns, normalize column headers, and
export to canonical data source for further analysis.
"""

# Load each raw production file, normalise the energy column header to "KWH",
# and tag every row with the source it came from.
kwh_header = {"Electricity_KW_HR": "KWH"}
solar_prod = (
    pd.read_csv('./raw/solararray_production.csv')
    .rename(index=str, columns=kwh_header)
    .assign(Source='Solar')
)
wind_prod = (
    pd.read_csv('./raw/windfarm_production.csv')
    .rename(index=str, columns=kwh_header)
    .assign(Source='Wind')
)

# Stack both sources into a single frame with a fresh integer index.
power_production = pd.concat([solar_prod, wind_prod], ignore_index=True, sort=False)

# Timestamp = parsed calendar date + the 'Hour' column applied as an hour offset.
power_production['Date'] = pd.to_datetime(power_production['Date'])
hour_offset = pd.to_timedelta(power_production['Hour'], unit='h')
power_production['Time'] = power_production['Date'] + hour_offset

# Index by timestamp, replace missing values with 0.0, and persist the result.
power_production = power_production.set_index('Time').fillna(0.0)
power_production.to_pickle("./processed/production_all_src.pkl")



In [3]:
# Show the first rows, then the per-column minimum and maximum, in that order.
for summary_name in ('head', 'min', 'max'):
    print(getattr(power_production, summary_name)())


                          Date  Hour       KWH Source
Time                                                 
2010-01-04 16:00:00 2010-01-04    16  14186.16  Solar
2010-01-04 17:00:00 2010-01-04    17   5898.00  Solar
2010-01-05 10:00:00 2010-01-05    10  12421.86  Solar
2010-01-05 11:00:00 2010-01-05    11  14210.46  Solar
2010-01-05 12:00:00 2010-01-05    12  14723.64  Solar
Date      2010-01-04 00:00:00
Hour                        1
KWH                         0
Source                  Solar
dtype: object
Date      2014-08-31 00:00:00
Hour                       24
KWH                    205619
Source                   Wind
dtype: object


In [None]:
#power_production['KWH'].loc[power_production['Source'] == 'Solar'].plot(figsize=(20,10), linewidth=5, fontsize=20).yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));

In [None]:
#power_production['KWH'].loc[power_production['Source'] == 'Wind'].plot(figsize=(20,10), linewidth=5, fontsize=20).yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));

In [None]:
# Solar and wind data only overlap for ~2 years (2011-2012), so we may need to exclude periods where either source is missing,
# assuming our analysis requires both solar and wind to be available to replace nonrenewable sources.
# Hypothesis: Solar power varies little enough from year to year that we can safely exclude most of it
# and treat 2012 as representative. Alternatively, use an average of the years we do have.
# Note: Solar readings were not taken from 6 pm through 9 am, unsurprisingly.

#solar = power_production.loc[power_production['Source'] == 'Solar']
#solar.reset_index(inplace=True)
#solar.set_index('Time', inplace=True)
#print(solar.head())
#print(solar.info())
#solvar = solar.resample('M').var()
#solvar['KWH'].plot(figsize=(20,10), linewidth=5, fontsize=20);

In [None]:
# Solar and wind data only overlap for ~2 years (2011-2012), so we may need to exclude periods where either source is missing,
# assuming our analysis requires both solar and wind to be available to replace nonrenewable sources.
# Hypothesis: Solar power varies little enough from year to year that we can safely exclude most of it
# and treat 2012 as representative. Alternatively, use an average of the years we do have.
# Note: Solar readings were not taken from 6 pm through 9 am, unsurprisingly.

#wind = power_production.loc[power_production['Source'] == 'Wind']
#wind.set_index('Time', inplace=True)
#windvar = wind.resample('M').var()
#windvar['KWH'].plot(figsize=(20,10), linewidth=5, fontsize=20);

In [None]:
# Get the date period in which we have both solar and wind values.
# The maximum 'start' date and the minimum 'end' date.

#solar.reset_index(inplace=True)
#solar.set_index('Hour', inplace=True)
#wind.reset_index(inplace=True)
#wind.set_index('Hour', inplace=True)

# Split the combined frame by source. `solar` and `wind` were previously only
# created inside commented-out cells, so a fresh-kernel run raised NameError
# here. .copy() yields independent frames, so later index changes on them
# neither touch power_production nor trigger SettingWithCopyWarning.
solar = power_production.loc[power_production['Source'] == 'Solar'].copy()
wind = power_production.loc[power_production['Source'] == 'Wind'].copy()

# 'Time' is the index of both frames (inherited from power_production), so
# the covered range is read from the index rather than from a 'Time' column.
solmin = solar.index.min()
solmax = solar.index.max()
windmin = wind.index.min()
windmax = wind.index.max()

# Common window: the later of the two start times and the earlier of the two
# end times.
mindate = max(solmin, windmin)
maxdate = min(solmax, windmax)

# Subset our original power_production dataset to only include those dates.
#power_prod_common_timeframe = power_production.loc[(power_production.index > mindate) & (power_production.index < maxdate)]

#power_prod_common_timeframe['KWH'].loc[power_prod_common_timeframe['Source'] == 'Solar'].plot(figsize=(20,10),
#                                                                                              linewidth=5,
#                                                                                              title='KWH by Hour',
#                                                                                              fontsize=20,
#                                                                                              color='y').yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));


In [None]:
#power_prod_common_timeframe['KWH'].loc[power_prod_common_timeframe['Source'] == 'Wind'].plot(figsize=(20,10),
#                                                                                              linewidth=5,
#                                                                                              title='KWH by Hour',
#                                                                                              fontsize=20,
#                                                                                              color='b').yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));

In [None]:
# Make sure both per-source frames are indexed by 'Time', whatever index they
# currently carry: reset first so the existing index becomes a regular
# column, then promote 'Time' to the index.
solar = solar.reset_index().set_index('Time')
wind = wind.reset_index().set_index('Time')

# Restrict each source to the window in which both sources report data.
# NOTE(review): both bounds are strict, so the rows stamped exactly at
# mindate and maxdate are excluded -- confirm that this is intended.
solar_ctf = solar.loc[(solar.index > mindate) & (solar.index < maxdate)]
wind_ctf = wind.loc[(wind.index > mindate) & (wind.index < maxdate)]

# Export production for all dates covered, regardless of source.
# Missing values are filled on the whole frame and assigned back; the former
# `frame['col'].fillna(..., inplace=True)` pattern is chained assignment,
# which warns on recent pandas and does nothing under Copy-on-Write.
sol_all_dte = solar['KWH'].rename('Solar')
win_all_dte = wind['KWH'].rename('Wind')
all_src_all_dte = pd.concat([sol_all_dte, win_all_dte], axis=1).fillna(0.0)
all_src_all_dte.to_pickle("./processed/production_all_src_all_dte.pkl")

# Export production only for the window in which both solar and wind data exist.
sol = solar_ctf['KWH'].rename('Solar')
win = wind_ctf['KWH'].rename('Wind')
all_src = pd.concat([sol, win], axis=1).fillna(0.0)
# NOTE(review): mindate/maxdate are presumably pandas Timestamps, whose text
# form contains spaces and colons; such file names are not valid on Windows.
# The name is kept unchanged because other code may load this file by name.
all_src.to_pickle("./processed/production_all_src_from_{}_to_{}.pkl".format(mindate, maxdate))


In [None]:
# Line chart of the solar and wind series restricted to the common window.
chart_dims = (12, 4)
fig, ax = plt.subplots(figsize=chart_dims)

sns.set()
snplot = sns.lineplot(data=all_src, ax=ax, linewidth=1)

# Y tick labels: thousands separators, no decimals.
kwh_tick_format = mpl.ticker.StrMethodFormatter('{x:,.0f}')
snplot.yaxis.set_major_formatter(kwh_tick_format)
snplot.set_title('KWH by Hour');

In [None]:
# Line chart of the solar and wind series over every date covered by either source.
chart_dims = (16, 4)
fig, ax = plt.subplots(figsize=chart_dims)

sns.set()
snplot = sns.lineplot(data=all_src_all_dte, ax=ax, linewidth=1)

# Y tick labels: thousands separators, no decimals.
kwh_tick_format = mpl.ticker.StrMethodFormatter('{x:,.0f}')
snplot.yaxis.set_major_formatter(kwh_tick_format)
snplot.set_title('KWH by Hour');

In [None]:
# Build a dataset of 'everything'

# Add: Windspeed
# Parse the raw 'Date_time' strings into a datetime 'Time' index and drop the
# columns that are not needed downstream.
wind_speed = pd.read_csv('./raw/windfarm_windspeed.csv').rename(columns={"Date_time": "Time"})
wind_speed['Time'] = pd.to_datetime(wind_speed['Time'], format='%d%b%y:%H:%M:%S')
wind_speed = wind_speed.drop(columns=['Hour', 'Location']).set_index('Time')

# Rename the columns only. The previous `index=str` argument also converted
# every timestamp label of the index to a string, so those labels could not
# line up with wind_speed's datetime index in the concat below.
all_src_all_dte = all_src_all_dte.rename(columns={"Wind": "Wind_KWH", "Solar": "Solar_KWH"})
all_src_all_dte = pd.concat([all_src_all_dte, wind_speed], axis=1)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is chained assignment and is not guaranteed to
# update the frame.
all_src_all_dte['Wind_Speed'] = all_src_all_dte['Wind_Speed'].fillna(0.0)



In [None]:
# Add: Solar Angle
s_angle = pd.read_csv('./raw/solararray_solarangle.csv')
#s_angle['Date'] = pd.to_datetime(s_angle['Date'])

# Replace missing values with 0.0 and cast every column to text. The result
# is assigned back to the frame; the former per-column
# `s_angle[field].fillna(0.0, inplace=True)` was chained assignment, which
# is not guaranteed to modify s_angle.
s_angle = s_angle.fillna(0.0).astype(str)

# Left-pad the Month, Day and Hour values with zeros to two characters.
for date_part in ('Month', 'Day', 'Hour'):
    s_angle[date_part] = s_angle[date_part].apply('{:0>2}'.format)

# Create a string version of the date, 'Year:Month:Day'.
s_angle['Time'] = s_angle['Year'] + ':' + s_angle['Month'] + ':' + s_angle['Day']

# Cast the string to a datetime object.
# Note: Adding the hour at the end kept erroring out, so it is added
# afterwards as a timedelta instead.
s_angle['Time'] = pd.to_datetime(s_angle['Time'], format='%Y:%m:%d')
s_angle['Time'] += pd.to_timedelta(s_angle['Hour'].astype(int), unit='h')

# Get rid of columns we do not need, then index by timestamp.
drop_these = ['Year', 'Month', 'Day', 'Hour', 'Location']
s_angle = s_angle.drop(columns=drop_these).set_index('Time')

# Left join: keep every row of all_src_all_dte and attach the solar-angle
# columns where the timestamps match.
all_src_all_dte = all_src_all_dte.join(s_angle, how='left', sort=False)

In [None]:
# Add Weather

weather = pd.read_csv('./raw/solararray_weather.csv')

# Replace missing values with 0.0 and cast every column to text. The result
# is assigned back to the frame; the former per-column
# `weather[field].fillna(0.0, inplace=True)` was chained assignment, which
# is not guaranteed to modify weather.
weather = weather.fillna(0.0).astype(str)

# Left-pad the Month, Day and Hour values with zeros to two characters.
for date_part in ('Month', 'Day', 'Hour'):
    weather[date_part] = weather[date_part].apply('{:0>2}'.format)

# Create a string version of the date, 'Year:Month:Day'.
weather['Time'] = weather['Year'] + ':' + weather['Month'] + ':' + weather['Day']

# Parse the date, then add the hour of day as a timedelta.
weather['Time'] = pd.to_datetime(weather['Time'], format='%Y:%m:%d')
weather['Time'] += pd.to_timedelta(weather['Hour'].astype(int), unit='h')

# Index by timestamp and get rid of columns we do not need.
drop_these = ['Year', 'Month', 'Day', 'Hour', 'Location']
weather = weather.set_index('Time').drop(columns=drop_these)

# Left join; column names present on both sides get a suffix naming their origin.
all_src_all_dte = all_src_all_dte.join(weather, how='left', lsuffix='_AT_WINDFARM', rsuffix='_AT_SOLARRAY', sort=False)

# Fill the gaps left by the joins and convert every column to a numeric
# dtype. Done on the whole frame and assigned back, for the same
# chained-assignment reason as above.
all_src_all_dte = all_src_all_dte.fillna(0.0).apply(pd.to_numeric)

print(all_src_all_dte.head())
print(all_src_all_dte.dtypes)

In [None]:
# Persist the merged frame (production, wind speed, solar angle and weather
# columns, indexed by time) as a pickle under ./processed.
all_src_all_dte.to_pickle("./processed/production_all_dates_and_variables.pkl")

In [None]:
# Columns of all_src_all_dte to summarise and chart. Each name is wrapped in
# its own one-element list, so indexing the frame with an entry yields a
# one-column DataFrame and entry[0] yields the bare column name.
fields = [[column_name] for column_name in (
    'Solar_KWH',
    'Wind_KWH',
    'Wind_Speed_AT_WINDFARM',
    'Solar_Elevation',
    'Cloud_Cover_Fraction',
    'Dew_Point',
    'Humidity_Fraction',
    'Precipitation',
    'Pressure',
    'Temperature',
    'Visibility',
)]

In [None]:
# Print summary statistics for each tracked column, labelled with its name.
for (column_name,) in fields:
    column_stats = all_src_all_dte[[column_name]].describe()
    print("{}: \n{}".format(column_name, column_stats))

In [None]:
# Render one line chart per tracked column and save each as a PNG under ./Images.
# Loop-invariant setup (theme, figure size, tick format) is done once up front.
sns.set()
chart_dims = (16, 4)
# NOTE(review): '{x:,.0f}' rounds tick labels to whole numbers; columns named
# '*_Fraction' presumably range from 0 to 1 and would then be labelled only
# 0 or 1 -- confirm, and consider a finer format for those columns.
tick_format = '{x:,.0f}'

for field in fields:
    column_name = field[0]
    print("Plotting {}...".format(column_name))
    fig, ax = plt.subplots(figsize=chart_dims)
    sns.lineplot(data=all_src_all_dte[column_name], ax=ax, linewidth=1)
    ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter(tick_format))
    ax.set_title(column_name)

    print("Saving {}...".format(column_name))
    fig.savefig("./Images/{}.png".format(column_name))

    # Close the figure rather than only clearing it: clf() leaves the figure
    # registered with pyplot, so figures pile up in memory across iterations
    # and the cleared (blank) canvases are still rendered as cell output.
    plt.close(fig)