In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

%matplotlib inline
sns.set()

""" 
DSC672
Team: John Matune, Mike Yacobucci, Steve Rummel
Project: Energy Consumption and Renewable Energy
Purpose: 

Import solar and wind data from the relevant source files, 
convert string-format date and time into valid datetime values,
remove unneeded columns, normalize column headers, and
export to canonical data source for further analysis.
"""

# Read the raw hourly production files, standardise the energy column name
# and tag every row with the source it came from.
solar_prod = (
    pd.read_csv('./raw/solararray_production.csv')
    .rename(index=str, columns={"Electricity_KW_HR": "KWH"})
    .assign(Source='Solar')
)
wind_prod = (
    pd.read_csv('./raw/windfarm_production.csv')
    .rename(index=str, columns={"Electricity_KW_HR": "KWH"})
    .assign(Source='Wind')
)

# Stack both sources into one long frame with a fresh integer index.
power_production = pd.concat([solar_prod, wind_prod], ignore_index=True, sort=False)
print(power_production['Hour'].unique())

# Shift Hour down by one (the values printed above run 1-24) and add it to
# the calendar date to build the timestamp that becomes the index.
zero_based_hour = power_production['Hour'].astype(int) - 1
calendar_day = pd.to_datetime(power_production['Date'])
power_production = (
    power_production
    .assign(
        Hour=zero_based_hour,
        Date=calendar_day,
        Time=calendar_day + pd.to_timedelta(zero_based_hour, unit='h'),
    )
    .set_index('Time')
    .fillna(0.0)  # every remaining missing value becomes 0.0
)

# Persist the combined, time-indexed production data.
power_production.to_pickle("./processed/production_all_src.pkl")

[16 17 10 11 12 13 14 15 18  9 19 20 21  8 22  7  1  2  3  4  5  6 23 24]


In [2]:
# Build a dataset of 'everything'

# Split the combined production frame by source.
solar = power_production.loc[power_production['Source'] == 'Solar']
wind = power_production.loc[power_production['Source'] == 'Wind']

# Export production for all dates covered, regardless of source.
# One KWH column per source, aligned on the Time index; a timestamp that only
# one source reports gets 0.0 for the other.
sol_all_dte = pd.Series(solar['KWH'], name='Solar')
win_all_dte = pd.Series(wind['KWH'], name='Wind')
all_src_all_dte = (
    pd.concat([sol_all_dte, win_all_dte], axis=1)
    # Fill on the frame itself: calling fillna(inplace=True) on a selected
    # column is chained assignment and does not reliably write back.
    .fillna(0.0)
    # Rename columns only. Passing index=str here would turn the timestamp
    # labels into strings and break alignment with the datetime-indexed
    # frames that are concatenated/joined below.
    .rename(columns={"Solar": "Solar_KWH", "Wind": "Wind_KWH"})
)

# Add: Windspeed
wind_speed = pd.read_csv('./raw/windfarm_windspeed.csv').rename(columns={"Date_time": "Time"})
wind_speed['Time'] = pd.to_datetime(wind_speed['Time'], format='%d%b%y:%H:%M:%S')
wind_speed = wind_speed.drop(columns=['Hour', 'Location']).set_index('Time')

# Outer-align wind speed with production on the shared Time index.
all_src_all_dte = pd.concat([all_src_all_dte, wind_speed], axis=1)
all_src_all_dte['Wind_Speed'] = all_src_all_dte['Wind_Speed'].fillna(0.0)



In [3]:
# Add: Solar Angle
s_angle = pd.read_csv('./raw/solararray_solarangle.csv')

# Replace missing values with 0.0 and turn every column into text so the date
# parts can be padded and concatenated. Done on the whole frame: calling
# fillna(inplace=True) on a selected column is chained assignment and does
# not reliably write back.
s_angle = s_angle.fillna(0.0).astype(str)

# Left-pad Month, Day and Hour to two characters so the date string below
# matches the fixed-width format passed to pd.to_datetime.
for part in ('Month', 'Day', 'Hour'):
    s_angle[part] = s_angle[part].str.rjust(2, '0')

# Create a string version of the date (vectorised, no row-wise apply).
s_angle['Time'] = s_angle['Year'] + ':' + s_angle['Month'] + ':' + s_angle['Day']

# Cast the string to a datetime object.
# Note: Adding the hour at the end kept erroring out, so did it the brute
# force way by adding it after as a timedelta.
# NOTE(review): Hour is used as-is here, whereas the production data in the
# first cell is shifted by -1 before building its timestamp - confirm both
# files use the same hour convention.
s_angle['Time'] = pd.to_datetime(s_angle['Time'], format='%Y:%m:%d')
s_angle['Time'] += pd.to_timedelta(s_angle['Hour'].astype(int), unit='h')

# Get rid of columns we do not need and index by timestamp.
drop_these = ['Year', 'Month', 'Day', 'Hour', 'Location']
s_angle = s_angle.drop(columns=drop_these).set_index('Time')

# Keep every production/wind row; attach the solar-angle columns where the
# timestamps match.
all_src_all_dte = all_src_all_dte.join(s_angle, how='left', sort=False)

In [4]:
# Add Weather

weather = pd.read_csv('./raw/solararray_weather.csv')

# Replace missing values with 0.0 and turn every column into text so the date
# parts can be padded and concatenated. Done on the whole frame: calling
# fillna(inplace=True) on a selected column is chained assignment and does
# not reliably write back.
weather = weather.fillna(0.0).astype(str)

# Left-pad Month, Day and Hour to two characters so the date string below
# matches the fixed-width format passed to pd.to_datetime.
for part in ('Month', 'Day', 'Hour'):
    weather[part] = weather[part].str.rjust(2, '0')

# Create a string version of the date (vectorised, no row-wise apply).
weather['Time'] = weather['Year'] + ':' + weather['Month'] + ':' + weather['Day']

# Parse the date, then add the hour as a timedelta.
# NOTE(review): Hour is used as-is here, whereas the production data in the
# first cell is shifted by -1 before building its timestamp - confirm both
# files use the same hour convention.
weather['Time'] = pd.to_datetime(weather['Time'], format='%Y:%m:%d')
weather['Time'] += pd.to_timedelta(weather['Hour'].astype(int), unit='h')

# Index by timestamp and drop the columns we no longer need.
drop_these = ['Year', 'Month', 'Day', 'Hour', 'Location']
weather = weather.set_index('Time').drop(columns=drop_these)

# Columns present on both sides get a suffix naming the site they came from.
all_src_all_dte = all_src_all_dte.join(weather, how='left', lsuffix='_AT_WINDFARM', rsuffix='_AT_SOLARRAY', sort=False)

# Rows without a match are filled with 0.0, then every column (several are
# still text at this point) is converted to a numeric dtype.
all_src_all_dte = all_src_all_dte.fillna(0.0).apply(pd.to_numeric)

# Move Time from the index back to an ordinary column.
all_src_all_dte = all_src_all_dte.reset_index()
print(all_src_all_dte.head())
print(all_src_all_dte.dtypes)

                 Time  Solar_KWH  Wind_KWH  Wind_Speed_AT_WINDFARM  \
0 2011-03-24 19:00:00        0.0       0.0                     4.7   
1 2011-03-24 20:00:00        0.0       0.0                     5.3   
2 2011-03-24 21:00:00        0.0       0.0                     6.4   
3 2011-03-24 22:00:00        0.0       0.0                     6.1   
4 2011-03-24 23:00:00        0.0       0.0                     5.5   

   Solar_Elevation  Cloud_Cover_Fraction  Dew_Point  Humidity_Fraction  \
0        19.498113                   0.0      -12.8             0.4238   
1         8.891553                   0.0      -11.7             0.5215   
2        -1.942140                   0.0      -11.7             0.5664   
3       -12.634295                   0.0      -12.8             0.5183   
4       -22.788566                   0.0      -13.9             0.5645   

   Precipitation  Pressure  Temperature  Visibility  Wind_Speed_AT_SOLARRAY  
0            0.0     994.9         -1.7      16.093     

In [5]:
# Index the frame by timestamp (the Time column itself is kept) and put the
# rows in chronological order.
all_src_all_dte = (
    all_src_all_dte
    .set_index(pd.DatetimeIndex(all_src_all_dte['Time']))
    .sort_index()
)
all_src_all_dte.head()

Unnamed: 0_level_0,Time,Solar_KWH,Wind_KWH,Wind_Speed_AT_WINDFARM,Solar_Elevation,Cloud_Cover_Fraction,Dew_Point,Humidity_Fraction,Precipitation,Pressure,Temperature,Visibility,Wind_Speed_AT_SOLARRAY
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2010-01-04 15:00:00,2010-01-04 15:00:00,14186.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-04 16:00:00,2010-01-04 16:00:00,5898.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-05 09:00:00,2010-01-05 09:00:00,12421.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-05 10:00:00,2010-01-05 10:00:00,14210.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010-01-05 11:00:00,2010-01-05 11:00:00,14723.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Save the merged, time-indexed frame for further analysis.
all_src_all_dte.to_pickle(path="./processed/production_all_dates_and_variables.pkl")

In [7]:
# Columns to summarise and chart. Each name is wrapped in a one-element list:
# the later cells use `field[0]` as the label and `field` as a column selector.
fields = [[column] for column in (
    'Solar_KWH',
    'Wind_KWH',
    'Wind_Speed_AT_WINDFARM',
    'Solar_Elevation',
    'Cloud_Cover_Fraction',
    'Dew_Point',
    'Humidity_Fraction',
    'Precipitation',
    'Pressure',
    'Temperature',
    'Visibility',
)]

In [None]:
# Print summary statistics for each charted variable.
for (column,) in fields:
    summary = all_src_all_dte[[column]].describe()
    print("{}: \n{}".format(column, summary))

In [8]:
# Chart every variable as a time series, save each chart as a PNG under
# ./Images, and show it inline.
chart_dims = (16, 4)
sns.set()  # theme is loop-invariant, so apply it once

for field in fields:
    name = field[0]
    print("Plotting {}...".format(name))
    fig, ax = plt.subplots(figsize=chart_dims)
    sns.lineplot(data=all_src_all_dte[name],
                 ax=ax,
                 linewidth=1,
                 )
    # Thousands separators, no decimals.
    # NOTE(review): for the *_Fraction columns this rounds the tick labels to
    # whole numbers - consider a format with decimals for those.
    ax.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))
    ax.set_title(name)
    print("Saving {}...".format(name))
    fig.savefig("./Images/{}.png".format(name))
    # Display the finished chart, then release the figure. Clearing it with
    # clf() instead left an empty figure open for every variable, which the
    # notebook rendered as "<Figure ... with 0 Axes>".
    plt.show()
    plt.close(fig)

Plotting Solar_KWH...


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


Saving Solar_KWH...
Plotting Wind_KWH...
Saving Wind_KWH...
Plotting Wind_Speed_AT_WINDFARM...
Saving Wind_Speed_AT_WINDFARM...
Plotting Solar_Elevation...
Saving Solar_Elevation...
Plotting Cloud_Cover_Fraction...
Saving Cloud_Cover_Fraction...
Plotting Dew_Point...
Saving Dew_Point...
Plotting Humidity_Fraction...
Saving Humidity_Fraction...
Plotting Precipitation...
Saving Precipitation...
Plotting Pressure...
Saving Pressure...
Plotting Temperature...
Saving Temperature...
Plotting Visibility...
Saving Visibility...


<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>

<Figure size 1152x288 with 0 Axes>