In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

%matplotlib inline
sns.set()

""" 
DSC672
Team: John Matune, Mike Yacobucci, Steve Rummel
Project: Energy Consumption and Renewable Energy
Purpose: 

Import solar and wind data from the relevant source files, 
convert string-format date and time into valid datetime values,
remove unneeded columns, normalize column headers, and
export to canonical data source for further analysis.
"""

solar_prod = pd.read_csv('./raw/solararray_production.csv')
wind_prod = pd.read_csv('./raw/windfarm_production.csv')

solar_prod.rename(index=str, columns={"Electricity_KW_HR": "Solar_KWH"}, inplace=True)
solar_prod['Source'] = 'Solar'
wind_prod.rename(index=str, columns={"Electricity_KW_HR": "Wind_KWH"}, inplace=True)
wind_prod['Source'] = 'Wind'

for item in [solar_prod, wind_prod]:
    item['Hour'] = item['Hour'].astype(int) - 1
    item['Date'] = pd.to_datetime(item['Date'])
    item['Time'] = item['Date'] + pd.to_timedelta(item['Hour'], unit='h')
    item.set_index('Time', inplace=True)
    #item.fillna(0.0, inplace=True)
    item.drop(['Hour'], axis=1, inplace=True)
    item.drop(['Date'], axis=1, inplace=True)
    item.drop(['Source'], axis=1, inplace=True)

power_production = wind_prod.join(solar_prod, how='outer', sort=False)

#power_production.fillna(0.0, inplace=True)

test_pivot = power_production.groupby(power_production.index).size().reset_index(name='Count')
print(test_pivot.loc[test_pivot['Count'] > 1])

power_production.to_pickle("./processed/production_all_src.pkl")

Empty DataFrame
Columns: [Time, Count]
Index: []


In [2]:
# Build a dataset of 'everything'

#solar = power_production.loc[power_production['Source'] == 'Solar']
#wind = power_production.loc[power_production['Source'] == 'Wind']

# Export production for all dates covered, regardless of source.
#sol_all_dte = pd.Series(solar['KWH'], name='Solar')
#win_all_dte = pd.Series(wind['KWH'], name='Wind')
#all_src_all_dte = pd.concat([sol_all_dte, win_all_dte], axis=1)
#all_src_all_dte['Solar'].fillna(0.0, inplace=True)
#all_src_all_dte['Wind'].fillna(0.0, inplace=True)

# Add: Windspeed
wind_speed = pd.read_csv('./raw/windfarm_windspeed.csv')
wind_speed.rename(index=str, columns={"Date_time": "Time"}, inplace=True)
wind_speed['Time'] = pd.to_datetime(wind_speed['Time'], format='%d%b%y:%H:%M:%S')
wind_speed.drop(['Hour'], axis=1, inplace=True)
wind_speed.drop(['Location'], axis=1, inplace=True)
wind_speed.set_index('Time', inplace=True)

all_src_all_dte = power_production.join(wind_speed, how='left', sort=False)
#all_src_all_dte['Wind_Speed'].fillna(0.0, inplace=True)

#print(all_src_all_dte.head())

test_pivot = all_src_all_dte.groupby(all_src_all_dte.index).size().reset_index(name='Count')
print(test_pivot.loc[test_pivot['Count'] > 1])

Empty DataFrame
Columns: [Time, Count]
Index: []


In [3]:
# Add: Solar Angle
s_angle = pd.read_csv('./raw/solararray_solarangle_ETL.csv')
#s_angle['Date'] = pd.to_datetime(s_angle['Date'])

for field in list(s_angle):
    #s_angle[field].fillna(0.0, inplace=True)
    s_angle[field] = s_angle[field].astype(str)
    
# Pad out the Month, Day and Hour values because .to_datetime wont processed unpadded integers.
s_angle['Month'] = s_angle['Month'].apply('{:0>2}'.format)
s_angle['Day'] = s_angle['Day'].apply('{:0>2}'.format)
s_angle['Hour'] = s_angle['Hour'].apply('{:0>2}'.format)

# Create a string version of  timestamp.
s_angle['Time'] = s_angle[['Year', 'Month', 'Day']].apply(lambda x: ':'.join(x), axis=1)

# Cast the string to a datetime object.
# Note: Adding the hour at the end kept erroring out, so did it the brute
# force way by adding it after as a timedelta.
s_angle['Time'] = pd.to_datetime(s_angle['Time'], format='%Y:%m:%d')
s_angle['Time'] +=  pd.to_timedelta(s_angle['Hour'].astype(int), unit='h')

# Get rid of columns we do not need.
drop_these = ['Year', 'Month', 'Day', 'Hour', 'Location']
for field in drop_these:
    s_angle.drop([field], axis=1, inplace=True)
    
s_angle.set_index('Time', inplace=True)

test_pivot = s_angle.groupby(s_angle.index).size().reset_index(name='Count')
print(test_pivot.loc[test_pivot['Count'] > 1])

all_src_all_dte = all_src_all_dte.join(s_angle, how='left', sort=False)

test_pivot = all_src_all_dte.groupby(all_src_all_dte.index).size().reset_index(name='Count')
print(test_pivot.loc[test_pivot['Count'] > 1])

Empty DataFrame
Columns: [Time, Count]
Index: []
Empty DataFrame
Columns: [Time, Count]
Index: []


In [4]:
# Add Weather

weather = pd.read_csv('./raw/solararray_weather.csv')

for field in list(weather):
    #weather[field].fillna(0.0, inplace=True)
    weather[field] = weather[field].astype(str)
    
# Pad out the Month, Day and Hour values because .to_datetime wont processed unpadded integers.
weather['Month'] = weather['Month'].apply('{:0>2}'.format)
weather['Day'] = weather['Day'].apply('{:0>2}'.format)
weather['Hour'] = weather['Hour'].apply('{:0>2}'.format)

# Create a string version of  timestamp.
weather['Time'] = weather[['Year', 'Month', 'Day']].apply(lambda x: ':'.join(x), axis=1)

weather['Time'] = pd.to_datetime(weather['Time'], format='%Y:%m:%d')
weather['Time'] +=  pd.to_timedelta(weather['Hour'].astype(int), unit='h')
weather.set_index('Time', inplace=True)

drop_these = ['Year', 'Month', 'Day', 'Hour', 'Location']
for field in drop_these:
    weather.drop([field], axis=1, inplace=True)

all_src_all_dte = all_src_all_dte.join(weather, how='left', lsuffix='_AT_WINDFARM', rsuffix='_AT_SOLARRAY', sort=False)

#for field in list(all_src_all_dte):
    #all_src_all_dte[field].fillna(0.0, inplace=True)
    #all_src_all_dte[field] = pd.to_numeric(all_src_all_dte[field])

all_src_all_dte.reset_index(inplace=True)

test_pivot = all_src_all_dte.groupby(all_src_all_dte.index).size().reset_index(name='Count')
print(test_pivot.loc[test_pivot['Count'] > 1])    
all_src_all_dte.head()

ValueError: Unable to parse string "nan" at position 1817

In [None]:
#Add a Timeseries index to the DataFrame
all_src_all_dte.index = pd.DatetimeIndex(all_src_all_dte.Time)
#all_src_all_dte.sort_index(inplace=True)
test_pivot = all_src_all_dte.groupby(all_src_all_dte.index).size().reset_index(name='Count')
print(test_pivot.loc[test_pivot['Count'] > 1])    
all_src_all_dte.head()

In [None]:
test_pivot = all_src_all_dte.groupby(all_src_all_dte.index).size().reset_index(name='Count')
print(test_pivot.loc[test_pivot['Count'] > 1])

all_src_all_dte.to_pickle("./processed/production_all_dates_and_variables.pkl")

In [None]:
fields = [['Solar_KWH'],
          ['Wind_KWH'],
          ['Wind_Speed_AT_WINDFARM'],
          ['Solar_Elevation'], 
          ['Cloud_Cover_Fraction'],
          ['Dew_Point'], 
          ['Humidity_Fraction'],
          ['Precipitation'],
          ['Pressure'], 
          ['Temperature'], 
          ['Visibility']
         ]

In [None]:
for field in fields:
    print("{}: \n{}".format(field[0], all_src_all_dte[field].describe()))

In [None]:
# Print out some nice charts of our variables.
for field in fields:
    print("Plotting {}...".format(field[0]))
    chart_dims = (16, 4)
    fig, ax = plt.subplots(figsize=chart_dims)
    sns.set()
    snplot = sns.lineplot(data=all_src_all_dte[field[0]],
                          ax=ax,
                          linewidth=1,
                          );
    snplot.yaxis.set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'));
    snplot.set_title(field[0]);
    print("Saving {}...".format(field[0]))
    #snplot.savefig("{}.png".format(field[0]));
    exp = snplot.get_figure()
    exp.savefig("./Images/{}.png".format(field[0]))
    fig.clf()
    exp.clf()