In [65]:
import pandas as pd
import numpy as np

""" 
DSC672
Team: Steven Rummel, Ricardo De Leon II, Prabhakaran Raghavan, Sibi Augustin, Tyler Jewell
Project: Energy Consumption and Renewable Energy
Purpose: 

Import solar array data from the relevant source files, 
convert string-format date and time into valid datetime values,
remove unneeded columns, normalize column headers, and
export to canonical data source for further analysis.
"""

# Load the raw solar-angle export and cast every column to str in one step;
# the date-part columns are padded and concatenated as text further down.
solar = pd.read_csv('./raw/solararray_solarangle.csv').astype(str)

In [66]:
# Left-pad Month, Day and Hour with zeros so every date part has a fixed
# two-character width before it is parsed.
for part in ('Month', 'Day', 'Hour'):
    solar[part] = solar[part].str.rjust(2, '0')

# Assemble a "YYYY:MM:DD" string per row and parse it into a datetime column.
date_text = solar['Year'].str.cat([solar['Month'], solar['Day']], sep=':')
solar['Date'] = pd.to_datetime(date_text, format='%Y:%m:%d')

# The hour is applied afterwards as a timedelta instead of being parsed with
# the date (the original author noted that parsing it inline kept erroring out).
hour_offset = pd.to_timedelta(solar['Hour'].astype(int), unit='h')
solar['Date'] = solar['Date'] + hour_offset

# Remove the columns that are not needed downstream.
drop_these = ['Year', 'Month', 'Day', 'Hour', 'Location']
solar.drop(columns=drop_these, inplace=True)

In [67]:
# Persist the processed frame as a pickle for the downstream analysis steps.
solar.to_pickle(path="./processed/solararray_elevation.pkl")