In [5]:
import pandas as pd

We want to read in the site_information.json metadata file to get information about all of the sites, including names, locations, and timezones. We'll keep this in a dictionary site_information. 

In [33]:
import json
with open('site_information.json', 'r') as file:
    site_information = json.load(file)
site_information

{'sites': {'count': 14,
  'site': [{'id': 103941,
    'name': 'Alan Knudson House',
    'accountId': 53921,
    'status': 'Active',
    'peakPower': 10.36,
    'lastUpdateTime': '2020-01-21',
    'currency': 'USD',
    'installationDate': '2015-01-31',
    'ptoDate': None,
    'notes': '',
    'type': 'Optimizers & Inverters',
    'location': {'country': 'United States',
     'state': 'Arizona',
     'city': 'Centennial Park',
     'address': 'Taylor Court 1745',
     'address2': '',
     'zip': '86021',
     'timeZone': 'America/Phoenix',
     'countryCode': 'US',
     'stateCode': 'AZ'},
    'alertQuantity': 0,
    'primaryModule': {'manufacturerName': 'RENESOLA',
     'modelName': 'SE11400A-US',
     'maximumPower': 310.0,
     'temperatureCoef': -0.4},
    'uris': {'PUBLIC_URL': 'https://monitoring.solaredge.com/solaredge-web/p/public?name=Alan Knudson Canopy',
     'DATA_PERIOD': '/site/103941/dataPeriod',
     'DETAILS': '/site/103941/details',
     'OVERVIEW': '/site/103941/over

Read in both data frames to get an idea of what we need to combine. We'll start first with the production data

In [10]:
import os

def read_production_data(house):
    path = f'data/{house}'
    dfs = []
#     production_df = pd.read_csv(f'data/{house}')
    for subdir, dir, files in os.walk(path):
        for file in files:
            file_path = os.path.join(subdir, file)
            df = pd.read_csv(file_path)
            dfs.append(df)
    production_df = pd.concat(dfs)
    return production_df

In [34]:
site = site_information['sites']['site'][0]
site

{'id': 103941,
 'name': 'Alan Knudson House',
 'accountId': 53921,
 'status': 'Active',
 'peakPower': 10.36,
 'lastUpdateTime': '2020-01-21',
 'currency': 'USD',
 'installationDate': '2015-01-31',
 'ptoDate': None,
 'notes': '',
 'type': 'Optimizers & Inverters',
 'location': {'country': 'United States',
  'state': 'Arizona',
  'city': 'Centennial Park',
  'address': 'Taylor Court 1745',
  'address2': '',
  'zip': '86021',
  'timeZone': 'America/Phoenix',
  'countryCode': 'US',
  'stateCode': 'AZ'},
 'alertQuantity': 0,
 'primaryModule': {'manufacturerName': 'RENESOLA',
  'modelName': 'SE11400A-US',
  'maximumPower': 310.0,
  'temperatureCoef': -0.4},
 'uris': {'PUBLIC_URL': 'https://monitoring.solaredge.com/solaredge-web/p/public?name=Alan Knudson Canopy',
  'DATA_PERIOD': '/site/103941/dataPeriod',
  'DETAILS': '/site/103941/details',
  'OVERVIEW': '/site/103941/overview'},
 'publicSettings': {'name': 'Alan Knudson Canopy', 'isPublic': True}}

In [40]:
production_df = read_production_data(site['name'])
production_df

Unnamed: 0,date,value
0,2018-02-01 00:00:00,
1,2018-02-01 00:15:00,
2,2018-02-01 00:30:00,
3,2018-02-01 00:45:00,
4,2018-02-01 01:00:00,
...,...,...
2875,2019-06-30 22:45:00,
2876,2019-06-30 23:00:00,
2877,2019-06-30 23:15:00,
2878,2019-06-30 23:30:00,


We want to get rid of all values that report a NaN. We will probably eventually discard all values that within a few hours of sunrise and sunset

In [43]:
# remove null values (NaN)
production_df_c = production_df.dropna(how='any', axis=0)
# remove 0 values
production_df_c = production_df[(production_df != 0).all(1)]
production_df_c

Unnamed: 0,date,value
30,2018-02-01 07:30:00,2.0
31,2018-02-01 07:45:00,35.0
32,2018-02-01 08:00:00,90.0
33,2018-02-01 08:15:00,211.0
34,2018-02-01 08:30:00,376.0
...,...,...
2858,2019-06-30 18:30:00,167.0
2859,2019-06-30 18:45:00,76.0
2860,2019-06-30 19:00:00,38.0
2861,2019-06-30 19:15:00,21.0


Because the historical weather data only comes in 15 minute increments, we'll want to resample the production data to be every hour. First we'll want to set the index of our data frame to be the date. We'll have to set the 'date' column to be a datetime object

In [52]:
production_df_t = production_df_c.copy()
production_df_t['date'] = pd.to_datetime(production_df_c['date'])
production_df_t = production_df_t.set_index('date')
production_df_t = production_df_t.tz_localize(site['location']['timeZone'])
production_df_t.head()

Unnamed: 0_level_0,value
date,Unnamed: 1_level_1
2018-02-01 07:30:00-07:00,2.0
2018-02-01 07:45:00-07:00,35.0
2018-02-01 08:00:00-07:00,90.0
2018-02-01 08:15:00-07:00,211.0
2018-02-01 08:30:00-07:00,376.0


Now we can do the resampling

In [47]:
production_df_r = production_df_t.resample('1H').sum()
production_df_r

Unnamed: 0_level_0,value
date,Unnamed: 1_level_1
2015-02-09 13:00:00-07:00,1213.25390
2015-02-09 14:00:00-07:00,6966.02480
2015-02-09 15:00:00-07:00,5802.51400
2015-02-09 16:00:00-07:00,3979.07103
2015-02-09 17:00:00-07:00,1513.89170
...,...
2020-01-21 13:00:00-07:00,2241.00000
2020-01-21 14:00:00-07:00,2123.00000
2020-01-21 15:00:00-07:00,984.00000
2020-01-21 16:00:00-07:00,513.00000


Now we can read the historical weather data in for the same site

In [48]:
def read_weather_data(house):
    path = f'weather_data/{house}'
    weather_df = pd.read_csv(os.path.join(path, 'weather_data_hourly.csv'))
    return weather_df

In [51]:
weather_df = read_weather_data(site['name'])
weather_df

Unnamed: 0,date,precipIntensity,precipProbability,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,windBearing,cloudCover,uvIndex,visibility
0,2015-01-31 00:00:00,0.0053,0.75,39.47,37.12,37.41,0.92,,3.58,34.0,1.00,0.0,9.997
1,2015-01-31 01:00:00,0.0004,0.31,41.09,38.63,37.40,0.87,,3.94,68.0,1.00,0.0,9.997
2,2015-01-31 02:00:00,0.0000,0.00,39.62,39.62,38.09,0.94,,2.94,44.0,1.00,0.0,9.997
3,2015-01-31 03:00:00,0.0000,0.00,39.40,37.45,37.75,0.94,,3.20,72.0,1.00,0.0,9.997
4,2015-01-31 04:00:00,0.0000,0.00,39.00,35.66,39.00,1.00,,4.56,41.0,1.00,0.0,9.997
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43574,2020-01-20 19:00:00,0.0014,0.01,43.24,36.28,22.29,0.43,1021.1,13.99,70.0,0.79,0.0,10.000
43575,2020-01-20 20:00:00,0.0014,0.01,42.29,35.59,22.37,0.45,1021.4,12.38,70.0,0.85,0.0,10.000
43576,2020-01-20 21:00:00,0.0002,0.01,42.25,35.67,23.28,0.47,1021.3,12.02,71.0,0.98,0.0,10.000
43577,2020-01-20 22:00:00,0.0000,0.00,41.98,35.35,23.99,0.49,1021.1,11.97,72.0,0.93,0.0,10.000


The time appears to be in the same timezone as the production data. For now, I am going to assume that that timezone is correct, and use it to merge the data. We'll want to convert the timezone and set it as the index as we did before

In [55]:
weather_df_t = weather_df.copy()
weather_df_t['date'] = pd.to_datetime(weather_df['date'])
weather_df_t = weather_df_t.set_index('date')
weather_df_t = weather_df_t.tz_localize(site['location']['timeZone'])
weather_df_t.head()

Unnamed: 0_level_0,precipIntensity,precipProbability,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,windBearing,cloudCover,uvIndex,visibility
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-31 00:00:00-07:00,0.0053,0.75,39.47,37.12,37.41,0.92,,3.58,34.0,1.0,0.0,9.997
2015-01-31 01:00:00-07:00,0.0004,0.31,41.09,38.63,37.4,0.87,,3.94,68.0,1.0,0.0,9.997
2015-01-31 02:00:00-07:00,0.0,0.0,39.62,39.62,38.09,0.94,,2.94,44.0,1.0,0.0,9.997
2015-01-31 03:00:00-07:00,0.0,0.0,39.4,37.45,37.75,0.94,,3.2,72.0,1.0,0.0,9.997
2015-01-31 04:00:00-07:00,0.0,0.0,39.0,35.66,39.0,1.0,,4.56,41.0,1.0,0.0,9.997


we can merge the dataframes on their common index to get a combined df

In [59]:
combined_df = weather_df_t.merge(production_df_t, left_index=True, right_index=True, how='inner')
combined_df = combined_df.rename(columns={'value': 'production'})
combined_df

Unnamed: 0_level_0,precipIntensity,precipProbability,temperature,apparentTemperature,dewPoint,humidity,pressure,windSpeed,windBearing,cloudCover,uvIndex,visibility,production
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-02-09 14:00:00-07:00,0.0000,0.00,65.76,65.76,31.81,0.28,,10.85,218.0,0.00,4.0,9.997,1813.9670
2015-02-09 15:00:00-07:00,0.0000,0.00,66.65,66.65,27.21,0.23,,10.98,230.0,0.00,2.0,9.997,1582.5759
2015-02-09 16:00:00-07:00,0.0000,0.00,68.40,68.40,26.36,0.21,,11.05,250.0,0.00,1.0,9.997,1198.3170
2015-02-09 17:00:00-07:00,0.0000,0.00,66.97,66.97,29.54,0.25,,8.96,248.0,0.00,0.0,9.997,627.7450
2015-02-09 18:00:00-07:00,0.0000,0.00,63.22,63.22,32.18,0.31,,7.68,253.0,0.00,0.0,9.997,25.5408
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-01-20 13:00:00-07:00,0.0000,0.00,49.37,43.93,15.50,0.26,1023.1,14.61,72.0,0.51,3.0,10.000,631.0000
2020-01-20 14:00:00-07:00,0.0007,0.01,51.21,51.21,14.57,0.23,1022.0,12.58,75.0,0.64,3.0,10.000,182.0000
2020-01-20 15:00:00-07:00,0.0000,0.00,50.14,50.14,16.30,0.26,1021.6,14.27,71.0,0.70,2.0,10.000,505.0000
2020-01-20 16:00:00-07:00,0.0000,0.00,48.31,43.21,18.91,0.31,1021.2,12.27,69.0,0.67,1.0,10.000,251.0000


In [60]:
combined_df.to_csv("data/test_combination.csv")