In [2]:
import pandas as pd

We want to read in the site_information.json metadata file to get information about all of the sites, including names, locations, and timezones. We'll keep this in a dictionary site_information. 

# Experiment

In [3]:
import json
with open('site_information.json', 'r') as file:
    site_information = json.load(file)
site_information

{'sites': {'count': 14,
  'site': [{'id': 103941,
    'name': 'Alan Knudson House',
    'accountId': 53921,
    'status': 'Active',
    'peakPower': 10.36,
    'lastUpdateTime': '2020-01-21',
    'currency': 'USD',
    'installationDate': '2015-01-31',
    'ptoDate': None,
    'notes': '',
    'type': 'Optimizers & Inverters',
    'location': {'country': 'United States',
     'state': 'Arizona',
     'city': 'Centennial Park',
     'address': 'Taylor Court 1745',
     'address2': '',
     'zip': '86021',
     'timeZone': 'America/Phoenix',
     'countryCode': 'US',
     'stateCode': 'AZ'},
    'alertQuantity': 0,
    'primaryModule': {'manufacturerName': 'RENESOLA',
     'modelName': 'SE11400A-US',
     'maximumPower': 310.0,
     'temperatureCoef': -0.4},
    'uris': {'PUBLIC_URL': 'https://monitoring.solaredge.com/solaredge-web/p/public?name=Alan Knudson Canopy',
     'DATA_PERIOD': '/site/103941/dataPeriod',
     'DETAILS': '/site/103941/details',
     'OVERVIEW': '/site/103941/over

Read in both data frames to get an idea of what we need to combine. We'll start first with the production data

In [4]:
import os

def read_production_data(house):
    path = f'data/{house}'
    dfs = []
#     production_df = pd.read_csv(f'data/{house}')
    for subdir, dir, files in os.walk(path):
        for file in files:
            file_path = os.path.join(subdir, file)
            df = pd.read_csv(file_path)
            dfs.append(df)
    production_df = pd.concat(dfs)
    return production_df

Need to construct a generic function that will combine the data for any site
For now, I'm going to store all of the combined data into multiple different sets
This data needs to be the final data that we are going to run our regression model on
This data also needs to be fully cleaned before we move it to regression
We will want to experiment with different data to see how the accuracy of the model improves. 
Some data sets might just include all sun hours. Some might just include the most relevant sun hours, whatever that means
We will also need to think about cross validation if we perform any, and how we want to split the data into train, validate, and test. I don't know that we'll need to perform cross validation on the simple model, so we just need to make sure that we get an even split into train and test for the first model run. 

Steps
1. Put all of the code into one function (or class) that will read in data from both sources, combine them together. We will save cleaning for later. Right now, we are going to store as much information as we feel like doing. 
2. Run all of the sites through that algorithm, and save the final file in as an actual CSV in the data folder. 
3. Start a new notebook, do some initial cleaning, and run through a simple linear regression model. 

In [5]:
site = site_information['sites']['site'][0]
site

{'id': 103941,
 'name': 'Alan Knudson House',
 'accountId': 53921,
 'status': 'Active',
 'peakPower': 10.36,
 'lastUpdateTime': '2020-01-21',
 'currency': 'USD',
 'installationDate': '2015-01-31',
 'ptoDate': None,
 'notes': '',
 'type': 'Optimizers & Inverters',
 'location': {'country': 'United States',
  'state': 'Arizona',
  'city': 'Centennial Park',
  'address': 'Taylor Court 1745',
  'address2': '',
  'zip': '86021',
  'timeZone': 'America/Phoenix',
  'countryCode': 'US',
  'stateCode': 'AZ'},
 'alertQuantity': 0,
 'primaryModule': {'manufacturerName': 'RENESOLA',
  'modelName': 'SE11400A-US',
  'maximumPower': 310.0,
  'temperatureCoef': -0.4},
 'uris': {'PUBLIC_URL': 'https://monitoring.solaredge.com/solaredge-web/p/public?name=Alan Knudson Canopy',
  'DATA_PERIOD': '/site/103941/dataPeriod',
  'DETAILS': '/site/103941/details',
  'OVERVIEW': '/site/103941/overview'},
 'publicSettings': {'name': 'Alan Knudson Canopy', 'isPublic': True}}

In [6]:
production_df = read_production_data(site['name'])
production_df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if sys.path[0] == '':


Unnamed: 0,apparentTemperature,cloudCover,date,dewPoint,humidity,ozone,precipAccumulation,precipIntensity,precipProbability,precipType,pressure,production,temperature,uvIndex,value,visibility,windBearing,windGust,windSpeed
0,,,2018-02-01 00:00:00,,,,,,,,,,,,,,,,
1,,,2018-02-01 00:15:00,,,,,,,,,,,,,,,,
2,,,2018-02-01 00:30:00,,,,,,,,,,,,,,,,
3,,,2018-02-01 00:45:00,,,,,,,,,,,,,,,,
4,,,2018-02-01 01:00:00,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,,,2019-06-30 22:45:00,,,,,,,,,,,,,,,,
2876,,,2019-06-30 23:00:00,,,,,,,,,,,,,,,,
2877,,,2019-06-30 23:15:00,,,,,,,,,,,,,,,,
2878,,,2019-06-30 23:30:00,,,,,,,,,,,,,,,,


Because the historical weather data only comes in 15 minute increments, we'll want to resample the production data to be every hour. First we'll want to set the index of our data frame to be the date. We'll have to set the 'date' column to be a datetime object

In [7]:
production_df_t = production_df.copy()
production_df_t['date'] = pd.to_datetime(production_df['date'])
production_df_t = production_df_t.set_index('date')
production_df_t = production_df_t.tz_localize(site['location']['timeZone'])
production_df_t.head()

TypeError: index is not a valid DatetimeIndex or PeriodIndex

Now we can do the resampling

In [None]:
production_df_r = production_df_t.resample('1H').sum()
production_df_r

Now we can read the historical weather data in for the same site

In [None]:
def read_weather_data(house):
    path = f'weather_data/{house}'
    weather_df = pd.read_csv(os.path.join(path, 'weather_data_hourly.csv'))
    return weather_df

In [None]:
weather_df = read_weather_data(site['name'])
weather_df

The time appears to be in the same timezone as the production data. For now, I am going to assume that that timezone is correct, and use it to merge the data. We'll want to convert the timezone and set it as the index as we did before

In [None]:
weather_df_t = weather_df.copy()
weather_df_t['date'] = pd.to_datetime(weather_df['date'])
weather_df_t = weather_df_t.set_index('date')
weather_df_t = weather_df_t.tz_localize(site['location']['timeZone'])
weather_df_t.head()

we can merge the dataframes on their common index to get a combined df

In [None]:
combined_df = weather_df_t.merge(production_df_t, left_index=True, right_index=True, how='inner')
combined_df = combined_df.rename(columns={'value': 'production'})
combined_df

In [None]:
combined_df.to_csv("data/test_combination.csv")

# Data Merge

In [8]:
import os

def read_production_data(house):
    path = f'../SolarEdgeDataGathering/data/{house}'
    dfs = []
#     production_df = pd.read_csv(f'data/{house}')
    for subdir, dir, files in os.walk(path):
        for file in files:
            file_path = os.path.join(subdir, file)
            df = pd.read_csv(file_path)
            dfs.append(df)
    production_df = pd.concat(dfs)
    return production_df

def read_weather_data(house):
    path = f'../SolarEdgeDataGathering/weather-data/{house}'
    weather_df = pd.read_csv(os.path.join(path, 'weather_data_hourly.csv'))
    return weather_df

def set_date_index(df, site):
    df_d = df.copy()
    df_d['date'] = pd.to_datetime(df['date'])
    df_d = df_d.set_index('date')
    df_d = df_d.tz_localize(site['location']['timeZone'], ambiguous='NaT', nonexistent='NaT')
    return df_d

def combine_prod_and_weather(site):
    # read production data
    production_df = read_production_data(site['name'])
    
    #set the date index
    production_df_t = set_date_index(production_df, site)
    
    # resmaple production data to hourly
    production_df_r = production_df_t.resample('1H').sum()
    
    # read in weather data
    weather_df = read_weather_data(site['name'])
    
    # set the date index
    weather_df_t = set_date_index(weather_df, site)
    
    # merge datasets
    combined_df = weather_df_t.merge(production_df_t, left_index=True, right_index=True, how='inner')
    combined_df = combined_df.rename(columns={'value': 'production'})
    
    site_id = site['id']
    folder = f'data/{site_id}'
    path = f'{folder}/production_weather_combination.csv'
    if not os.path.exists(path):
        os.mkdir(folder)
    combined_df.to_csv(path)

In [9]:
site_information

{'sites': {'count': 14,
  'site': [{'id': 103941,
    'name': 'Alan Knudson House',
    'accountId': 53921,
    'status': 'Active',
    'peakPower': 10.36,
    'lastUpdateTime': '2020-01-21',
    'currency': 'USD',
    'installationDate': '2015-01-31',
    'ptoDate': None,
    'notes': '',
    'type': 'Optimizers & Inverters',
    'location': {'country': 'United States',
     'state': 'Arizona',
     'city': 'Centennial Park',
     'address': 'Taylor Court 1745',
     'address2': '',
     'zip': '86021',
     'timeZone': 'America/Phoenix',
     'countryCode': 'US',
     'stateCode': 'AZ'},
    'alertQuantity': 0,
    'primaryModule': {'manufacturerName': 'RENESOLA',
     'modelName': 'SE11400A-US',
     'maximumPower': 310.0,
     'temperatureCoef': -0.4},
    'uris': {'PUBLIC_URL': 'https://monitoring.solaredge.com/solaredge-web/p/public?name=Alan Knudson Canopy',
     'DATA_PERIOD': '/site/103941/dataPeriod',
     'DETAILS': '/site/103941/details',
     'OVERVIEW': '/site/103941/over

In [10]:
for site in site_information['sites']['site']:
    combine_prod_and_weather(site)

FileNotFoundError: [Errno 2] File b'../SolarEdgeDataGathering/weather-data/Ronald L. McClellan Sr./weather_data_hourly.csv' does not exist: b'../SolarEdgeDataGathering/weather-data/Ronald L. McClellan Sr./weather_data_hourly.csv'

In [14]:
import shutil

for site in site_information['sites']['site']:
    site_id = site['id']
    name = site['name']
    if name == 'Ronald L. McClellan Sr.':
        name = 'Ronald L. McClellan Sr'
        print(site_id)
    src = f'../SolarEdgeDataGathering/weather-data/{name}'
    dest = f'../SolarEdgeDataGathering/weather-data/{site_id}'
    try:
        shutil.copytree(src, dest)
    except (FileExistsError):
        print("File exists")

File exists
File exists
File exists
File exists
File exists
File exists
File exists
File exists
File exists
File exists
File exists
466851
File exists
File exists
File exists
