In [6]:
# Put the sibling DataCleaning directory at the front of the import path so
# that the project-local train_test_split module below can be found.
import sys 
sys.path.insert(0, '../DataCleaning')

# Project-local data-preparation helpers, referred to as `tts` in later cells.
import train_test_split as tts

In [7]:
# Re-execute train_test_split so that edits made to the module after the first
# import are picked up without restarting the kernel.
import importlib
importlib.reload(tts)

<module 'train_test_split' from '../DataCleaning/train_test_split.py'>

In [8]:
# Root of the data tree, relative to the notebook's working directory. Later
# cells build the production, irradiance and panel-architecture paths from it.
DATA_DIR = '../../data'

In [9]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

In [10]:
import json

# Load the per-site metadata. Later cells iterate over the entries and read
# each one's 'id', 'irradiance_site_id' and location time zone.
# The path is built from DATA_DIR (instead of repeating the literal) so that
# moving the data directory only requires changing that one constant.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default.
with open(f"{DATA_DIR}/production_data/site_metadata.json", 'r', encoding='utf-8') as file:
    site_md = json.load(file)

In [11]:
from os.path import exists

# site_id -> (X, Y) exactly as returned by tts.get_irradiance_WPI_data.
# Later cells unpack each entry as ([X_train, X_test], [Y_train, Y_test]).
splits = {}

# Warning: this takes a while because the join operation is implemented
# inefficiently (roughly n^2 in the worst case).
for site in site_md:
    site_id = site['id']
    irradiance_site_id = site['irradiance_site_id']
    tz_str = site['location']['timeZone']

    path_production = f"{DATA_DIR}/production_data/{site_id}/combination_data/production_weather_combination.csv"
    path_irradiance = f"{DATA_DIR}/irradiance_data/{irradiance_site_id}/irradiance_data.csv"

    if exists(path_irradiance):
        # NOTE(review): the positional 4 and 3 are passed straight through; their
        # meaning is defined in train_test_split -- confirm there.
        # title_row is overwritten on every loaded site, so it ends up holding
        # the value returned for the last site that was loaded.
        X, Y, title_row = tts.get_irradiance_WPI_data(path_production, path_irradiance, 4, 3, tz_str)
        splits[site_id] = (X, Y)
    else:
        # No irradiance file for this site: report the id pair and leave the
        # site out of `splits`.
        print(site_id, irradiance_site_id)

1232644 116439


In [12]:
architecture_file = f"{DATA_DIR}/panel_architecture.csv"
site_areas = tts.get_total_area(architecture_file)

# Add each site's area value as an extra site-level feature on both the train
# and test matrices (presumably one new column per value -- confirm in
# tts.append_site_features).
for site_id, area in site_areas.items():
    if site_id in splits:
        ([X_train, X_test], [Y_train, Y_test]) = splits[site_id]
        X_train = tts.append_site_features(X_train, [area])
        X_test = tts.append_site_features(X_test, [area])
        splits[site_id] = ([X_train, X_test], [Y_train, Y_test])

# A site that is in `splits` but has no entry in `site_areas` keeps its original
# feature set. That mismatch would otherwise only surface later, as a shape
# error when the per-site matrices are stacked or passed to a fitted model, so
# report it here where the cause is obvious.
sites_without_area = [sid for sid in splits if sid not in site_areas]
if sites_without_area:
    print("WARNING: no panel area found for sites:", sites_without_area)

# Keep the feature-name list in step with the new feature.
# NOTE: re-running this cell without re-running the loading cell appends the
# area feature (and this name) a second time.
title_row.append("area")

In [13]:
# Show the feature names and how many there are, as a sanity check on the
# column layout of the feature matrices.
print(title_row, len(title_row))

['day', 'hour', 'precipIntensity', 'precipProbability', 'precipAccumulation', 'precipType_', 'precipType_rain', 'precipType_snow', 'precipType_sleet', 'temperature', 'apparentTemperature', 'dewPoint', 'humidity', 'pressure', 'windSpeed', 'windBearing', 'windGust', 'cloudCover', 'uvIndex', 'visibility', 'ozone', 'GHI', 'DHI', 'DNI', 'area'] 25


First, we'll do "leave-one-out" transfer learning: we train on all but one site and test on the site that was left out. This is true transfer learning, because the model never sees any data from the test site.

In [14]:
# Load the previously saved per-site results table and index it by site id.
# The ids are cast to str so they match the string keys used when new result
# columns are added later in the notebook.
ir_results = (
    pd.read_csv('Irradiance_results.csv')
    .astype({'site_id': str})
    .set_index('site_id')
)
ir_results

Unnamed: 0_level_0,irradiance_only,irr_time,weather_only,weather_time,weather_irr,all
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
103941,246.534541,225.476615,443.820703,333.608607,230.734887,214.282454
787197,458.741006,441.875406,676.095814,493.18983,420.689423,414.346982
238320,406.768778,400.295024,215.158797,194.184444,180.539331,179.062503
349060,195.15302,195.418662,366.227301,277.764976,173.514858,172.418634
477834,185.225738,182.273051,180.990243,165.603927,146.007381,141.265141
641826,1415.125092,1356.645292,1400.275253,1340.890427,1129.709809,1074.484228
896164,169.821445,173.107516,213.166462,209.297813,165.401586,166.997995
717193,2452.825849,2423.531521,2755.74348,2454.798066,2235.588756,2228.444875
627759,661.845473,562.320608,1158.978721,830.920466,604.447385,553.548312
569932,1144.587302,1107.587695,912.295012,824.118052,742.778756,714.950626


In [15]:
def combine_site_data(splits,test_site_id="", combine_train_test=False):
    """Stack the per-site splits into one feature matrix and one target vector.

    Parameters
    ----------
    splits : dict
        Maps site id -> ([X_train, X_test], [Y_train, Y_test]).
    test_site_id : optional
        Id of a site to leave out entirely. The default "" (or None) means
        no site is excluded.
    combine_train_test : bool
        If True, each included site contributes its test split as well as
        its training split.

    Returns
    -------
    (X_complete, Y_complete)
        The selected X arrays stacked row-wise with np.vstack and the selected
        Y arrays concatenated with np.hstack, in the iteration order of `splits`.

    Raises
    ------
    ValueError
        If no site contributes any data (empty `splits`, or the only site is
        the excluded one).
    """
    # Compare against the "nothing held out" sentinels explicitly rather than
    # relying on truthiness, so a falsy but valid id such as 0 is still excluded.
    exclude_site = test_site_id is not None and test_site_id != ""

    X_complete = []
    Y_complete = []
    for site_id, data in splits.items():
        if exclude_site and test_site_id == site_id:
            continue

        ([X_train, X_test], [Y_train, Y_test]) = data

        X_complete.append(X_train)
        Y_complete.append(Y_train)

        if combine_train_test:
            X_complete.append(X_test)
            Y_complete.append(Y_test)

    if not X_complete:
        # np.vstack would also raise here, but with a message that does not
        # point at the actual cause.
        raise ValueError(
            "combine_site_data: no site data left to combine "
            f"(test_site_id={test_site_id!r}, number of sites={len(splits)})"
        )

    return np.vstack(X_complete), np.hstack(Y_complete)

In [19]:
# Leave-one-out evaluation: for every site, fit on the training splits of all
# other sites and score on the held-out site's own test split.
result_data = {}

for test_site_id, test_data in splits.items():
    X_train, Y_train = combine_site_data(splits, test_site_id)
    ([_, X_test], [_, Y_test]) = test_data

    model = LinearRegression().fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # RMSE, keyed by the string form of the id to match ir_results' index.
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    result_data[str(test_site_id)] = rmse

result_series = pd.Series(result_data)
# Column assignment aligns on the index, so sites that are not already rows of
# ir_results do not appear in the table.
ir_results['t_loo'] = result_series
print(result_series)

103941    1408.297236
787197    1139.275025
238320    1216.435500
349060    1063.147773
477834    1084.975457
641826    1892.605357
896164     977.995193
717193    3167.109357
627759     987.753050
569932    1280.391308
466851     803.562978
256177    4347.582603
505347    6092.258390
dtype: float64


In [18]:
# See if more data helps: same leave-one-out setup, but the other sites
# contribute their test splits as well as their training splits.
result_data = {}

for test_site_id, test_data in splits.items():
    X_train, Y_train = combine_site_data(
        splits,
        test_site_id,
        combine_train_test=True,
    )
    ([_, X_test], [_, Y_test]) = test_data

    model = LinearRegression().fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))
    result_data[str(test_site_id)] = rmse

# Only printed for comparison; this variant is not stored in ir_results.
result_series = pd.Series(result_data)
print(result_series)


103941    1402.697293
787197    1149.726414
238320    1212.117695
349060    1056.698826
477834    1085.023808
641826    1890.250471
896164     985.566868
717193    3165.981571
627759     986.380159
569932    1278.676657
466851     806.291419
256177    4361.202753
505347    6092.422519
dtype: float64


Now we'll see if mixing together all the data (including the training data for the site itself!) helps us fit a better model.

In [21]:
# Integrated model: fit once on the training splits of every site (the test
# site's own training data included), then score that single model on each
# site's test split.
result_data = {}

X_train, Y_train = combine_site_data(splits)

# The training set is the same for every test site, so the model is fitted
# once here instead of being refitted on identical data inside the loop.
model = LinearRegression()
model.fit(X_train, Y_train)

for test_site_id, test_data in splits.items():
    ([_, X_test], [_, Y_test]) = test_data

    Y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test, Y_pred))

    # Keyed by the string form of the id to match ir_results' index.
    result_data[str(test_site_id)] = rmse

result_series = pd.Series(result_data)
print(result_series)
# Column assignment aligns on the index, so sites that are not already rows of
# ir_results do not appear in the table.
ir_results['t_integrated'] = result_series

103941     962.852813
787197    1051.837168
238320    1046.116332
349060     906.907098
477834    1027.294537
641826    1740.671072
896164     936.944873
717193    3074.459545
627759     943.862437
569932    1185.081394
466851     777.267431
256177    2328.248795
505347    5290.500365
dtype: float64


In [22]:
# Display the results table with the transfer-learning columns (t_loo and
# t_integrated) next to the per-site baselines.
ir_results

Unnamed: 0_level_0,irradiance_only,irr_time,weather_only,weather_time,weather_irr,all,t_loo,t_integrated
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
103941,246.534541,225.476615,443.820703,333.608607,230.734887,214.282454,1408.297236,962.852813
787197,458.741006,441.875406,676.095814,493.18983,420.689423,414.346982,1139.275025,1051.837168
238320,406.768778,400.295024,215.158797,194.184444,180.539331,179.062503,1216.4355,1046.116332
349060,195.15302,195.418662,366.227301,277.764976,173.514858,172.418634,1063.147773,906.907098
477834,185.225738,182.273051,180.990243,165.603927,146.007381,141.265141,1084.975457,1027.294537
641826,1415.125092,1356.645292,1400.275253,1340.890427,1129.709809,1074.484228,1892.605357,1740.671072
896164,169.821445,173.107516,213.166462,209.297813,165.401586,166.997995,977.995193,936.944873
717193,2452.825849,2423.531521,2755.74348,2454.798066,2235.588756,2228.444875,3167.109357,3074.459545
627759,661.845473,562.320608,1158.978721,830.920466,604.447385,553.548312,987.75305,943.862437
569932,1144.587302,1107.587695,912.295012,824.118052,742.778756,714.950626,1280.391308,1185.081394


In [24]:
# Persist the table, including the columns added in this notebook.
# NOTE: this overwrites the same 'Irradiance_results.csv' that was read
# earlier, so a later run of the notebook starts from the updated file.
ir_results.to_csv('Irradiance_results.csv')