In [1]:
# Put the sibling DataCleaning directory at the front of the module search
# path so the project-local helper module below can be imported.
import sys 
sys.path.insert(0, '../DataCleaning')

# Project-local helper module; referred to as `tts` throughout this notebook.
import train_test_split as tts

In [12]:
# Reload the helper module so that edits made to train_test_split.py are
# picked up without restarting the kernel.
import importlib
importlib.reload(tts)

<module 'train_test_split' from '../DataCleaning\\train_test_split.py'>

In [5]:
# Root of the data tree, relative to this notebook; the production,
# irradiance and panel-architecture file paths below are built from it.
DATA_DIR = '../../data'

In [23]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [6]:
import json

# Load the per-site metadata records. Each record's 'id',
# 'irradiance_site_id' and location 'timeZone' are read later when the
# per-site data is loaded.
# The path is built from DATA_DIR so the data root is configured in one place,
# and the file is decoded as UTF-8 explicitly so the result does not depend on
# the platform's default encoding.
with open(f"{DATA_DIR}/production_data/site_metadata.json", 'r', encoding='utf-8') as file:
    site_md = json.load(file)

In [8]:
from os.path import exists

# Maps site_id -> (X, Y) as returned by tts.get_irradiance_WPI_data.
# Later cells unpack X as [X_train, X_test] and Y as [Y_train, Y_test].
splits = {}

# Warning: this takes a while because the join operation is implemented
# inefficiently (roughly n^2 in the worst case).
for site in site_md:
    site_id = site['id']
    irradiance_site_id = site['irradiance_site_id']
    tz_str = site['location']['timeZone']
    
    path_production = f"{DATA_DIR}/production_data/{site_id}/combination_data/production_weather_combination.csv"
    path_irradiance = f"{DATA_DIR}/irradiance_data/{irradiance_site_id}/irradiance_data.csv"
    
    # Sites without an irradiance file are reported (site id, irradiance site
    # id) and left out of `splits`.
    if not exists(path_irradiance):
        print(site_id,irradiance_site_id)
        continue
    
    # NOTE(review): the meaning of the literal arguments 4 and 3 is defined by
    # tts.get_irradiance_WPI_data and is not visible here -- confirm and
    # consider naming them as constants.
    # `title_row` is rebound on every iteration, so after the loop it holds the
    # value returned for the last site that was processed.
    X,Y,title_row = tts.get_irradiance_WPI_data(path_production,path_irradiance, 4,3,tz_str)
    splits[site_id] = (X,Y)

1232644 116439


In [16]:
# Append each site's total panel area as an extra feature column on both the
# train and the test feature arrays, and record the new column name.
architecture_file = f"{DATA_DIR}/panel_architecture.csv"
site_areas = tts.get_total_area(architecture_file)

# Guard against re-running this cell: without it, every re-execution would
# append the area feature to the arrays in `splits` (and "area" to
# `title_row`) one more time, silently leaving duplicated columns.
if "area" not in title_row:
    for site_id,area in site_areas.items():
        if site_id in splits:
            ([X_train,X_test],[Y_train,Y_test]) = splits[site_id]
            X_train = tts.append_site_features(X_train,[area])
            X_test = tts.append_site_features(X_test,[area])
            splits[site_id] = ([X_train,X_test],[Y_train,Y_test])

    # NOTE(review): a site present in `splits` but absent from `site_areas`
    # keeps its original feature arrays, i.e. it gets no area feature --
    # confirm every loaded site has an area entry.
    title_row.append("area")

dict_keys([103941, 787197, 238320, 349060, 477834, 641826, 896164, 717193, 627759, 569932, 466851, 256177, 505347])


In [26]:
# Show the feature column names and how many there are.
print(title_row, len(title_row))

['day', 'hour', 'precipIntensity', 'precipProbability', 'precipAccumulation', 'precipType_', 'precipType_rain', 'precipType_snow', 'precipType_sleet', 'temperature', 'apparentTemperature', 'dewPoint', 'humidity', 'pressure', 'windSpeed', 'windBearing', 'windGust', 'cloudCover', 'uvIndex', 'visibility', 'ozone', 'GHI', 'DHI', 'DNI', 'area'] 25


First, we'll do "leave-one-out" transfer learning: we train on all but one site and test on the site that was left out. This is true transfer learning.

In [40]:
def combine_site_data(splits,test_site_id="", combine_train_test=False):
    """Stack per-site data into one feature matrix and one target vector.

    Parameters
    ----------
    splits : dict
        Maps site id -> ([X_train, X_test], [Y_train, Y_test]).
    test_site_id : optional
        Id of a site to leave out entirely. The default "" (or None) means no
        site is excluded. Any other value -- including falsy ids such as 0 --
        excludes the site whose key equals it.
    combine_train_test : bool
        If True, each included site's test portion is stacked in as well,
        directly after that site's train portion.

    Returns
    -------
    (X_complete, Y_complete)
        Feature blocks stacked with np.vstack and targets joined with
        np.hstack, in the iteration order of `splits`.

    Raises
    ------
    ValueError
        If no site contributes any data (empty `splits`, or the only site is
        the excluded one).
    """
    # Decide up front whether a hold-out was requested. Testing the id's
    # truthiness instead would make it impossible to hold out a site whose id
    # is falsy (e.g. 0).
    hold_out_requested = test_site_id is not None and test_site_id != ""

    X_parts = []
    Y_parts = []
    for site_id,data in splits.items():
        if hold_out_requested and site_id == test_site_id:
            continue

        ([X_train,X_test],[Y_train,Y_test]) = data

        X_parts.append(X_train)
        Y_parts.append(Y_train)

        if combine_train_test:
            X_parts.append(X_test)
            Y_parts.append(Y_test)

    if not X_parts:
        # np.vstack would raise a ValueError here anyway; raise the same
        # exception type with a message that names the actual problem.
        raise ValueError("combine_site_data: no site data left to combine")

    X_complete = np.vstack(X_parts)
    Y_complete = np.hstack(Y_parts)

    return X_complete,Y_complete

In [41]:
# Leave-one-site-out evaluation: for each site, fit a linear model on the
# training portions of all *other* sites and report the RMSE on the held-out
# site's test portion.
for test_site_id,test_data in splits.items():
    X_train,Y_train = combine_site_data(splits,test_site_id)

    # test_data is ([X_train, X_test], [Y_train, Y_test]). Pick the test halves
    # by index instead of unpacking the unused train halves into `_`: in
    # IPython `_` is the last-output variable, and assigning to it both stops
    # it from updating and keeps a large array referenced.
    X_test = test_data[0][1]
    Y_test = test_data[1][1]
    
    model = LinearRegression()
    model.fit(X_train,Y_train)
    
    Y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test,Y_pred))
    
    print(test_site_id,rmse)

103941 1400.8272649084543
787197 1256.8061540212614
238320 1222.999208104408
349060 1048.646287383282
477834 1070.8764195529227
641826 1858.6765343981606
896164 1039.3071796345234
717193 3077.0001917679583
627759 937.4969896219927
569932 1237.4979970079241
466851 815.0715863158354
256177 4364.060577656164
505347 6214.344513455739


In [42]:
# See if more data helps: same leave-one-site-out evaluation, but the other
# sites contribute both their train and their test portions to the fit.
for test_site_id,test_data in splits.items():
    X_train,Y_train = combine_site_data(splits,test_site_id,combine_train_test=True)

    # test_data is ([X_train, X_test], [Y_train, Y_test]). Pick the test halves
    # by index instead of unpacking the unused train halves into `_`: in
    # IPython `_` is the last-output variable, and assigning to it both stops
    # it from updating and keeps a large array referenced.
    X_test = test_data[0][1]
    Y_test = test_data[1][1]
    
    model = LinearRegression()
    model.fit(X_train,Y_train)
    
    Y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test,Y_pred))
    
    print(test_site_id,rmse)

103941 1392.9429043978641
787197 1248.656185840077
238320 1219.767276850132
349060 1051.0466879324451
477834 1070.083856252635
641826 1863.9611610854658
896164 1043.306893718434
717193 3079.7258895441305
627759 946.6457806107901
569932 1237.9841984598497
466851 809.4142230218305
256177 4380.551109302946
505347 6222.457736998086


Now we'll see if mixing together all the data (including the training data for the site itself!) helps us fit a better model.

In [43]:
# Pooled model: train on the training portions of *all* sites (no site is held
# out), then report the RMSE on each site's own test portion.
X_train,Y_train = combine_site_data(splits)

# The training set is the same for every site, so the model is fitted once
# here instead of being refitted on identical data inside the loop.
model = LinearRegression()
model.fit(X_train,Y_train)

for test_site_id,test_data in splits.items():
    # test_data is ([X_train, X_test], [Y_train, Y_test]). Pick the test halves
    # by index instead of unpacking the unused train halves into `_`: in
    # IPython `_` is the last-output variable, and assigning to it both stops
    # it from updating and keeps a large array referenced.
    X_test = test_data[0][1]
    Y_test = test_data[1][1]
    
    Y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test,Y_pred))
    
    print(test_site_id,rmse)

103941 952.0968296772322
787197 1165.668175007028
238320 1050.7922646235518
349060 897.3065242058341
477834 1013.8800435011257
641826 1706.3790990403525
896164 999.7169369515028
717193 2985.225929343497
627759 893.9676342391142
569932 1140.9026932644833
466851 788.5029116718143
256177 2336.5932492281554
505347 5406.114033153688
