In [1]:
# Put the sibling DataCleaning directory at the front of the module search path
# so that the project-local train_test_split module can be imported below.
import sys 
sys.path.insert(0, '../DataCleaning')

import train_test_split

In [15]:
# Development aid: re-execute train_test_split so that edits made to the module
# on disk are picked up without restarting the kernel.
import importlib
importlib.reload(train_test_split)

<module 'train_test_split' from '../DataCleaning\\train_test_split.py'>

In [3]:
# Root of the data tree (relative path); the per-site production and irradiance
# CSV paths used later in the notebook are built from it.
DATA_DIR = '../../data'

In [4]:
import json

# Load the list of per-site metadata records; later cells read each record's
# 'id', 'irradiance_site_id' and location time zone.
# The path is derived from DATA_DIR so the data root is configured in one place,
# and the file is decoded as UTF-8 explicitly so the result does not depend on
# the platform's default text encoding.
with open(f"{DATA_DIR}/production_data/site_metadata.json", 'r', encoding='utf-8') as file:
    site_md = json.load(file)

In [16]:
from os.path import exists

# Maps site id -> (X, Y) as returned by the loader for that site.
splits = {}

# Heads-up: this cell is slow, because the join operation is implemented
# inefficiently (roughly n^2 in the worst case).
for site in site_md:
    site_id = site['id']
    irradiance_site_id = site['irradiance_site_id']
    tz_str = site['location']['timeZone']

    path_production = f"{DATA_DIR}/production_data/{site_id}/combination_data/production_weather_combination.csv"
    path_irradiance = f"{DATA_DIR}/irradiance_data/{irradiance_site_id}/irradiance_data.csv"

    if exists(path_irradiance):
        # NOTE(review): the positional 4 and 3 are loader settings whose meaning
        # is not visible from this notebook; confirm against train_test_split.
        X, Y, title_row = train_test_split.get_irradiance_WPI_data(
            path_production, path_irradiance, 4, 3, tz_str
        )
        splits[site_id] = (X, Y)
    else:
        # No irradiance file for this site: report both ids and skip it.
        print(site_id, irradiance_site_id)
    
    

24
1232644 116439
24
24
24
24
24
24
24
24
24
24
24
24


In [17]:
# Display the column titles returned by the most recent get_irradiance_WPI_data
# call; later cells use them to select feature columns by name.
title_row

['day',
 'hour',
 'precipIntensity',
 'precipProbability',
 'precipAccumulation',
 'precipType_',
 'precipType_rain',
 'precipType_snow',
 'precipType_sleet',
 'temperature',
 'apparentTemperature',
 'dewPoint',
 'humidity',
 'pressure',
 'windSpeed',
 'windBearing',
 'windGust',
 'cloudCover',
 'uvIndex',
 'visibility',
 'ozone',
 'GHI',
 'DHI',
 'DNI']

In [8]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def mean_absoluate_percentage_error(y_true,y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The MAPE in percent.

    Notes
    -----
    * Inputs are coerced to float arrays, so plain Python lists and tuples are
      accepted as well as NumPy arrays.
    * MAPE is undefined where ``y_true`` is zero: such entries make the result
      ``inf`` (or ``nan`` when the prediction is also zero).
    * The misspelling in the function name is kept so existing callers keep
      working.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [20]:
def _linear_regression_rmse(X_train, X_test, Y_train, Y_test):
    """Fit a fresh LinearRegression on the training split and return its test RMSE."""
    model = LinearRegression()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    return np.sqrt(mean_squared_error(Y_test, Y_pred))


_IRRADIANCE_COLUMNS = ['GHI', 'DHI', 'DNI']
_TIME_COLUMNS = ['day', 'hour']

# One entry per result column, in CSV order: (header label, selector, columns).
# The selector is applied identically to the train and test matrices; a selector
# of None means "use every column unchanged".
_FEATURE_SETS = [
    ("irradiance_only", train_test_split.keep_columns, _IRRADIANCE_COLUMNS),
    ("irr_time", train_test_split.keep_columns, _IRRADIANCE_COLUMNS + _TIME_COLUMNS),
    ("weather_only", train_test_split.remove_columns, _IRRADIANCE_COLUMNS + _TIME_COLUMNS),
    ("weather_time", train_test_split.remove_columns, _IRRADIANCE_COLUMNS),
    ("weather_irr", train_test_split.remove_columns, _TIME_COLUMNS),
    ("all", None, None),
]

# Write one row per site: the site id followed by the test RMSE of a linear
# regression trained on each feature set above. Plain "w" is enough because the
# file is only written, never read back.
with open("Irradiance_results.csv", "w") as f:

    # The header is derived from the same table as the values so the two cannot drift apart.
    f.write(",".join(["site_id"] + [label for label, _, _ in _FEATURE_SETS]) + "\n")

    for site_id, data in splits.items():

        ([X_train, X_test], [Y_train, Y_test]) = data

        rmses = []
        for _label, select, columns in _FEATURE_SETS:
            if select is None:
                X_train_sel, X_test_sel = X_train, X_test
            else:
                # Pass a fresh list each time in case the selector modifies its argument.
                X_train_sel, _ = select(X_train, title_row, list(columns))
                X_test_sel, _ = select(X_test, title_row, list(columns))
            rmses.append(_linear_regression_rmse(X_train_sel, X_test_sel, Y_train, Y_test))

        f.write(",".join([f"{site_id}"] + [f"{rmse}" for rmse in rmses]) + "\n")