In [8]:
import sys 
sys.path.insert(0, '../DataCleaning')

import train_test_split

In [9]:
import importlib
importlib.reload(train_test_split)

<module 'train_test_split' from '../DataCleaning/train_test_split.py'>

In [10]:
# Root of the data tree, relative to the notebook's working directory.
# The per-site production and irradiance CSV paths are built from this value.
DATA_DIR = '../../data'

In [11]:
import json

# Load the site metadata. Later cells iterate over `site_md` and read the
# 'id', 'irradiance_site_id' and ['location']['timeZone'] entries of each item.
# The path is derived from DATA_DIR so every data location in the notebook
# shares a single root.
with open(f'{DATA_DIR}/production_data/site_metadata.json', 'r') as file:
    # Parse straight from the file handle instead of buffering the text first.
    site_md = json.load(file)

In [17]:
from os.path import exists

# Maps site id -> (X, Y) as returned by train_test_split.get_irradiance_WPI_data.
splits = {}

# Warning: this takes a while because the join operation is implemented inefficiently (roughly n^2 in the worst case).
for site in site_md:
    site_id = site['id']
    irradiance_site_id = site['irradiance_site_id']
    tz_str = site['location']['timeZone']
    
    path_production = f"{DATA_DIR}/production_data/raw/{site_id}/combination_data/production_weather_combination.csv"
    path_irradiance = f"{DATA_DIR}/irradiance_data/raw/{irradiance_site_id}/irradiance_data.csv"
    
    # Sites without an irradiance file are skipped; the id pair is printed so the gap is visible.
    # Only the irradiance path is checked here, not the production path.
    if not exists(path_irradiance):
        print(site_id,irradiance_site_id)
        continue
    
    # NOTE(review): the meaning of the positional arguments 4 and 3 is not visible in this
    # notebook — confirm against train_test_split.get_irradiance_WPI_data.
    # `title_row` is overwritten on every iteration, so after the loop it holds the value
    # from the last site that was processed.
    X,Y,title_row = train_test_split.get_irradiance_WPI_data(path_production,path_irradiance, 4,3,tz_str)
    splits[site_id] = (X,Y)
    
    

1232644 116439


In [18]:
# Display the third value returned by get_irradiance_WPI_data for the last processed site
# (presumably the feature column names — confirm against the helper).
title_row

['day',
 'hour',
 'precipIntensity',
 'precipProbability',
 'precipAccumulation',
 'precipType_',
 'precipType_rain',
 'precipType_snow',
 'precipType_sleet',
 'temperature',
 'apparentTemperature',
 'dewPoint',
 'humidity',
 'pressure',
 'windSpeed',
 'windBearing',
 'windGust',
 'cloudCover',
 'uvIndex',
 'visibility',
 'ozone',
 'GHI',
 'DHI',
 'DNI']

In [19]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def mean_absoluate_percentage_error(y_true,y_pred):
    """Return the mean absolute percentage error (MAPE) between two series, in percent.

    The misspelled function name is kept so existing callers keep working.

    Parameters
    ----------
    y_true : array-like
        Observed values; each one is the denominator of its relative error.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` taken over the samples
        whose observed value is non-zero, or NaN when there is no such sample.

    Notes
    -----
    A percentage error is undefined when the observed value is zero. Such
    samples are excluded from the average instead of being divided by zero,
    which would otherwise turn the whole result into inf or NaN.
    """
    # Accept plain lists/tuples as well as arrays (list - list is not defined).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Align shapes explicitly so the boolean mask below applies to both arrays.
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan

    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

In [26]:
import pickle
import os

def run_regression(site_id, data):
    """Fit a linear regression on one site's split and score it on the held-out part.

    Parameters
    ----------
    site_id
        Identifier of the site; accepted for symmetry with the caller but not
        used inside the function.
    data
        A pair ``([X_train, X_test], [Y_train, Y_test])``.

    Returns
    -------
    tuple
        ``(model, rmse_norm)`` where ``model`` is the fitted ``LinearRegression``
        and ``rmse_norm`` is the test RMSE divided by the maximum of ``Y_test``.
    """
    features, targets = data
    X_train, X_test = features
    Y_train, Y_test = targets

    # fit() returns the estimator itself, so construction and fitting can be chained.
    regressor = LinearRegression().fit(X_train, Y_train)
    predictions = regressor.predict(X_test)

    mse = mean_squared_error(Y_test, predictions)
    # Normalise by the largest observed test target.
    normalised_rmse = np.sqrt(mse) / np.max(Y_test)

    return regressor, normalised_rmse

def save_model(site_id, model):
    """Pickle ``model`` to ``../../models/production/<site_id>/v1.sav``.

    The target directory, including any missing parent directories, is created
    when needed. An existing ``v1.sav`` for the site is overwritten.

    Parameters
    ----------
    site_id
        Identifier used as the name of the per-site model directory.
    model
        Any picklable object (the notebook passes a fitted regression model).
    """
    d = f'../../models/production/{site_id}'

    # makedirs creates missing parents and tolerates an existing directory,
    # avoiding both the missing-parent failure of mkdir and the check-then-create race.
    os.makedirs(d, exist_ok=True)

    filename = f'{d}/v1.sav'
    # The context manager guarantees the handle is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

In [27]:
# Fit one linear model per site in `splits`, print its normalised RMSE, then persist the model.
# `site` here is the site id (the dict key), and `split` is the stored (X, Y) pair.
for site, split in splits.items():
    model, rmse_norm = run_regression(site, split)
    print(f'RMSE Norm for site {site}: {rmse_norm}')
    print('saving model')
    save_model(site, model)

RMSE Norm for site 103941: 0.08617516488984574
saving model
RMSE Norm for site 787197: 0.13784302620638708
saving model
RMSE Norm for site 238320: 0.11149979860468122
saving model
RMSE Norm for site 349060: 0.0774483298763127
saving model
RMSE Norm for site 477834: 0.1460642753996398
saving model
RMSE Norm for site 641826: 0.1333587028370901
saving model
RMSE Norm for site 896164: 0.13672384235366947
saving model
RMSE Norm for site 717193: 0.14732528873132125
saving model
RMSE Norm for site 627759: 0.09262699169771013
saving model
RMSE Norm for site 569932: 0.12163420567135913
saving model
RMSE Norm for site 466851: 0.1154115891681261
saving model
RMSE Norm for site 256177: 0.1221341033221149
saving model
RMSE Norm for site 505347: 0.1173749200656108
saving model
