In [24]:
import sys 
# Make the sibling DataCleaning directory importable so the project-local
# helper module below can be found.
sys.path.insert(0, '../DataCleaning')

# Project-local module (../DataCleaning/train_test_split.py), not
# scikit-learn's function of the same name.
import train_test_split


We want to run linear regression on the train/test split of every system and report the results.  
First, we need to load the train/test splits for all systems.

In [25]:
# Root of the data directory, given relative to the notebook's working
# directory; the per-site CSV paths below are built from it.
DATA_DIR = '../../data'

In [26]:
import json

# Per-site train/test vectors, keyed by site id. Populated by the split cell
# further down and consumed by the evaluation cell.
train_test_splits = {}

# Build the path from DATA_DIR so the data root is configured in one place,
# and let json.load parse straight from the file handle instead of going
# through an intermediate string.
with open(f'{DATA_DIR}/production_data/site_metadata.json', 'r', encoding='utf-8') as file:
    site_md = json.load(file)

In [27]:
import importlib
# Re-import the local train_test_split module so that edits made to its source
# file are picked up without restarting the kernel.
importlib.reload(train_test_split)

<module 'train_test_split' from '../DataCleaning/train_test_split.py'>

In [28]:
from sklearn.linear_model import LinearRegression

# Build the train/test vectors for every site listed in the metadata and cache
# them in train_test_splits. No model is created here: the evaluation cell
# instantiates a fresh LinearRegression per site.
for site in site_md:
    site_id = site['id']
    path = f'{DATA_DIR}/production_data/{site_id}/combination_data/production_weather_combination.csv'

    # NOTE(review): the meaning of the positional arguments 4, 3, 1 is defined
    # by train_test_split.split and is not visible from this notebook --
    # consider naming them as constants once confirmed.
    train, test = train_test_split.split(path, 4, 3, 1)

    # The list order matters: the evaluation cell reads index 0 of each
    # returned container as the training part and index 1 as the test part.
    X_data, Y_data = train_test_split.to_vector([train, test])
    train_test_splits[site_id] = (X_data, Y_data)

{4: {'': 0, 'snow': 1, 'rain': 2, 'sleet': 3}}
{4: {'': 0, 'rain': 1, 'snow': 2}}
{4: {'': 0, 'rain': 1, 'snow': 2}}
{4: {'': 0, 'rain': 1}}
{4: {'': 0, 'rain': 1, 'sleet': 2, 'snow': 3}}
{4: {'': 0, 'rain': 1, 'snow': 2, 'sleet': 3}}
{4: {'': 0, 'rain': 1, 'snow': 2, 'sleet': 3}}
{4: {'rain': 0, '': 1}}
{4: {'': 0, 'rain': 1, 'snow': 2}}
{4: {'': 0, 'rain': 1, 'snow': 2, 'sleet': 3}}
{4: {'': 0, 'snow': 1, 'rain': 2}}
{4: {'': 0, 'rain': 1, 'snow': 2, 'sleet': 3}}
{4: {'': 0, 'rain': 1, 'snow': 2, 'sleet': 3}}
{4: {'': 0, 'rain': 1, 'snow': 2, 'sleet': 3}}


In [46]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils import check_array
import math

# https://stats.stackexchange.com/a/62511
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE) in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values. May be a scalar or any shape that broadcasts
        against ``y_true``.

    Returns
    -------
    float
        MAPE as a percentage. Samples whose true value is exactly zero are
        skipped, because the percentage error is undefined there (the plain
        formula would yield ``inf``/``nan``). If no sample has a non-zero
        true value, ``nan`` is returned.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # A column vector (n, 1) against a flat vector (n,) would silently
    # broadcast to an (n, n) matrix and produce a meaningless score. If the
    # two inputs only differ by length-1 axes, drop those axes first.
    if y_true.shape != y_pred.shape:
        squeezed_true = np.squeeze(y_true)
        squeezed_pred = np.squeeze(y_pred)
        if squeezed_true.shape == squeezed_pred.shape:
            y_true, y_pred = squeezed_true, squeezed_pred

    # Keep ordinary broadcasting (e.g. a scalar prediction) working with the
    # boolean mask below.
    y_true, y_pred = np.broadcast_arrays(y_true, y_pred)

    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan

    y_true = y_true[nonzero]
    y_pred = y_pred[nonzero]
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Fit one linear-regression model per site and print its test-set errors next
# to a constant "always predict the mean" baseline.
for site in site_md:
    site_id = site['id']
    X_data, Y_data = train_test_splits[site_id]
    X_train, X_test = X_data[0], X_data[1]
    Y_train, Y_test = Y_data[0], Y_data[1]

    # A fresh model per site so no fitted state carries over between sites.
    model = LinearRegression()
    model.fit(X_train, Y_train)

    Y_prediction = model.predict(X_test)

    # Baseline: predict the mean of the test targets for every sample.
    # NOTE(review): the mean is taken from Y_test, not Y_train, so this is the
    # best possible constant predictor rather than one available at training
    # time -- confirm this is the intended baseline.
    avg_production = np.mean(Y_test)
    Y_prediction_avg = np.full(Y_test.shape, avg_production)

    mse = mean_squared_error(Y_test, Y_prediction)
    rmse = math.sqrt(mse)
    mse_avg = mean_squared_error(Y_test, Y_prediction_avg)
    rmse_avg = math.sqrt(mse_avg)
    mae = mean_absolute_error(Y_test, Y_prediction)
    mape = mean_absolute_percentage_error(Y_test, Y_prediction)
    r2 = r2_score(Y_test, Y_prediction)

    # Values below 1 mean the model beats the baseline. The baseline RMSE is
    # zero when every test target is identical, so guard the division.
    rmse_ratio = rmse / rmse_avg if rmse_avg else float('nan')

    print(f'System {site_id}:')

    print(f'\t Mean Squared Error: {mse}')
    print(f'\t Root Mean Squared Error: {rmse}')
    print()

    print(f'\t Averaged Mean Squared Error: {mse_avg}')
    print(f'\t Averaged Root Mean Squared Error: {rmse_avg}')
    print()

    # The printed quantity is a ratio, so label it as one.
    print(f'\t Ratio of model RMSE to average RMSE: {rmse_ratio}')
    print()

    print(f'\t Mean Absolute Error: {mae}')
    print(f'\t Mean Absolute Percentage Error: {mape}')
    print()

    print(f'\t r2 score: {r2}')

System 103941:
	 Mean Squared Error: 194133.48546153677
	 Root Mean Squared Error: 440.60581641818663

	 Averaged Mean Squared Error: 464147.49309376924
	 Averaged Root Mean Squared Error: 681.2837096935235

	 Difference in average RMSE and model RMSE: 0.6467288299266598

	 Mean Absolute Error: 353.1888355926919
	 Mean Absolute Percentage Error: 12907.437223030016

	 r2 score: 0.5817418205416935
System 1232644:
	 Mean Squared Error: 91575.7683290172
	 Root Mean Squared Error: 302.6148845133319

	 Averaged Mean Squared Error: 315480.6566591162
	 Averaged Root Mean Squared Error: 561.6766477779864

	 Difference in average RMSE and model RMSE: 0.5387706355791853

	 Mean Absolute Error: 241.7144207264198
	 Mean Absolute Percentage Error: 631.9700442562996

	 r2 score: 0.7097262022376007
System 787197:
	 Mean Squared Error: 557944.9544007698
	 Root Mean Squared Error: 746.9571302295533

	 Averaged Mean Squared Error: 1208337.0072723343
	 Averaged Root Mean Squared Error: 1099.243834311721

