# Surplux Solar Energy Rule Based Method

## Import Package

In [1]:
import os
import math
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
SEED = 5397
random.seed(SEED)
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

## Helper Functions

Before we write the code for adjusting the models, lets define a few helper functions.

In [3]:
# Mask: lon
def group_by_lon(df):
    return df.groupby(['Lon']) \
        .mean() \
        .reset_index() \
        .sort_values(by=['Lon'], ascending=False, ignore_index=True)
def filter_by_lon(df, lon):
    return df[df['Lon'].eq(lon)]
# Mask: lat
def group_by_lat(df):
    return df.groupby(['Lat']) \
        .mean() \
        .reset_index() \
        .sort_values(by=['Lat'], ascending=False, ignore_index=True)
def filter_by_lat(df, lat):
    return df[df['Lat'].eq(lat)]
# Mask: location
def group_by_location(df):
    return df.groupby(['Lat', 'Lon']) \
        .mean() \
        .reset_index() \
        .sort_values(by=['Lat', 'Lon'], ascending=False, ignore_index=True)
def filter_by_location(df, lat, lon):
    return df[df['Lat'].eq(lat) & df['Lon'].eq(lon)]
# Mask: module & capacity & location
def group_by_module_capacity_location(df):
    return df.groupby(['Module', 'Lat', 'Lon', 'Capacity']) \
        .mean() \
        .reset_index() \
        .sort_values(by=['Lat', 'Lon'], ascending=False, ignore_index=True)
def filter_by_module_capacity_location(df, module, capacity, lat, lon):
    return df[df['Lat'].eq(lat) & df['Lon'].eq(lon) & df['Module'].eq(module) & df['Capacity'].eq(capacity)]

In [4]:
# Processing of data provided by the organizer 
def data_preprocessing(data):
    data['Date'] = pd.to_datetime(data['Date'])
    # 1 kwh = 3.6 MJ
    # 1 MJ to kwh = 0.27778 kwh
    data['Irradiance'] = data['Irradiance'] / 3.6
    data['Irradiance_m'] = data['Irradiance_m'] / 1000
    # Capacity Factor compare the power generation efficiency between plants of different scale.  
    data['CapacityFactor'] = data['Generation']/data['Capacity']
    # Array Ratio compares the energy collected through the system to the total system load by dividing energy in by energy-out. 
    data['ArrayRatio'] = data['CapacityFactor']/data['Irradiance']
    data['ArrayRatio_m'] = data['CapacityFactor']/data['Irradiance_m']
    return data

In [5]:
# Evaluate
def rmse(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.sqrt(((y_pred - y_true) ** 2).mean())

## Const & Inputs

Here are all of the parameters to change for the run.

In [6]:
# Top level data directory. Here we assume the format of the directory conforms to the ImageFolder structure
path = "."

In [7]:
# Outliers are determined based on the analysis results in analysis.ipynb
outliers = [
    {
        'irradiance': [],
        'generation': ['2021-09-10', '2021-09-14', '2021-09-21', '2021-09-23', '2021-10-03', '2021-10-04', '2021-10-05', '2021-10-06', '2021-10-07']
    },
    {
        'irradiance': [],
        'generation': []
    },
    {
        'irradiance': [],
        'generation': []
    },
    {
        'irradiance': [],
        'generation': []
    },
    {
        'irradiance': ['2020-10-14'],
        'generation': ['2021-01-27', '2021-01-28', '2021-05-14']
    },
    {
        'irradiance': ['2020-10-14'],
        'generation': ['2021-01-27', '2021-05-14']
    },
    {
        'irradiance': ['2021-05-25'],
        'generation': []
    },
    {
        'irradiance': ['2021-05-25'],
        'generation': []
    },
    {
        'irradiance': [],
        'generation': []
    },
    {
        'irradiance': ['2020-10-14'],
        'generation': ['2021-01-27', '2021-03-07']
    },
    {
        'irradiance': ['2021-05-25'],
        'generation': []
    },
    {
        'irradiance': [],
        'generation': []
    },
    {
        'irradiance': ['2021-08-04'],
        'generation': ['2021-07-19']
    },
    {
        'irradiance': ['2021-05-25'],
        'generation': []
    },
]

## Data Preprocessing: Load Data

Processing of data provided by the organizer.

In [11]:
train = pd.read_csv(os.path.join(path, 'data/train.csv'))
train = data_preprocessing(train)
train.describe()

Unnamed: 0,ID,Temp_m,Generation,Irradiance,Capacity,Lat,Lon,Angle,Irradiance_m,Temp,CapacityFactor,ArrayRatio,ArrayRatio_m
count,3584.0,2126.0,3584.0,3560.0,3584.0,3584.0,3584.0,3584.0,3584.0,3569.0,3584.0,3560.0,3584.0
mean,1792.5,42.59699,1339.483817,4.796601,350.535039,24.483453,120.759336,-20.575564,123.770042,25.722808,3.887639,0.848853,inf
std,1034.75601,10.953377,796.698529,1.894333,144.498892,0.482748,0.348226,53.058729,160.183435,5.341206,1.538227,0.71694,
min,1.0,11.8,17.0,0.036111,99.2,24.04,120.44,-160.0,0.0,6.9,0.077572,0.016551,0.000423
25%,896.75,34.0,575.0,3.658333,246.4,24.08,120.47,-31.0,5.367,22.0,2.99253,0.755726,0.018646
50%,1792.5,42.5,1268.0,5.213889,352.0,24.107,120.52,1.76,66.6895,28.2,4.293808,0.816619,0.053874
75%,2688.25,51.7,1957.0,6.247917,498.56,25.03,121.08,4.63,171.7385,30.0,5.039212,0.886738,0.77343
max,3584.0,68.5,6752.0,8.005556,499.8,25.11,121.26,22.0,1026.617,32.5,21.443089,30.141287,inf


In [12]:
test = pd.read_csv(os.path.join(path, 'data/test.csv'))
test = data_preprocessing(test)
test.describe()

Unnamed: 0,ID,Temp_m,Generation,Irradiance,Capacity,Lat,Lon,Angle,Irradiance_m,Temp,CapacityFactor,ArrayRatio,ArrayRatio_m
count,1539.0,1099.0,0.0,1539.0,1539.0,1539.0,1539.0,1539.0,1539.0,1356.0,0.0,0.0,0.0
mean,770.0,31.712011,,3.378431,335.654321,24.358381,120.680032,-17.387479,113.914567,18.863422,,,
std,444.415346,8.832432,,1.399051,132.48618,0.444123,0.309303,47.846896,124.256036,3.349447,,,
min,1.0,13.8,,0.261111,99.2,24.04,120.44,-160.0,0.004,12.3,,,
25%,385.5,25.1,,2.277778,267.52,24.07,120.47,-2.62,4.375,16.6,,,
50%,770.0,31.0,,3.866667,314.88,24.08,120.52,0.0,89.575,18.0,,,
75%,1154.5,37.8,,4.525,492.8,24.98,121.03,4.63,148.808,20.7,,,
max,1539.0,60.1,,5.611111,499.8,25.11,121.26,22.0,528.4,28.0,,,


In [13]:
# Remove outliers from the train data, before starting data packaging and model training.
train_group = group_by_module_capacity_location(train)
for i, row in train_group.iterrows():
    train_idf = filter_by_module_capacity_location(train, row['Module'], row['Capacity'], row['Lat'], row['Lon'])
    outliers1 = train_idf[train_idf['Date'].isin(outliers[i]['irradiance'])].index.tolist()
    outliers2 = train_idf[train_idf['Date'].isin(outliers[i]['generation'])].index.tolist()
    train = train[~(train.index.isin(outliers1))]
train_group = group_by_module_capacity_location(train)

## Irradiance to Generation

Apple generation by rule based method.  

Generation = Irradiance * ArrayRatio * Capacity

In [14]:
def apply_generation_by_array_ratio_with_location_module_capacity(row, refs):
    mask = refs['Lat'].eq(row['Lat']) & refs['Lon'].eq(row['Lon']) & refs['Module'].eq(row['Module']) & refs['Capacity'].eq(row['Capacity'])
    ref = refs[mask]
    return row['Irradiance'] * row['Capacity'] * ref.iloc[0]['ArrayRatio']
def apply_generation_by_array_ratio_with_location(row, refs):
    mask = refs['Lat'].eq(row['Lat']) & refs['Lon'].eq(row['Lon'])
    ref = refs[mask]
    return row['Irradiance'] * row['Capacity'] * ref.iloc[0]['ArrayRatio']

In [15]:
test['Generation'] = test.apply(
    lambda row: apply_generation_by_array_ratio_with_location_module_capacity(row, train_group), axis=1)


In [26]:
# test[['ID','Generation']].to_csv(os.path.join(path, f'submission/generation.csv'), index=False)