## Regression Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Data -- Site 1
dfout_s1 = pd.read_csv('Data/PVData/Site1/PVOut_46834.csv', header=None)
dfout_s1 = dfout_s1.drop(axis=1, columns=[3,4,6,7,8,9,10,11,12,13])
dfout_s1 = dfout_s1.drop(axis=0, index=[item for item in range(32,49+1)])
dfout_s1 = dfout_s1.drop(axis=0, index=[0])
dfout_s1 = dfout_s1.loc[::-1].reset_index(drop=True)

dfsys_s1 = pd.read_csv('Data/PVData/Site1/PVSystem_46834.csv', header=None)

weather_s1 = pd.read_csv('Data/PVData/Site1/NSRDBout_s1.csv', header=None)
weather_s1 = weather_s1.drop(axis=0, index=[0,1,2])
weather_s1 = weather_s1.drop(axis=1, columns=[item for item in range(14,47)])

In [3]:
# Data -- Site 2
dfout_s2 = pd.read_csv('Data/PVData/Site2/PVOut_3445.csv', header=None)
dfout_s2 = dfout_s2.drop(axis=1, columns=[3,4,6,7,8,9,10,11,12,13])
dfout_s2 = dfout_s2.loc[::-1].reset_index(drop=True)

dfsys_s2 = pd.read_csv('Data/PVData/Site2/PVSystem_3445.csv', header=None)

weather_s2 = pd.read_csv('Data/PVData/Site2/NSRDBout_s2.csv', header=None)
weather_s2 = weather_s2.drop(axis=0, index=[0,1,2])
weather_s2 = weather_s2.drop(axis=1, columns=[item for item in range(14,47)])

In [4]:
def outputPanel(systemCapacity,energyGen):
    # We are calculating the output per panel assuming that each panel has a capactiy of 300 Watts
    wattage = 300
    newPanel = systemCapacity/wattage
    outPerPanel = energyGen/newPanel
    
    return int(round(int(outPerPanel)))

In [5]:
# Returns daily averages for meteorlogical data over the course of a specified month
## also added the energy output to this dataset
# One must specify the MONTH and the NUMBER OF DAYS IN THE MONTH
def daily(weatherData,pvOutData,pvSysData,year,month,numOfDaysMonth):
    
    monthstr = str(month)
    
    # Start of with monthly dataset
    mask = weatherData[2].values == monthstr
    pos = np.flatnonzero(mask)
    monthData = weatherData.iloc[pos]

    dailyData = pd.DataFrame(columns = ['Year','Month','Day','lat','lon','GHI','DHI','DNI',
                                        'Energy Gen (watt hr)',
                                        'Wind Speed','Temperature','Solar Zenith Angle',
                                        'Pressure','Relative Humidity'])    
        
    # Now the Daily Datasets
    for i in range(1, numOfDaysMonth+1):
        # Weather Data
        dayMask = monthData[3].values == str(i)
        pos = np.flatnonzero(dayMask)
        dayData = monthData.iloc[pos]
            
        #PV Data
        gen = outputPanel(pvSysData.iloc[0,1], pvOutData.iloc[(i-1),1])
        
        #PV and Weather Data combined
        df2 = pd.DataFrame([{'Year':year, 'Month':month, 'Day':i, 
                             'lat':pvSysData.iloc[0,13],'lon':pvSysData.iloc[0,14],
                             'Energy Gen (watt hr)':gen, 
                             'GHI':average(dayData,6), 'DHI':average(dayData,7), 
                             'DNI':average(dayData,8), 'Wind Speed':average(dayData,9), 
                             'Temperature':average(dayData,10), 'Solar Zenith Angle':average(dayData,11),
                             'Pressure':average(dayData,12), 'Relative Humidity':average(dayData,13)}])
        
        dailyData = dailyData.append(df2, ignore_index=True)
        
    return dailyData

In [6]:
def average(data, col):
    co = data[col].tolist()
    intList = [float(item) for item in co]
    avg = sum(intList)/len(intList)
    return avg

In [7]:
dataset_s1  = daily(weather_s1,dfout_s1,dfsys_s1,2020,5,31)
dataset_s2 = daily(weather_s2,dfout_s2,dfsys_s2,2020,6,30)
dataset = dataset_s1.append(dataset_s2)

In [8]:
col = dataset.columns.to_list()
col

['Year',
 'Month',
 'Day',
 'lat',
 'lon',
 'GHI',
 'DHI',
 'DNI',
 'Energy Gen (watt hr)',
 'Wind Speed',
 'Temperature',
 'Solar Zenith Angle',
 'Pressure',
 'Relative Humidity']

## Prediction Data

## Multiple Regression Model

In [13]:
# load in variables
x = dataset[['GHI','Relative Humidity','Pressure','Temperature','Solar Zenith Angle','Wind Speed','DNI','DHI']]
y = dataset['Energy Gen (watt hr)']

In [14]:
## Load in regression model
regr = linear_model.LinearRegression()
regr.fit(x,y)

LinearRegression()

In [15]:
# r^2 value
regr.score(x,y)

0.930235077024612