# *Data Collection*

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shapely
import shapely.speedups
from sklearn import linear_model
import geopandas as gpd
import math
import seaborn as sns
from sklearn import preprocessing
%matplotlib inline
# Disable pandas' display truncation: every column and every row of a frame is rendered.
# NOTE(review): with max_rows unset, displaying a large frame prints all of it -- prefer .head().
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)



In [2]:
# Data -- Site 1
# PV output log: trim columns/rows, then flip the row order and renumber from 0.
dfout_s1 = (
    pd.read_csv('Data/PVData/Site1/PVOut_46834.csv', header=None)
    .drop(columns=[3, 4, 6, 7, 8, 9, 10, 11, 12, 13])
    .drop(index=list(range(32, 50)))   # rows labelled 32..49
    .drop(index=[0])                   # presumably the header line -- TODO confirm
    .loc[::-1]
    .reset_index(drop=True)
)

# PV system description, read as-is.
dfsys_s1 = pd.read_csv('Data/PVData/Site1/PVSystem_46834.csv', header=None)

# NSRDB weather export: skip the first three lines and discard columns 14..46.
weather_s1 = (
    pd.read_csv('Data/PVData/Site1/NSRDBout_s1.csv', header=None)
    .drop(index=[0, 1, 2])
    .drop(columns=list(range(14, 47)))
)

In [3]:
# Data -- Site 2
# PV output log: trim columns, then flip the row order and renumber from 0.
dfout_s2 = (
    pd.read_csv('Data/PVData/Site2/PVOut_3445.csv', header=None)
    .drop(columns=[3, 4, 6, 7, 8, 9, 10, 11, 12, 13])
    .loc[::-1]
    .reset_index(drop=True)
)

# PV system description, read as-is.
dfsys_s2 = pd.read_csv('Data/PVData/Site2/PVSystem_3445.csv', header=None)

# NSRDB weather export: skip the first three lines and discard columns 14..46.
weather_s2 = (
    pd.read_csv('Data/PVData/Site2/NSRDBout_s2.csv', header=None)
    .drop(index=[0, 1, 2])
    .drop(columns=list(range(14, 47)))
)

In [4]:
def outputPanel(systemCapacity, energyGen, wattage=300):
    """Return the energy generated per panel of a PV system.

    The system is modelled as ``systemCapacity / wattage`` identical panels and
    the generated energy is divided evenly between them.

    Parameters
    ----------
    systemCapacity : number or numeric string
        Total system capacity, in the same unit as ``wattage``.
    energyGen : number or numeric string
        Energy generated by the whole system.
    wattage : number, optional
        Assumed capacity of a single panel (defaults to 300 watts).

    Returns
    -------
    float
        ``energyGen`` divided by the equivalent number of panels.

    Raises
    ------
    ZeroDivisionError
        If ``systemCapacity`` or ``wattage`` is zero.
    ValueError
        If an argument cannot be converted to ``float``.
    """
    # Values read from CSVs with header=None can arrive as strings, so coerce first.
    panelCount = float(systemCapacity) / float(wattage)
    return float(energyGen) / panelCount

In [5]:
def daily(weatherData,pvOutData,pvSysData,year,month,numOfDaysMonth):
    """Build one row per day of a month: mean weather values plus per-panel energy.

    Parameters
    ----------
    weatherData : pandas.DataFrame
        Weather table with integer column labels. Column 2 is matched against
        ``str(month)`` and column 3 against ``str(day)``; columns 6..13 are
        averaged into GHI, DHI, DNI, Wind Speed, Temperature,
        Solar Zenith Angle, Pressure and Relative Humidity respectively.
    pvOutData : pandas.DataFrame
        PV output table; row ``day - 1``, column 1 is used as that day's
        generated energy (rows are assumed to be ordered by day -- TODO confirm).
    pvSysData : pandas.DataFrame
        PV system table; row 0 supplies column 1 (system capacity) and
        columns 13 / 14 (stored as ``lat`` / ``lon``).
    year, month : int
        Stored verbatim in the ``Year`` / ``Month`` columns.
    numOfDaysMonth : int
        Number of days to emit, starting at day 1.

    Returns
    -------
    pandas.DataFrame
        One row per day with the columns listed in ``columns`` below.
    """
    columns = ['Year','Month','Day','lat','lon','GHI','DHI','DNI',
               'Wind Speed','Temperature','Solar Zenith Angle',
               'Pressure','Relative Humidity',
               'Energy Gen']

    # Restrict the weather table to the requested month (values are compared as strings).
    monthRows = np.flatnonzero(weatherData[2].values == str(month))
    monthData = weatherData.iloc[monthRows]

    # Collect plain dicts and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and growing a frame row by row is quadratic.
    records = []
    for day in range(1, numOfDaysMonth + 1):
        # Weather rows for this day
        dayRows = np.flatnonzero(monthData[3].values == str(day))
        dayData = monthData.iloc[dayRows]

        # Per-panel energy for this day
        gen = outputPanel(pvSysData.iloc[0, 1], pvOutData.iloc[day - 1, 1])

        records.append({
            'Year': year, 'Month': month, 'Day': day,
            'lat': pvSysData.iloc[0, 13], 'lon': pvSysData.iloc[0, 14],
            'GHI': average(dayData, 6), 'DHI': average(dayData, 7),
            'DNI': average(dayData, 8), 'Wind Speed': average(dayData, 9),
            'Temperature': average(dayData, 10), 'Solar Zenith Angle': average(dayData, 11),
            'Pressure': average(dayData, 12), 'Relative Humidity': average(dayData, 13),
            'Energy Gen': gen,
        })

    return pd.DataFrame(records, columns=columns)

In [6]:
def average(data, col):
    """Return the arithmetic mean of one column of ``data`` as a float.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the column.
    col : int or str
        Column label, used as-is for the lookup.

    Returns
    -------
    float
        Mean of the column after converting every entry with ``float()``;
        NaN when the column has no rows.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``data``.
    ValueError
        If an entry cannot be converted to ``float``.
    """
    # Entries may be numeric strings (CSV read with header=None), so convert each one.
    values = [float(item) for item in data[col].tolist()]
    if not values:
        # No rows selected: report "no data" instead of dividing by zero.
        return float('nan')
    return sum(values) / len(values)

In [7]:
# Daily table for each site/month (site 1: May 2020, 31 days; site 2: June 2020, 30 days)
dataset_s1 = daily(weather_s1, dfout_s1, dfsys_s1, 2020, 5, 31)
dataset_s2 = daily(weather_s2, dfout_s2, dfsys_s2, 2020, 6, 30)

# Stack both sites into one training table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataset = pd.concat([dataset_s1, dataset_s2], ignore_index=True)

### Model 1: Multiple Regression Model

In [8]:
# Keep only the weather predictors and the target column
dataVal = dataset.drop(
    columns=['lat', 'lon', 'Year', 'Month', 'Day', 'Pressure', 'Solar Zenith Angle']
)

# Predictor columns to rescale
column = ['GHI', 'DHI', 'DNI', 'Wind Speed', 'Temperature', 'Relative Humidity']

# Max-abs scaling: divide each predictor by its largest absolute value.
# Done on a copy so dataVal keeps the unscaled values.
df_max_scaled = dataVal.copy()
df_max_scaled[column] = dataVal[column] / dataVal[column].abs().max()

In [9]:
# Predictors come from the max-scaled frame; the target is the per-panel energy
x = df_max_scaled.loc[:, ['GHI', 'DNI', 'Wind Speed']]
y = dataset.loc[:, 'Energy Gen']

In [10]:
## Fit a linear regression of y on x
linear_reg = linear_model.LinearRegression().fit(x, y)

# r^2 value, scored on the same rows the model was fitted to
r2 = linear_reg.score(x, y)
print('r squared: ', r2, '\nVariables: ', list(x.columns))

r squared:  0.7181800646348857 
Variables:  ['GHI', 'DNI', 'Wind Speed']


In [11]:
# Fitted coefficients of the model trained on x above
linear_reg.coef_

array([947.99321109, 618.68973121,  79.73818141])

In [12]:
# Fitted intercept term of the model trained on x above
linear_reg.intercept_

165.16409487008036

### Load in Prediction Data

In [17]:
# Read the twelve prediction-data parts (PredData_1.csv .. PredData_12.csv) in order.
pred_frames = [
    pd.read_csv(f'Data/PredictionData/PredData_{part}.csv')
    for part in range(1, 13)
]

# Stack them into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
d1 = pd.concat(pred_frames, ignore_index=True)

# 'Unnamed: 0' is dropped because it is not one of the prediction inputs.
d1 = d1.drop(columns=['Unnamed: 0'])

# Preview only the first rows; display.max_rows is unset, so showing d1 would print all of it.
d1.head(10)

Unnamed: 0,lat,lon,GHI,DHI,DNI,Wind Speed,Temperature,Pressure,Relative Humidity
0,42.796124,-124.474002,180.640183,53.793721,231.954338,1.639315,12.05089,1009.055251,82.150413
1,47.768376,-124.474002,141.106164,54.694977,165.148744,1.737785,10.177374,1002.506507,87.772402
2,48.016989,-124.474002,131.385502,54.74121,153.797717,0.845982,9.896621,1001.531621,86.860947
3,48.265601,-124.474002,133.414155,54.565753,154.143607,1.36766,10.052626,1005.158676,86.447709
4,40.061385,-123.89587,207.318836,51.478881,276.636872,0.911575,14.069943,987.13379,68.477123
5,40.309997,-123.89587,201.048858,52.320548,263.142123,1.014977,13.792363,987.883105,69.649751
6,40.55861,-123.89587,201.239726,50.81758,271.205365,0.468858,13.435639,971.701941,68.396098
7,40.807223,-123.89587,193.622146,53.632991,253.334132,0.708265,14.430719,1000.745548,66.342791
8,41.055835,-123.89587,195.742808,51.860274,259.20137,0.39274,12.837865,963.935959,69.531514
9,41.304448,-123.89587,194.978653,50.798973,260.972032,0.897363,12.471621,969.006735,72.849086


In [14]:
def PredOut(GHI,DNI,DHI,WindSpeed,Temp,Pressure,Humid):
    """Predict per-panel generation for every location in ``d1``.

    The regression ``linear_reg`` was fitted on the max-scaled columns
    ``['GHI', 'DNI', 'Wind Speed']``, so the same three inputs are used here, in
    the same order, and divided by the same training maxima (taken from
    ``dataVal``) before predicting.

    Parameters
    ----------
    GHI, DNI, WindSpeed : pandas.Series
        Model inputs, one value per row of ``d1``.
    DHI, Temp, Pressure, Humid : pandas.Series
        Accepted for interface compatibility; not used by the current model.

    Returns
    -------
    pandas.DataFrame
        Columns ``lat``, ``lon`` (taken positionally from the notebook-level
        ``d1``) and ``Predicted Generation``.

    Notes
    -----
    Relies on the notebook-level names ``linear_reg``, ``dataVal`` and ``d1``.
    """
    model_features = ['GHI', 'DNI', 'Wind Speed']

    features = pd.DataFrame({
        'GHI': GHI.to_list(),
        'DNI': DNI.to_list(),
        'Wind Speed': WindSpeed.to_list(),
    })
    rows = len(features)

    if rows == 0:
        # Nothing to predict; the estimator rejects an empty sample matrix.
        return pd.DataFrame(columns=['lat', 'lon', 'Predicted Generation'])

    # Apply the scaling used at training time: each feature divided by the
    # largest absolute value seen in the training table.
    training_max = dataVal[model_features].abs().max().astype(float)
    features = features[model_features] / training_max

    # One vectorised call instead of predicting (and appending) row by row.
    predicted = linear_reg.predict(features)

    return pd.DataFrame({
        'lat': d1['lat'].to_list()[:rows],
        'lon': d1['lon'].to_list()[:rows],
        'Predicted Generation': [float(value) for value in predicted],
    })

In [15]:
# PredOut takes seven positional series in the order
# (GHI, DNI, DHI, WindSpeed, Temp, Pressure, Humid); pass all of them in that order.
data = PredOut(d1['GHI'], d1['DNI'], d1['DHI'], d1['Wind Speed'],
               d1['Temperature'], d1['Pressure'], d1['Relative Humidity'])

In [16]:
# Save the prediction table to FinalOutData.csv (path is relative to the working directory).
data.to_csv('FinalOutData.csv')