In [1]:
# Initial imports.
from urllib.parse import quote

import pandas as pd
import seaborn as sns
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine, func

from Config import password


In [2]:
# Build the PostgreSQL engine for the solar_final database.
# The password is percent-encoded first so that characters with a special
# meaning inside a URL ("@", ":", "/", "%", ...) cannot corrupt the
# connection string.
alchemy_engine = create_engine(
    f'postgresql://postgres:{quote(str(password), safe="")}@localhost:5432/solar_final'
)

In [3]:
# Data for the model: every row of solar_data_v2 whose "InvVDCin_Avg" is
# positive, with rows containing any null value removed.
solar_query = 'SELECT * FROM solar_data_v2 AS sd WHERE "InvVDCin_Avg" > 0;'
input_df = pd.read_sql_query(solar_query, alchemy_engine).dropna()

input_df

Unnamed: 0,TIMESTAMP,Year,Month,Day,Hour,Minute,GHI,DHI,DNI,Wind Speed,Temperature,Cloud Type,Solar Zenith Angle,Surface Albedo,Wind Direction,Pressure,Relative Humidity,Precipitable Water,InvVDCin_Avg,Array_Tilt
0,2015-01-01 00:30:00,2015.0,1.0,1.0,0.0,30.0,0.0,0.0,0.0,1.6,-3.0,0.0,163.45,0.129,254.0,1000.0,73.50,0.381,21.00,5
1,2015-01-01 02:00:00,2015.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.7,-3.0,0.0,152.03,0.129,235.4,1000.0,73.72,0.380,20.96,5
2,2015-01-01 03:30:00,2015.0,1.0,1.0,3.0,30.0,0.0,0.0,0.0,1.6,-3.0,0.0,135.31,0.129,234.2,1000.0,74.25,0.381,21.16,5
3,2015-01-01 04:00:00,2015.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,1.6,-4.0,0.0,129.51,0.129,232.1,1000.0,80.47,0.382,20.72,5
4,2015-01-01 10:30:00,2015.0,1.0,1.0,10.0,30.0,416.0,64.0,885.0,3.7,1.0,0.0,66.57,0.129,233.7,1000.0,51.62,0.424,384.20,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158130,2018-03-18 10:00:00,2018.0,3.0,18.0,10.0,0.0,679.0,78.0,951.0,3.2,6.8,0.0,50.77,0.130,341.0,1002.0,0.70,42.070,363.60,10
158131,2018-03-18 11:30:00,2018.0,3.0,18.0,11.0,30.0,827.0,80.0,995.0,2.8,9.3,0.0,41.34,0.130,334.0,1001.0,0.70,38.250,344.30,10
158132,2018-03-18 12:00:00,2018.0,3.0,18.0,12.0,0.0,845.0,80.0,1000.0,2.7,10.0,0.0,40.09,0.130,331.0,1001.0,0.70,38.780,335.00,10
158133,2018-03-18 17:30:00,2018.0,3.0,18.0,17.0,30.0,122.0,38.0,552.0,0.8,8.2,0.0,81.28,0.130,300.0,998.0,0.70,66.090,370.60,10


In [4]:
# Rename columns: replace embedded spaces with underscores and expose
# "InvVDCin_Avg" as "Voltage_Output".
# rename() without inplace returns a new frame, so input_df is no longer
# modified through a shared reference and keeps the labels it was loaded with.
solar_df = input_df.rename(columns={
    "InvVDCin_Avg": "Voltage_Output",
    'Wind Speed': 'Wind_Speed',
    'Cloud Type': 'Cloud_Type',
    'Solar Zenith Angle': 'Solar_Zenith_Angle',
    'Surface Albedo': 'Surface_Albedo',
    'Wind Direction': 'Wind_Direction',
    'Relative Humidity': 'Relative_Humidity',
    'Precipitable Water': 'Precipitable_Water',
})

In [5]:
# Display the column labels to confirm the rename took effect.
solar_df.columns

Index(['TIMESTAMP', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'GHI', 'DHI',
       'DNI', 'Wind_Speed', 'Temperature', 'Cloud_Type', 'Solar_Zenith_Angle',
       'Surface_Albedo', 'Wind_Direction', 'Pressure', 'Relative_Humidity',
       'Precipitable_Water', 'Voltage_Output', 'Array_Tilt'],
      dtype='object')

In [6]:
# Display the dtype of every column.
solar_df.dtypes

TIMESTAMP             datetime64[ns]
Year                         float64
Month                        float64
Day                          float64
Hour                         float64
Minute                       float64
GHI                          float64
DHI                          float64
DNI                          float64
Wind_Speed                   float64
Temperature                  float64
Cloud_Type                   float64
Solar_Zenith_Angle           float64
Surface_Albedo               float64
Wind_Direction               float64
Pressure                     float64
Relative_Humidity            float64
Precipitable_Water           float64
Voltage_Output               float64
Array_Tilt                     int64
dtype: object

In [7]:
# # Create a pairplot to visualize the relationship between features
# sns.pairplot(input_df[['GHI', 'DHI', 'DNI', 'Temperature', 'Solar_Zenith_Angle', 'Hour','Month', 'Precipitable_Water', 'Year', 'Cloud_Type']], diag_kind='kde')

In [8]:
# input_df = input_df.merge(encode_df,left_index=True, right_index=True)

# Remove the timestamp and its calendar components from the modelling frame.
# The labels are passed via `columns=` because the positional `axis` argument
# of DataFrame.drop is no longer accepted by pandas 2.0 and later.
final_df = solar_df.drop(columns=["TIMESTAMP", "Year", "Month", "Day", "Hour", "Minute"])

# Show the mean and standard deviation of each remaining column.
final_df.describe().transpose()[['mean', 'std']]

Unnamed: 0,mean,std
GHI,201.062986,268.49099
DHI,76.016612,100.85142
DNI,223.409844,319.9315
Wind_Speed,2.041014,1.242326
Temperature,12.788201,10.67267
Cloud_Type,3.392997,3.166408
Solar_Zenith_Angle,85.021773,36.174497
Surface_Albedo,0.180285,0.176899
Wind_Direction,208.438487,98.035869
Pressure,998.45349,8.059151


In [9]:
# Count how many rows exist for each distinct Array_Tilt value.
final_df["Array_Tilt"].value_counts()


5     69126
20    62298
10    26231
Name: Array_Tilt, dtype: int64

In [10]:
# Feature matrix: every column of final_df except the target column.
# drop() returns a new frame, so final_df itself is left untouched.
X = final_df.drop(columns=["Voltage_Output"])
X.head()

Unnamed: 0,GHI,DHI,DNI,Wind_Speed,Temperature,Cloud_Type,Solar_Zenith_Angle,Surface_Albedo,Wind_Direction,Pressure,Relative_Humidity,Precipitable_Water,Array_Tilt
0,0.0,0.0,0.0,1.6,-3.0,0.0,163.45,0.129,254.0,1000.0,73.5,0.381,5
1,0.0,0.0,0.0,1.7,-3.0,0.0,152.03,0.129,235.4,1000.0,73.72,0.38,5
2,0.0,0.0,0.0,1.6,-3.0,0.0,135.31,0.129,234.2,1000.0,74.25,0.381,5
3,0.0,0.0,0.0,1.6,-4.0,0.0,129.51,0.129,232.1,1000.0,80.47,0.382,5
4,416.0,64.0,885.0,3.7,1.0,0.0,66.57,0.129,233.7,1000.0,51.62,0.424,5


In [11]:
# Define the target set as a one-dimensional NumPy array.
# to_numpy() is used instead of Series.ravel(), which is deprecated as of
# pandas 2.2.
y = final_df["Voltage_Output"].to_numpy()
y[:5]

array([ 21.  ,  20.96,  21.16,  20.72, 384.2 ])

In [12]:
# Partition features and target into train and test subsets; the fixed
# random_state keeps the split the same on every run.
split_parts = train_test_split(X, y, random_state=24)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Build the scaler and fit it on the training features only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [14]:
# Create a random forest regressor (the target is a continuous value, so this
# is a regression model, not a classifier) with 500 estimators and a fixed
# random_state.
rf_model = RandomForestRegressor(n_estimators=500, random_state=24)

In [15]:
# Fit the model on the scaled training features and the training targets.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [16]:
# Make predictions for the scaled testing features.
predictions = rf_model.predict(X_test_scaled)

In [17]:
# # Calculating the confusion matrix.
# cm = confusion_matrix(y_test, predictions)

# # Create a DataFrame from the confusion matrix.
# cm_df = pd.DataFrame(
#     cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# cm_df

In [18]:
# # Calculating the accuracy score.
# acc_score = accuracy_score(y_test, predictions)

In [19]:
# # Displaying results
# print("Confusion Matrix")
# display(cm_df)
# print(f"Accuracy Score : {acc_score}")
# print("Classification Report")
# print(classification_report(y_test, predictions))

In [20]:
# Calculate feature importance in the Random Forest model.
# Pair the importance scores with the feature names; as a DataFrame this has
# one row of scores and one row of names.
importances = rf_model.feature_importances_, X.columns
importancesdf = pd.DataFrame(importances)

# Transpose so that each feature becomes a row (column 0 = importance,
# column 1 = feature name). `.T` is used instead of DataFrame.swapaxes, which
# is deprecated since pandas 2.1.
importances_df = importancesdf.T
importances_df

Unnamed: 0,0,1
0,0.00304,GHI
1,0.007871,DHI
2,0.00072,DNI
3,0.002461,Wind_Speed
4,0.00865,Temperature
5,0.001608,Cloud_Type
6,0.954227,Solar_Zenith_Angle
7,0.002289,Surface_Albedo
8,0.003032,Wind_Direction
9,0.001012,Pressure


In [21]:
# Rank the (importance, feature name) pairs from most to least important.
ranked_features = list(zip(rf_model.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.9542273363086936, 'Solar_Zenith_Angle'),
 (0.00864982233109954, 'Temperature'),
 (0.00787080093780368, 'DHI'),
 (0.007538599066844303, 'Array_Tilt'),
 (0.003928061591391135, 'Precipitable_Water'),
 (0.0036218847381635014, 'Relative_Humidity'),
 (0.0030404614717412484, 'GHI'),
 (0.003032217224071133, 'Wind_Direction'),
 (0.0024613471083437394, 'Wind_Speed'),
 (0.002289417010835433, 'Surface_Albedo'),
 (0.0016075037099811185, 'Cloud_Type'),
 (0.001012487135239721, 'Pressure'),
 (0.0007200613657919379, 'DNI')]