In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt

# Read the modelling table (feature columns plus the target column) from CSV
df = pd.read_csv(filepath_or_buffer='solar_generation_selected_features.csv')


In [3]:
# Target is the solar production column; every other column is a predictor
y = df['Solar Production (kWh)']
X = df.drop(columns=['Solar Production (kWh)'])

# Two-stage split on the unscaled DataFrames (keeps the row index intact):
#   1) hold out 20% of all rows as the test set
#   2) hold out 25% of the remaining 80% (= 20% of all rows) as the validation set
# which leaves 60% of the rows for training.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp, y_train_temp, test_size=0.25, random_state=42
)

# Record each row's split membership on the original frame.
# Everything starts as 'Train'; test and validation rows are then relabelled by index.
df['Set'] = 'Train'
for split_label, split_index in (('Test', X_test.index), ('Val', X_val.index)):
    df.loc[split_index, 'Set'] = split_label

# Learn the min/max scaling on the training rows only, then apply it to all three splits.
# NOTE(review): the model cell below fits on the unscaled X_train, so these
# scaled arrays are not used by it.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


In [4]:
# Build a 100-tree random forest with a fixed seed and train it in one chained
# call (fit() returns the estimator itself). Training uses the unscaled
# feature frame X_train.
random_forest_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the validation rows with the fitted model
y_val_pred = random_forest_model.predict(X_val)


In [5]:
# Validation-set error metrics; RMSE is the square root of the MSE
mae_val = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)
r2_val = r2_score(y_true=y_val, y_pred=y_val_pred)
mse_val = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
rmse_val = np.sqrt(mse_val)

# Report the validation scores
print(f'Random Forest Validation RMSE: {rmse_val}')
print(f'Random Forest Validation R^2: {r2_val}')
print(f'Random Forest Validation MAE: {mae_val}')

Random Forest Validation RMSE: 0.40892234155904406
Random Forest Validation R^2: 0.9342767570487084
Random Forest Validation MAE: 0.19400230572160546


In [6]:
# Score the held-out test rows with the fitted forest
y_pred = random_forest_model.predict(X_test)

# Test-set error metrics; RMSE is the square root of the MSE
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report the test scores
print(f'Random Forest Test RMSE: {rmse}')
print(f'Random Forest Test R^2: {r2}')
print(f'Random Forest Test MAE: {mae}')


Random Forest Test RMSE: 0.44187313967921366
Random Forest Test R^2: 0.9293096593286214
Random Forest Test MAE: 0.20611450511945392


In [12]:
# Run the fitted forest over every row of the feature frame X (training,
# validation and test rows alike) and store the result as a new column of df.
full_dataset_predictions = random_forest_model.predict(X)
df['Predicted Solar Production (kWh)'] = full_dataset_predictions


In [13]:
# (duplicates the sklearn.metrics import in the notebook's first cell)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Agreement between actual and predicted production over the whole frame.
# NOTE(review): the predictions cover every row, including the rows the model
# was fitted on, so these figures are not a held-out estimate. This cell also
# rebinds `mae`, `mse`, `rmse` and `r2`, replacing the test-set values
# computed in the earlier test-evaluation cell.
actual_production = df['Solar Production (kWh)']
predicted_production = df['Predicted Solar Production (kWh)']

# Mean Absolute Error (MAE)
mae = mean_absolute_error(actual_production, predicted_production)

# Mean Squared Error (MSE) and its square root (RMSE)
mse = mean_squared_error(actual_production, predicted_production)
rmse = np.sqrt(mse)

# Coefficient of determination (R²)
r2 = r2_score(actual_production, predicted_production)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')


Mean Absolute Error (MAE): 0.12353989071038252
Root Mean Squared Error (RMSE): 0.2967965383805413
R-squared (R²): 0.9668285331139966


In [14]:
# Recommend charging for rows whose predicted production is at or above the
# 75th percentile of all predictions (i.e. the top 25% of predicted values).
charge_threshold = df['Predicted Solar Production (kWh)'].quantile(q=0.75)

# Boolean mask of rows meeting the threshold, then translated to text labels
meets_charge_threshold = df['Predicted Solar Production (kWh)'].ge(charge_threshold)
df['Charge Recommendation'] = meets_charge_threshold.replace(
    {True: 'Recommended', False: 'Not Recommended'}
)


In [15]:
# Save the frame (inputs, split labels, predictions, recommendations) to CSV;
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='solar_predictions_with_recommendations.csv', index=False)


In [16]:
# (pandas is already imported in the notebook's first cell; repeated here)
import pandas as pd

# Read the Excel workbook into its own frame
df1 = pd.read_excel(io='Solar_Weather_FInal_Dataset.xlsx')

# Print the first five rows to inspect the layout, in particular how the
# datetime column is formatted
print(df1.head(n=5))


      name             datetime  temp  feelslike   dew  humidity  precip  \
0  Manteca  2023-08-01T00:00:00  74.5       74.5  47.8     38.80     0.0   
1  Manteca  2023-08-01T01:00:00  73.5       73.5  48.8     41.62     0.0   
2  Manteca  2023-08-01T02:00:00  71.7       71.7  49.9     46.29     0.0   
3  Manteca  2023-08-01T03:00:00  70.6       70.6  51.0     49.91     0.0   
4  Manteca  2023-08-01T04:00:00  67.9       67.9  52.7     58.26     0.0   

   precipprob preciptype  snow  ...  severerisk  conditions         icon  \
0           0        NaN     0  ...          10       Clear  clear-night   
1           0        NaN     0  ...          10       Clear  clear-night   
2           0        NaN     0  ...          10       Clear  clear-night   
3           0        NaN     0  ...          10       Clear  clear-night   
4           0        NaN     0  ...          10       Clear  clear-night   

                                            stations  Home Usage (kWh)  \
0  CI070,KSC

In [17]:
# Print the row count of each frame side by side so they can be compared by eye
print(len(df), len(df1))

5856 5856


In [18]:
# Copy the datetime column from df1 to df, row by row.
#
# A plain `df['datetime'] = df1['datetime']` aligns on index labels, so frames
# of different length (or with different labels) would silently produce NaN or
# shifted timestamps. Enforce the row-count match and copy by position instead.
# NOTE(review): this assumes both frames list the same rows in the same
# order — only the row counts are verified; confirm against the data source.
if len(df) != len(df1):
    raise ValueError(
        f'Row count mismatch: df has {len(df)} rows but df1 has {len(df1)}; '
        'cannot copy the datetime column positionally.'
    )
df['datetime'] = df1['datetime'].to_numpy()


In [19]:
# Write the export again so the saved CSV also carries the `datetime` column;
# this overwrites the file written by the earlier export cell.
df.to_csv(path_or_buf='solar_predictions_with_recommendations.csv', index=False)

In [20]:
# Print the first five rows of the updated frame to check the datetime column
print(df.head(n=5))


   temp   dew  humidity  solarradiation  windspeed  cloudcover  visibility  \
0  74.5  47.8     38.80               0        7.8         0.0         9.9   
1  73.5  48.8     41.62               0        8.6         0.0         9.9   
2  71.7  49.9     46.29               0        4.8         0.0         9.9   
3  70.6  51.0     49.91               0        6.9         0.0         9.8   
4  67.9  52.7     58.26               0        5.5         0.0         9.9   

   hour  day_of_week  month  ...  conditions_Overcast  \
0     0            1      8  ...                False   
1     1            1      8  ...                False   
2     2            1      8  ...                False   
3     3            1      8  ...                False   
4     4            1      8  ...                False   

   conditions_Partially cloudy  conditions_Rain  conditions_Rain, Overcast  \
0                        False            False                      False   
1                        False  

In [21]:
# Read the exported CSV back from disk and print its first five rows to check
# that the datetime column was written
df_check = pd.read_csv(filepath_or_buffer='solar_predictions_with_recommendations.csv')
print(df_check.head(n=5))


   temp   dew  humidity  solarradiation  windspeed  cloudcover  visibility  \
0  74.5  47.8     38.80               0        7.8         0.0         9.9   
1  73.5  48.8     41.62               0        8.6         0.0         9.9   
2  71.7  49.9     46.29               0        4.8         0.0         9.9   
3  70.6  51.0     49.91               0        6.9         0.0         9.8   
4  67.9  52.7     58.26               0        5.5         0.0         9.9   

   hour  day_of_week  month  ...  conditions_Overcast  \
0     0            1      8  ...                False   
1     1            1      8  ...                False   
2     2            1      8  ...                False   
3     3            1      8  ...                False   
4     4            1      8  ...                False   

   conditions_Partially cloudy  conditions_Rain  conditions_Rain, Overcast  \
0                        False            False                      False   
1                        False  