In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import Data

In [None]:
# Load the 2022 train/test splits
from google.colab import drive
drive.mount('/content/drive')

# Option 1: read the workbooks straight from the mounted Google Drive
# X_train = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2022_Train_Test/X_train.xlsx')
# X_test = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2022_Train_Test/X_test.xlsx')
# y_train = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2022_Train_Test/y_train.xlsx')
# y_test = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2022_Train_Test/y_test.xlsx')

# Option 2 (active): read copies of the workbooks from the working directory.
# The files are read in the listed order and unpacked into the four splits.
X_train, X_test, y_train, y_test = map(
    pd.read_excel,
    ['X_train.xlsx', 'X_test.xlsx', 'y_train.xlsx', 'y_test.xlsx'],
)

Mounted at /content/drive


In [None]:
# Report the (rows, columns) shape of every split
for label, frame in (
    ('X_train Shape:', X_train),
    ('X_test Shape:', X_test),
    ('y_train Shape:', y_train),
    ('y_test Shape:', y_test),
):
    print(label, frame.shape)

X_train Shape: (21024, 5)
X_test Shape: (5256, 5)
y_train Shape: (21024, 1)
y_test Shape: (5256, 1)


# Create and Test model

In [None]:
# Import model
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training features only, then use those
# training means to fill missing values in both the training and testing features.
imputer = SimpleImputer(strategy='mean').fit(X_train)

# Note: X_train / X_test are rebound to the imputed results from here on.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [None]:
# Implement model on combined data
# Initiate model: linear-kernel support vector regressor
model = SVR(kernel='linear', C=1, epsilon=0.1)
# Train model
# y_train is loaded as a one-column table (shape (n, 1)); SVR expects a 1-D target
# of shape (n,). Flattening it explicitly avoids scikit-learn's
# "column-vector y was passed when a 1d array was expected" conversion warning.
model.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


In [None]:
# Score the fitted model on the data it was trained on (training accuracy)
y_train_pred = model.predict(X_train)
r2_train = r2_score(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)

In [None]:
# Show the training-set error and goodness of fit
for label, value in (('Training MSE:', mse_train), ('Training R^2:', r2_train)):
    print(label, value)

In [None]:
# Score the model on the held-out test split (testing accuracy - this one matters more!)
y_test_pred = model.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)

In [None]:
# Show the test-set error and goodness of fit
for label, value in (('Testing MSE:', mse_test), ('Testing R^2:', r2_test)):
    print(label, value)

# Results recorded from earlier runs (South):
#   C = 1,   epsilon = 0.1
#     Testing MSE South: 3.7014322456512994
#     Testing R^2 South: 0.9976774081484334
#   C = 0.1, epsilon = 0.01
#     Testing MSE South: 4.370076051378616
#     Testing R^2 South: 0.9972578444358713

In [None]:
# Visualize model's results: actual vs. predicted values on the test split
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_test_pred, alpha=0.5)

# Dashed diagonal (y = x) spanning the range of actual values; points lying on
# it are predictions that match the actual value exactly.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], 'k--', lw=2)

ax.set_title('SVM - Combined Data')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

# Predicting with 2023 Data

In [None]:
# Load the 2023 splits from the saved location on Google Drive
from google.colab import drive
drive.mount('/content/drive')

# South - feature tables
X_train_23 = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2023 Data/X_train_23.xlsx')
X_test_23 = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2023 Data/X_test_23.xlsx')

# South - targets: keep only the 'Energy' column and hand it on as a 1-D array
# (per the original note, this works around an extra column in the saved files).
y_train_23 = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2023 Data/y_train_23.xlsx')['Energy'].values
y_test_23 = pd.read_excel('/content/drive/MyDrive/Machine Learning/Project/2023 Data/y_test_23.xlsx')['Energy'].values

In [None]:
# Predict data
# The model was trained on features that went through `imputer`, so the 2023
# features must get the same preprocessing: missing values are filled with the
# training-set column means instead of reaching SVR.predict as NaN (which raises).
# The result goes into a new variable so X_test_23 stays a DataFrame with its
# named columns for later cells.
# NOTE(review): scikit-learn checks the 2023 column names against the columns the
# imputer was fitted on - confirm both workbooks use the same feature columns.
X_test_23_imputed = imputer.transform(X_test_23)

y_pred_23 = model.predict(X_test_23_imputed)  # Use X_test_23 (imputed) for prediction
mse_test_23 = mean_squared_error(y_test_23, y_pred_23)  # Use y_test_23 for evaluation
r2_test_23 = r2_score(y_test_23, y_pred_23)
print('Test MSE South 2023:', mse_test_23)
print('Test R^2 South 2023:', r2_test_23)

In [None]:
# Visualize model's results: actual vs. predicted values for the 2023 test data
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test_23, y_pred_23, alpha=0.5)

# Dashed diagonal (y = x) spanning the range of actual values; points lying on
# it are predictions that match the actual value exactly.
lo, hi = y_test_23.min(), y_test_23.max()
ax.plot([lo, hi], [lo, hi], 'k--', lw=2)

ax.set_title('SVM - Combined Data for 2023')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

# Plotting Energy vs Hour

In [None]:
# Plotting Energy vs Hour
# Uses the 2023 actual values (y_test_23), the predictions (y_pred_23) and the
# 'Hour' column of X_test_23.

# Pair each test row's hour with its actual and predicted value, then order the
# rows by hour so the x-axis runs left to right.
results_df = pd.DataFrame(
    {'Hour': X_test_23['Hour'], 'Actual': y_test_23, 'Predicted': y_pred_23}
).sort_values(by=['Hour'])

# Draw both series against the hour on one set of axes
fig, ax = plt.subplots(figsize=(12, 6))
for column, marker in (('Actual', 'o'), ('Predicted', 'x')):
    ax.plot(results_df['Hour'], results_df[column], label=column, marker=marker)
ax.set_xlabel('Hour')
ax.set_ylabel('Solar Energy')
ax.set_title('SVM - Energy Prediction 2023')
ax.legend()
ax.grid(True)
ax.set_xticks(range(24))  # One tick per hour, 0-23
plt.show()

# Tune to find best parameters

In [None]:
# Grid search over C and epsilon for the linear-kernel SVR, scored by negative MSE
# with 5-fold cross-validation.
# NOTE(review): every statement in this cell is commented out, so no search runs;
# remove the leading '#' from the code lines below to enable it.
#from sklearn.model_selection import GridSearchCV

#param_grid = {'C': [1, 10, 100], # Regularization parameter
#              'epsilon': [0.01, 0.1, 1]}

# Create and train GridSearch object
#grid_search = GridSearchCV(SVR(kernel='linear'), param_grid, cv=5, scoring='neg_mean_squared_error')
#grid_search.fit(X_train, y_train)

In [None]:
# Get best hyperparameters
# NOTE(review): disabled; these lines read `grid_search`, which is only created
# by the commented-out grid-search code, so enable that first.
#best_params = grid_search.best_params_
#print("Best Hyperparameters:", best_params)
#best_model = grid_search.best_estimator_

In [None]:
# Evaluate the tuned model on the test split (MSE and R-squared).
# NOTE(review): disabled; these lines read `best_model`, which is only created
# by the commented-out best-hyperparameters code.
#y_pred = best_model.predict(X_test)
#mse = mean_squared_error(y_test, y_pred)
#r2 = r2_score(y_test, y_pred)
#print(f"Mean Squared Error: {mse}")
#print(f"R-squared: {r2}")