In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

In [None]:
# Load the hourly PJME load data. Parse 'Datetime' into real timestamps at
# load time so plotting and date arithmetic operate on datetimes, not strings.
data = pd.read_csv("PJME_hourly.csv", parse_dates=["Datetime"])


In [None]:
# Preview the loaded frame as this cell's rich output.
data

Unnamed: 0,Datetime,PJME_MW
0,2002-12-31 01:00:00,26498.0
1,2002-12-31 02:00:00,25147.0
2,2002-12-31 03:00:00,24574.0
3,2002-12-31 04:00:00,24393.0
4,2002-12-31 05:00:00,24860.0
...,...,...
145361,2018-01-01 20:00:00,44284.0
145362,2018-01-01 21:00:00,43751.0
145363,2018-01-01 22:00:00,42402.0
145364,2018-01-01 23:00:00,40164.0


In [None]:
# Plot the full hourly load history.
# pd.to_datetime guarantees a true datetime x-axis even if the column holds
# strings (matplotlib would otherwise treat every string as its own category),
# and sorting makes the line run in time order regardless of file order.
hourly_load = (
    data.assign(Datetime=pd.to_datetime(data['Datetime']))
        .sort_values('Datetime')
)

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(hourly_load['Datetime'], hourly_load['PJME_MW'])
ax.set_title('Hourly Energy Consumption')
ax.set_xlabel('Date')
ax.set_ylabel('Energy Consumption (MW)')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

# Index the observations by timestamp, in chronological order. A DatetimeIndex
# is what the later cells rely on when they read `.index.hour` etc., and it
# keeps the raw timestamp out of the feature matrix handed to the model.
energy = (
    data.assign(Datetime=pd.to_datetime(data['Datetime']))
        .set_index('Datetime')
        .sort_index()
)
X = energy.drop(columns='PJME_MW')
y = energy['PJME_MW']

# Time-based hold-out: shuffle=False keeps row order, so the earliest 70% of
# the hours form the training set and the latest 30% the test set. A shuffled
# split would let the model train on hours that come after the ones it is
# evaluated on. (random_state has no effect when shuffle=False, so it is omitted.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

In [None]:
# Overlay both partitions on a single axis so the split is visible at a glance.
fig, ax = plt.subplots(figsize=(15, 6))
for label, features, target in (('Train', X_train, y_train), ('Test', X_test, y_test)):
    ax.plot(features.index, target, label=label)
ax.set_title('Train/Test Split Based on Time')
ax.set_xlabel('Date')
ax.set_ylabel('Energy Consumption (MW)')
ax.legend()
plt.show()


In [None]:
def add_time_features(frame):
    """Return a copy of `frame` with 'Hour', 'Day' and 'Month' columns added.

    Timestamps are read from the index when it is a DatetimeIndex; otherwise
    they are parsed from a 'Datetime' column.

    Raises:
        TypeError: if the frame has neither a DatetimeIndex nor a 'Datetime'
            column to derive the calendar fields from.
    """
    if isinstance(frame.index, pd.DatetimeIndex):
        stamps = frame.index
    elif 'Datetime' in frame.columns:
        stamps = pd.DatetimeIndex(pd.to_datetime(frame['Datetime']))
    else:
        raise TypeError(
            "add_time_features needs a DatetimeIndex or a 'Datetime' column"
        )
    # The calendar fields are assigned positionally, one value per row.
    return frame.assign(Hour=stamps.hour, Day=stamps.day, Month=stamps.month)


# Apply the same transformation to both partitions so their columns match.
X_train = add_time_features(X_train)
X_test = add_time_features(X_test)


In [None]:
# Distribution of load by hour of day and by month.
# `data` itself has no 'Hour'/'Month' columns, so derive them on a plotting
# copy; pd.to_datetime also covers the case where 'Datetime' holds strings.
stamps = pd.to_datetime(data['Datetime'])
seasonality = data.assign(Hour=stamps.dt.hour, Month=stamps.dt.month)

for period in ('Hour', 'Month'):
    fig, ax = plt.subplots()
    sns.boxplot(x=period, y='PJME_MW', data=seasonality, ax=ax)
    ax.set_title(f'Energy Consumption by {period}')
    plt.show()


In [None]:
# Train an XGBoost regressor, leaving every hyperparameter at its default.
model = xgb.XGBRegressor()
model.fit(X=X_train, y=y_train)


In [None]:
# Rank the input columns by the importance score stored on the fitted model.
feature_importances = (
    pd.DataFrame({'Importance': model.feature_importances_}, index=X_train.columns)
      .sort_values('Importance', ascending=False)
)
print(feature_importances)

# Score the test partition with the fitted model.
predictions = model.predict(X_test)


In [None]:
# Put features, predictions and actual values side by side.
# assign() returns a new frame, so X_test keeps exactly the columns the model
# was fitted on and the prediction cell can be re-run safely.
merged_data = pd.concat([X_test.assign(Predictions=predictions), y_test], axis=1)


In [None]:
# Signed error: positive means the model under-predicted the actual load.
merged_data['Error'] = merged_data['PJME_MW'] - merged_data['Predictions']
# Rank by magnitude. Ranking the signed error would label the largest
# over-predictions (most negative errors) as the "best" rows.
merged_data['AbsError'] = merged_data['Error'].abs()

best_predictions = merged_data.nsmallest(5, 'AbsError')
worst_predictions = merged_data.nlargest(5, 'AbsError')
print("Best Predictions:")
print(best_predictions)
print("\nWorst Predictions:")
print(worst_predictions)
