In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [28]:
data = pd.read_excel("estimate_power_consumption.xlsx")
data.head()

Unnamed: 0,unit_no,maxa,maxr,active,reactive,consumption
0,1,1322,1043,26815.0,568.0,264.452105
1,2,1316,1032,24773.0,253.0,39.563698
2,4,1345,1005,26820.0,112.0,191.193007
3,6,1336,1013,25470.0,150.0,185.514004
4,1,1320,1038,26875.0,636.0,216.559192


In [29]:
data.isnull().sum()

unit_no        0
maxa           0
maxr           0
active         0
reactive       0
consumption    0
dtype: int64

In [30]:
X = data[["unit_no","maxa","maxr","active","reactive"]]
y = data[["consumption"]]
y

Unnamed: 0,consumption
0,264.452105
1,39.563698
2,191.193007
3,185.514004
4,216.559192
...,...
994,2260.499827
995,2399.545032
996,165.448196
997,174.534605


In [31]:
X

Unnamed: 0,unit_no,maxa,maxr,active,reactive
0,1,1322,1043,26815.000000,568.000000
1,2,1316,1032,24773.000000,253.000000
2,4,1345,1005,26820.000000,112.000000
3,6,1336,1013,25470.000000,150.000000
4,1,1320,1038,26875.000000,636.000000
...,...,...,...,...,...
994,8,1340,1007,27237.512695,125.000000
995,9,1347,1007,27490.889893,145.016361
996,1,1314,1035,26708.000000,262.000000
997,2,1309,1022,26580.000000,232.000000


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [33]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [34]:
# Define Gradient Boosting Regression model
gb_model = GradientBoostingRegressor()

In [35]:
# Define hyperparameters to tune
params = {
'learning_rate': [0.01, 0.1, 1],
'max_depth': [3, 5, 7],
'n_estimators': [50, 100, 200]
}

In [36]:
# Perform Grid Search Cross Validation
grid_search = GridSearchCV(gb_model, params, cv=5)
grid_search.fit(X_train_scaled, y_train)

In [37]:
# Print the best hyperparameters 
print(grid_search.best_params_)

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}


In [38]:
gb_model = GradientBoostingRegressor(n_estimators=200,learning_rate=0.1,max_depth=5,random_state=42)
gb_model.fit(X_train_scaled,y_train)
y_pred=gb_model.predict(X_test_scaled)

In [39]:
mse=mean_squared_error(y_pred,y_test)
r2=r2_score(y_pred,y_test)
print("Mean squared error:",mse)
print("R-squared:",r2)

Mean squared error: 20623.268419224412
R-squared: 0.9740684787133534


In [40]:
y_pred = gb_model.predict([[45,1,30,1,220]])
print(y_pred[0])

2265.7075675124565


In [None]:
# Define Gradient Boosting Regression model
gb_model2 = GradientBoostingRegressor()
# Perform Grid Search Cross Validation
grid_search = GridSearchCV(gb_model2, params, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

In [None]:
gb_model2 = GradientBoostingRegressor(n_estimators=50,learning_rate=0.1,max_depth=5,random_state=42)
gb_model2.fit(X_train,y_train)
y_pred=gb_model2.predict(X_test)

In [None]:
mse=mean_squared_error(y_pred,y_test)
r2=r2_score(y_pred,y_test)
print("Mean squared error:",mse)
print("R-squared:",r2)

In [None]:
y_pred = gb_model2.predict([[45,1,30,1,220]])
print(y_pred[0])

# Trying dimention reduction

In [None]:
# Apply PCA to reduce the number of features to 3
pca = PCA(n_components=X.shape[1])
X_train_pca = pca.fit_transform(X_train)

In [None]:
plt.figure(figsize=(8,6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlim(0,14,1)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [None]:
cumulative=np.cumsum(pca.explained_variance_ratio_)
plt.step([i for i in range(len(cumulative))],cumulative)
plt.show()

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(X_train.corr(),vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},annot=True)
plt.tight_layout()
plt.show()

In [None]:
pca=PCA().fit(X_train)
print(pca.explained_variance_ratio_)
print()
print(X_train.columns.values.tolist())

In [None]:
pipeline = Pipeline([
    ('pca', PCA(n_components=X.shape[1])),
    ('gbr', GradientBoostingRegressor(n_estimators=50,learning_rate=0.1,max_depth=5,random_state=42))
])

In [None]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [None]:
# Calculate Mean Squared Error (MSE) of the predictions 
mse=mean_squared_error(y_pred,y_test)
r2=r2_score(y_pred,y_test)
print("Mean squared error:",mse)
print("R-squared:",r2)