In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel("Estimate_power2.xlsx")
data.head()

Unnamed: 0,hour,unit_no,fuel_type,temp,moisture,power,mvar
0,1,1,A,3,94,250,22
1,2,1,A,3,96,250,23
2,3,1,A,2,95,250,23
3,4,1,A,2,95,251,25
4,5,1,A,2,95,252,26


In [3]:
data=data[data["power"]>100]
data.shape

(999, 7)

In [4]:
# Change object to integer:
data["fuel_type"][data["fuel_type"]=="A"] = 1; 
data["fuel_type"][data["fuel_type"]=="B"] = 2;
data["fuel_type"][data["fuel_type"]=="C"] = 3;

In [5]:
data['fuel_type'].value_counts()

1    711
Name: fuel_type, dtype: int64

In [6]:
data["fuel_type"].fillna(method='bfill', inplace=True)

In [7]:
data.isnull().sum()

hour         0
unit_no      0
fuel_type    0
temp         0
moisture     0
power        0
mvar         0
dtype: int64

In [8]:
X = data[["moisture","unit_no","temp","fuel_type","power"]]
y = data[["mvar"]]
y

Unnamed: 0,mvar
0,22
1,23
2,23
3,25
4,26
...,...
994,15
995,15
996,14
997,16


In [9]:
X

Unnamed: 0,moisture,unit_no,temp,fuel_type,power
0,94,1,3,1,250
1,96,1,3,1,250
2,95,1,2,1,250
3,95,1,2,1,251
4,95,1,2,1,252
...,...,...,...,...,...
994,6,6,15,1,224
995,8,6,16,1,240
996,21,6,18,1,238
997,11,6,19,1,236


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [12]:
# Training the Decision Tree Regression model on the whole dataset
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred=rf.predict(X_test)

In [13]:
mse=mean_squared_error(y_pred,y_test)
r2=r2_score(y_pred,y_test)
print("Mean squared error:",mse)
print("R-squared:",r2)

Mean squared error: 19.127575229748864
R-squared: 0.9193483505875892


In [14]:
rf.fit(X_train_scaled, y_train)
y_pred=rf.predict(X_test_scaled)

In [15]:
mse=mean_squared_error(y_pred,y_test)
r2=r2_score(y_pred,y_test)
print("Mean squared error:",mse)
print("R-squared:",r2)

Mean squared error: 32.00879417940874
R-squared: 0.8636887711491308


# Tuning the hyperparameters

In [None]:
# Define the SVR model
rf = RandomForestRegressor(random_state = 0)

# Define the hyperparameters to tune
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
regressor = RandomForestRegressor(**grid_search.best_params_,random_state=42)
regressor.fit(X_train, y_train)
y_pred=regressor.predict(X_test)

In [None]:
mse=mean_squared_error(y_pred,y_test)
r2=r2_score(y_pred,y_test)
print("Mean squared error:",mse)
print("R-squared:",r2)

# Trying dimention reduction for the model

In [None]:
# Apply PCA to reduce the number of features to 3
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train)

In [None]:
plt.figure(figsize=(8,6))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlim(0,14,1)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

In [None]:
cumulative=np.cumsum(pca.explained_variance_ratio_)
plt.step([i for i in range(len(cumulative))],cumulative)
plt.show()

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(X_train.corr(),vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},annot=True)
plt.tight_layout()
plt.show()

In [None]:
pca=PCA().fit(X_train)
print(pca.explained_variance_ratio_)
print()
print(X_train.columns.values.tolist())

In [None]:
pipeline = Pipeline([
    ('pca', PCA(n_components=3)),
    ('rf', RandomForestRegressor(bootstrap= True, max_depth= 80, max_features= 3, min_samples_leaf= 3, min_samples_split= 8, n_estimators= 1000))
])

In [None]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [None]:
# Calculate Mean Squared Error (MSE) of the predictions 
mse=mean_squared_error(y_pred,y_test)
r2=r2_score(y_pred,y_test)
print("Mean squared error:",mse)
print("R-squared:",r2)