## `Combined Cycle Power Plant` 

### Importing libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
    )
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor


import pandas as pd
import numpy as np
import time

### Preprocessing

In [2]:
# Load the combined cycle power plant dataset
ccpp = pd.read_csv('../datasets/original/ccpp.csv')

# Split the dataset into features and target variable
X = ccpp.drop('PE', axis=1)
y = ccpp['PE']

# Split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Normalize the data
scaler = MinMaxScaler()

variables_to_normalize = ['AT', 'V', 'AP', 'RH']

# Fit transform varirables AT, V, AP, RH
X_train[variables_to_normalize] = scaler.fit_transform(X_train[variables_to_normalize]) 
X_test[variables_to_normalize] = scaler.transform(X_test[variables_to_normalize])

X_train
 

Unnamed: 0,AT,V,AP,RH
2058,0.464589,0.289493,0.005444,0.492425
8042,0.839377,0.712879,0.523385,0.316262
7612,0.314448,0.297337,0.506805,0.616168
1166,0.386969,0.385261,0.366246,0.878268
3597,0.838244,0.751186,0.263796,0.431291
...,...,...,...,...
5734,0.692351,0.650493,0.459787,0.612146
5191,0.775071,0.719263,0.582529,0.475801
5390,0.458640,0.334549,0.557040,0.478885
860,0.705949,0.793141,0.441970,0.553291


## Tests

## Fining optimal K

In [3]:
# Optimal K with cross-validation cross_val_score
k_values = list(range(1, 30))
cross_val_scores = []

for k in k_values:
    knn = KNeighborsRegressor(n_neighbors=k, weights='distance')
    cross_val_scores.append(cross_val_score(knn, X_train, y_train, cv=10).mean())
    
# optimal_k = k_values[np.argmax(cross_val_scores)]


# Get the index of the top 3 scores
top_3_scores = np.argsort(cross_val_scores)[::-1][:3]

# Get the k values for the top 3 scores
top_3_k_values = [k_values[i] for i in top_3_scores]

top_3_k_values

[8, 9, 7]

## Attributes importance

In [4]:
# Train a Random Forest model to get which Features are more important
model = RandomForestRegressor(max_leaf_nodes=2,
                        max_features=5,
                        max_depth=5,
                        random_state=42)

# Fit the model to the data
model.fit(X_train, y_train)

# Get the most important Features
most_important_attributes = pd.DataFrame(
                            model.feature_importances_,
                            index = X_train.columns,
                            columns=['importance']
                        ).sort_values('importance', ascending=False)


most_important_attributes


Unnamed: 0,importance
AT,1.0
V,0.0
AP,0.0
RH,0.0


## KNN - sklearn

In [5]:
knn = KNeighborsRegressor(n_neighbors=6, weights='uniform')
knn.fit(X_train, y_train)

runtime_start = time.process_time()
runtime_end = time.process_time()

y_pred = knn.predict(X_test)
runtime = runtime_end - runtime_start

# Calculate the metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, mse, r2, runtime

(2.715487040133779,
 13.661394512959868,
 0.9525809558491174,
 2.4746000000241963e-05)

## Gradient Descent - sklearn

In [6]:
gd = SGDRegressor()

start = time.process_time()
gd.fit(X_train, y_train)
gd_pred = gd.predict(X_test)
end = time.process_time()

gd_runtime = end - start

gd_mae = mean_absolute_error(y_test, gd_pred)
gd_mse = mean_squared_error(y_test, gd_pred)
gd_r2 = r2_score(y_test, gd_pred)

gd_mae, gd_mse, gd_r2, gd_runtime

(3.6079188417051173,
 20.193003701335524,
 0.9299095759847794,
 0.1757067999999995)

## Decision Tree Regression - sklearn

In [7]:
dtr = DecisionTreeRegressor()

start = time.process_time()
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)
end = time.process_time()

dtr_runtime = end - start

dtr_mae = mean_absolute_error(y_test, dtr_pred)
dtr_mse = mean_squared_error(y_test, dtr_pred)
dtr_r2 = r2_score(y_test, dtr_pred)

dtr_mae, dtr_mse, dtr_r2, dtr_runtime

(3.057027591973244, 20.988553219063537, 0.9271482035883106, 0.9549089090000002)