In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:


# Load the dataset
# data = {
#     'AT': [14.96, 25.18, 5.11, 20.86, 10.82, 26.27, 15.89, 9.48, 14.64, 11.74, 17.99, 20.14, 24.34, 25.71, 26.19, 21.42, 18.21, 11.04, 14.45, 13.97, 17.76, 5.41, 7.76, 27.23, 27.36, 27.47, 14.6, 7.91, 5.81, 30.53, 23.87],
#     'V': [41.76, 62.96, 39.4, 57.32, 37.5, 59.44, 43.96, 44.71, 45, 43.56, 43.72, 46.93, 73.5, 58.59, 69.34, 43.79, 45, 41.74, 52.75, 38.47, 42.42, 40.07, 42.28, 63.9, 48.6, 70.72, 39.31, 39.96, 35.79, 65.18, 63.94],
#     'AP': [1024.07, 1020.04, 1012.16, 1010.24, 1009.23, 1012.23, 1014.02, 1019.12, 1021.78, 1015.14, 1008.64, 1014.66, 1011.31, 1012.77, 1009.48, 1015.76, 1022.86, 1022.6, 1023.97, 1015.15, 1009.09, 1019.16, 1008.52, 1014.3, 1003.18, 1009.97, 1011.11, 1023.57, 1012.14, 1012.69, 1019.02],
#     'RH': [73.17, 59.08, 92.14, 76.64, 96.62, 58.77, 75.24, 66.43, 41.25, 70.72, 75.04, 64.22, 84.15, 61.83, 87.59, 43.08, 48.84, 77.51, 63.59, 55.28, 66.26, 64.77, 83.31, 47.19, 54.93, 74.62, 72.52, 88.44, 92.28, 41.85, 44.28],
#     'PE': [463.26, 444.37, 488.56, 446.48, 473.9, 443.67, 467.35, 478.42, 475.98, 477.5, 453.02, 453.99, 440.29, 451.28, 433.99, 462.19, 467.54, 477.2, 459.85, 464.3, 468.27, 495.24, 483.8, 443.61, 436.06, 443.25, 464.16, 475.52, 484.41, 437.89, 445.11]
# }

# df = pd.DataFrame(data)
# Read the full dataset; the path is resolved relative to the working directory.
df = pd.read_csv("./CCPP_data.csv")

# Name the predictor and response columns once so the selections below
# cannot drift apart.
feature_columns = ['AT', 'V', 'AP', 'RH']
target_column = 'PE'
X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for the final evaluation. The fixed seed makes the
# partition identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:
# Model training and validation
# Both candidates are scored on the training split only, using the same
# 5-fold cross-validation settings, so the held-out test rows stay untouched.
cv_options = {'cv': 5, 'scoring': 'neg_root_mean_squared_error'}

# Candidate 1: linear regression
lr = LinearRegression()
lr_cv_scores = cross_val_score(lr, X_train, y_train, **cv_options)

# Candidate 2: random forest (seeded so repeated runs give the same scores)
rf = RandomForestRegressor(random_state=42)
rf_cv_scores = cross_val_score(rf, X_train, y_train, **cv_options)

# The scorer reports negated RMSE per fold, so flip the sign of the fold mean
# to get a positive RMSE where lower is better.
lr_cv_rmse = -lr_cv_scores.mean()
rf_cv_rmse = -rf_cv_scores.mean()

print(f"Linear Regression CV RMSE: {lr_cv_rmse}")
print(f"Random Forest CV RMSE: {rf_cv_rmse}")


Linear Regression CV RMSE: 4.572397273712407
Random Forest CV RMSE: 3.4634835889157998


In [4]:
# Model selection and evaluation
# The random forest had the lower cross-validated RMSE, so it is the model
# that gets refit on the whole training split and scored on the held-out rows.

# Fit the final model
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the model.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# raises TypeError. For this single-output target the value is identical.
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {test_rmse}")


Test RMSE: 3.2432202566089683




In [7]:
# Sanity check on the held-out features: show the (truncated) frame followed
# by its (rows, columns) shape, one per line.
print(X_test, X_test.shape, sep="\n")

         AT      V       AP     RH
2513  19.64  48.06  1014.81  74.96
9411  28.26  69.23  1013.01  42.10
8745  27.98  67.17  1007.32  75.29
9085  28.64  69.23  1013.11  37.13
4950   9.34  38.08  1019.56  67.74
...     ...    ...      ...    ...
7204  29.06  64.96  1000.88  62.07
1599   9.87  40.81  1017.17  84.25
5697   8.02  39.04  1018.49  68.07
350   26.48  69.14  1009.31  84.11
6210  15.34  71.14  1019.79  77.56

[1914 rows x 4 columns]
(1914, 4)


In [13]:
# Spot-check the fitted forest on one hand-picked observation.
# NOTE(review): these rows are taken from the dataset itself, so they may be
# part of the training split -- a close match here is a smoke test, not
# evidence of generalization. Confirm against X_train if that matters.
single_data_set = [5.41,40.07,1019.16,64.77]      # The PE is 495.24 from dataset. lets see the predicted value  
# single_data_set = [7.76,42.28,1008.52,83.31]        # The PE is 483.8 from dataset. lets see the predicted value

# scikit-learn expects a 2-D input of shape (n_samples, n_features).
single_data_set_reshape = np.array(single_data_set).reshape(1, -1)

# The model was fitted on a DataFrame, so pass the row with the same column
# labels. This avoids the "X does not have valid feature names" warning and
# makes the value-to-feature mapping explicit instead of purely positional.
single_sample = pd.DataFrame(single_data_set_reshape, columns=X_train.columns)

rf.predict(single_sample)



array([494.7362])