# Model training:

Simple Train-Test Split (80-20%)  GA with k-fold (k=5).

MAE

In [1]:
import pandas as pd
import numpy as np
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import pickle

# Load your dataset
df = pd.read_excel('Dataset_11_features.xlsx')

# Prepare data
X = df.drop(['NAM', 'REP', 'VAR', 'OLA', 'CT', 'CWC', 'WS', 'GW', 'PP', 'GP', 'GY', 'RS'], axis='columns')
Y = df['OLA']

# Standardize the features
stdScale = StandardScaler().fit(X)
X = stdScale.transform(X)

# Split dataset into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize TPOTRegressor for hyperparameter optimization
tpot = TPOTRegressor(
    generations=10,  # Number of iterations
    population_size=20,  # Number of models per generation
    verbosity=2,
    scoring='neg_mean_absolute_error',
    random_state=42,
    cv=5  # 5-fold cross-validation
)

# Fit TPOT to the training data
tpot.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred_train = tpot.fitted_pipeline_.predict(X_train)
y_pred_test = tpot.fitted_pipeline_.predict(X_test)
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)

# Print results
print("\nBest Model Details:")
print(f"Best Model: {tpot.fitted_pipeline_}")
print(f"Best Hyperparameters: {tpot.fitted_pipeline_.get_params()}")
print(f"Training MAE: {train_mae}")
print(f"Test MAE: {test_mae}")

# Save the best model as a pickle file
with open('best_model3.pkl', 'wb') as f:
    pickle.dump(tpot.fitted_pipeline_, f)

print("\nOptimization completed.")
print("Best model saved to 'best_model3.pkl'.")


  import pkg_resources


is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor




is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor
is_classifier
is_regressor


Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.17254814224717033

Generation 2 - Current best internal CV score: -0.1716133702600194

Generation 3 - Current best internal CV score: -0.15770900961524725

Generation 4 - Current best internal CV score: -0.15468028234920755

Generation 5 - Current best internal CV score: -0.15468028234920755

Generation 6 - Current best internal CV score: -0.1541718265524599

Generation 7 - Current best internal CV score: -0.1541718265524599

Generation 8 - Current best internal CV score: -0.1541718265524599

Generation 9 - Current best internal CV score: -0.1541718265524599

Generation 10 - Current best internal CV score: -0.1541718265524599

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.8, min_samples_leaf=3, min_samples_split=3, n_estimators=100)

Best Model Details:
Best Model: Pipeline(steps=[('randomforestregressor',
                 RandomForestRegressor(max_features=0.8, min_samples_leaf=3,
                  

# Interpretation of R² Score:

R² ≈ 1 → Model predicts perfectly.

R² > 0.8 → Strong prediction.

R² ≈ 0.5-0.8 → Moderate prediction.

R² < 0.5 → Weak prediction.

R² < 0 → Model is worse than a simple average!

In [2]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Load the saved best model
with open('best_model3.pkl', 'rb') as f:
    best_model = pickle.load(f)

# Load the dataset
df = pd.read_excel('Dataset_11_features.xlsx')

# Prepare data (same preprocessing as before)
X = df.drop(['NAM', 'REP', 'VAR', 'OLA', 'CT', 'CWC', 'WS', 'GW', 'PP', 'GP', 'GY', 'RS'], axis='columns')
Y = df['OLA']

# Standardize the input features (same as in training)
stdScale = StandardScaler().fit(X)
X = stdScale.transform(X)

# Make predictions using the best model
Y_pred = best_model.predict(X)

# Calculate R² score
r2 = r2_score(Y, Y_pred)
print(f"R² Score of the Best Model: {r2:.4f}")

R² Score of the Best Model: 0.9727


# RMSE

In [4]:
import warnings
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_excel('Dataset_11_features.xlsx')

# Prepare data 
X = df.drop(['NAM', 'REP', 'VAR', 'OLA', 'CT', 'CWC', 'WS', 'GW', 'PP', 'GP', 'GY', 'RS'], axis='columns')
Y = df['OLA']

# Standardize the input features 
stdScale = StandardScaler().fit(X)
X_scaled = stdScale.transform(X)

# Suppress sklearn/version warnings 
with warnings.catch_warnings():
    warnings.simplefilter("ignore") 


    # Load the saved best model 
    with open('best_model3.pkl', 'rb') as f:
        best_model = pickle.load(f)

    # Make predictions
    Y_pred = best_model.predict(X_scaled)

# Compute RMSE 
mse = mean_squared_error(Y, Y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")


RMSE: 0.1589
