In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from sklearn.model_selection import KFold

In [2]:
def calculate_aic(y, x):
    """Fit an OLS model of ``y`` on ``x`` and return its AIC.

    An intercept column is added to ``x`` with ``sm.add_constant`` before the
    model is fitted.

    Parameters
    ----------
    y : array-like
        Response values.
    x : array-like or DataFrame
        Predictor values, without an intercept column.

    Returns
    -------
    float
        The ``aic`` attribute of the fitted statsmodels results object.
    """
    design_matrix = sm.add_constant(x)
    fitted = sm.OLS(y, design_matrix).fit()
    return fitted.aic

In [3]:
def perform_k_fold_cross_validation(data, target_variable, selected_variables, k=5):
    """Return the mean AIC of OLS fits on the training part of each of ``k`` folds.

    The rows of ``data`` are split with a shuffled ``KFold`` (fixed
    ``random_state=42``, so the split is reproducible). For every fold an OLS
    model is fitted on the training rows via ``calculate_aic`` and the AIC
    values are averaged.

    Note that the held-out rows of each fold are not used: the score is the
    average in-sample AIC over ``k`` overlapping training subsets, not an
    out-of-sample error.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the target and predictor columns.
    target_variable : str
        Column name of the response.
    selected_variables : list of str
        Column names of the predictors to include in the model.
    k : int, default 5
        Number of folds.

    Returns
    -------
    float
        Mean of the per-fold AIC values.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    aic_values = []

    # Only the training indices are needed; the test indices are discarded
    # instead of being materialised into an unused DataFrame on every fold.
    for train_index, _ in kf.split(data):
        train_data = data.iloc[train_index]
        aic_values.append(
            calculate_aic(train_data[target_variable], train_data[selected_variables])
        )

    return np.mean(aic_values)

In [4]:
def stepwise_regression(data, target_variable, max_features=None, k=5):
    """Forward stepwise selection of OLS predictors using mean k-fold AIC.

    Starting from an empty model, each round scores every remaining candidate
    by calling ``perform_k_fold_cross_validation`` on the already selected
    variables plus that candidate, and adds the candidate with the lowest
    score. Selection stops when the best score does not improve on the
    previous round, when no candidate is left, or when ``max_features``
    variables have been selected.

    Columns that contain any NaN or infinite value are never considered.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and all candidate predictor columns.
    target_variable : str
        Column name of the response; every other column is a candidate.
    max_features : int, optional
        Maximum number of predictors to select. ``None`` (default) allows all
        candidates; zero or a negative value selects nothing.
    k : int, default 5
        Number of folds passed to ``perform_k_fold_cross_validation``.

    Returns
    -------
    tuple
        ``(best_model, selected_variables)``. ``best_model`` is the OLS results
        object fitted on all rows of ``data`` with the selected predictors
        plus an intercept, or ``None`` when no predictor was selected.
        ``selected_variables`` lists the predictors in the order they were added.
    """
    independent_variables = data.columns.drop(target_variable)

    if max_features is None:
        max_features = len(independent_variables)

    # Screen each column once. Already-selected columns have passed this check,
    # so re-testing them together with every candidate is redundant.
    # Iterating the column index (instead of a set difference) keeps the
    # evaluation order, and therefore tie-breaking between equal scores,
    # reproducible across interpreter sessions.
    usable_variables = [
        feature
        for feature in independent_variables
        if not data[[feature]].isnull().values.any()
        and not np.isinf(data[[feature]].values).any()
    ]

    best_aic = np.inf
    selected_variables = []

    # Checking the limit before each round also honours max_features <= 0.
    while len(selected_variables) < max_features:
        remaining_variables = [
            feature for feature in usable_variables if feature not in selected_variables
        ]
        if not remaining_variables:
            break

        aic_candidates = [
            (
                feature,
                perform_k_fold_cross_validation(
                    data, target_variable, selected_variables + [feature], k
                ),
            )
            for feature in remaining_variables
        ]
        # min() returns the first of several equal scores, i.e. column order wins ties.
        best_feature, candidate_aic = min(aic_candidates, key=lambda candidate: candidate[1])

        # Written as "not <" so a NaN score also stops the search.
        if not candidate_aic < best_aic:
            break

        selected_variables.append(best_feature)
        best_aic = candidate_aic

    # Fit the final model once instead of refitting after every accepted step.
    best_model = None
    if selected_variables:
        best_model = sm.OLS(
            data[target_variable], sm.add_constant(data[selected_variables])
        ).fit()

    return best_model, selected_variables

In [5]:
# Example usage: analysis settings first, then the dataset they apply to.
target_variable = 'CDO'  # column of the dataset used as the response
max_features = 6  # upper bound on the number of selected predictors (optional)
k = 5  # number of cross-validation folds

# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
input_csv_path = 'D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_Input/CDO.csv'
data = pd.read_csv(input_csv_path)

In [6]:
# Run the selection on the original columns only. A later cell adds a
# 'Predicted CDO' column to `data`; dropping it here (a no-op on a fresh run)
# keeps a re-run of this cell from offering the model's own predictions as a
# candidate predictor.
candidate_data = data.drop(columns=['Predicted CDO'], errors='ignore')
best_model, selected_features = stepwise_regression(candidate_data, target_variable, max_features, k)
print("Selected features:", selected_features)

# stepwise_regression returns None as the model when no predictor was selected;
# fail with a clear message instead of an AttributeError on None.summary().
if best_model is None:
    raise ValueError(
        "No predictor was selected; check the input columns for NaN or infinite values."
    )
print(best_model.summary())

Selected features: ['Chl', 'Tur']
                            OLS Regression Results                            
Dep. Variable:                    CDO   R-squared:                       0.835
Model:                            OLS   Adj. R-squared:                  0.809
Method:                 Least Squares   F-statistic:                     32.79
Date:                Sat, 06 May 2023   Prob (F-statistic):           8.34e-06
Time:                        00:36:46   Log-Likelihood:                -23.906
No. Observations:                  16   AIC:                             53.81
Df Residuals:                      13   BIC:                             56.13
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         89.4



In [7]:
# Persist the selected features and the model summary as plain text
with open('results_CDO_Bear.txt', 'w') as report_file:
    report_file.writelines((
        f"Selected features: {selected_features}\n",
        str(best_model.summary()),
    ))



In [8]:
# Get the coefficients and build a readable equation for the fitted model.
# Coefficients are looked up by label: integer keys on a label-indexed Series
# rely on pandas' deprecated positional fallback. 'const' is the name
# sm.add_constant gives the intercept column.
coefficients = best_model.params
equation = f"{target_variable} = "
for feature in selected_features:
    coefficient = round(coefficients[feature], 4)
    equation += f"{coefficient}*{feature} + "
intercept = round(coefficients['const'], 4)
equation += f"{intercept}"

In [9]:
# Show the fitted coefficients, then the assembled regression equation
for label, value in (("Coefficients:", coefficients), ("Equation:", equation)):
    print(label, value)

Coefficients: const     89.442924
Chl     -233.180164
Tur      -17.163760
dtype: float64
Equation: CDO = -233.1802*Chl + -17.1638*Tur + 89.4429


In [10]:
# Predict the target for every row of `data` with the selected model.
# The design matrix is built the same way as for fitting: the selected
# predictor columns plus an intercept column.
predictor_frame = data[selected_features]
X = sm.add_constant(predictor_frame)
y_pred = best_model.predict(X)

In [11]:
# Add the predicted values to the original data.
# NOTE: this mutates `data` in place, so from here on the frame carries a
# 'Predicted CDO' column next to the columns loaded from the CSV.
data['Predicted CDO'] = y_pred

In [12]:
# Display the original and predicted CDO values side by side
comparison_columns = [target_variable, 'Predicted CDO']
print("Original and Predicted CDO:")
print(data[comparison_columns])

Original and Predicted CDO:
      CDO  Predicted CDO
0   16.61      18.725540
1   24.25      24.294276
2   17.99      17.276016
3   18.91      18.515725
4   20.88      20.558627
5   21.09      19.880300
6   17.15      18.708119
7   21.30      19.392447
8   20.16      20.617426
9   23.43      22.665722
10  19.86      19.916692
11  18.10      19.396458
12  16.60      15.096447
13  17.59      18.084725
14  15.11      16.017924
15  14.99      14.873553


In [13]:
# Save the original and predicted CDO values to an Excel file (row index omitted)
output_file_path = 'D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_output/CDO_Khdakwasala_2.xlsx'
output_data = data.loc[:, [target_variable, 'Predicted CDO']]
output_data.to_excel(output_file_path, index=False)