In [40]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from sklearn.model_selection import KFold

In [41]:
def calculate_aic(y, x):
    """Fit an ordinary least squares model of ``y`` on ``x`` and return its AIC.

    ``x`` is passed through ``sm.add_constant`` before fitting, so the design
    matrix handed to ``sm.OLS`` carries a constant (intercept) column in
    addition to the supplied regressors.

    Parameters
    ----------
    y : response values accepted by ``sm.OLS`` as the endogenous variable.
    x : regressor values accepted by ``sm.add_constant``.

    Returns
    -------
    The ``aic`` attribute of the fitted results object.
    """
    design_matrix = sm.add_constant(x)
    fitted = sm.OLS(y, design_matrix).fit()
    return fitted.aic

In [42]:
def perform_k_fold_cross_validation(data, target_variable, selected_variables, k=5):
    """Return the mean AIC of OLS models fitted on the training rows of each fold.

    The rows of ``data`` are split with a shuffled ``KFold`` using a fixed
    ``random_state=42``, so the same splits are produced on every call. For
    each split, ``calculate_aic`` fits ``target_variable`` on
    ``selected_variables`` using the training rows only.

    NOTE(review): the held-out rows of each fold are never scored. The value
    returned is therefore an average in-sample AIC over ``k`` overlapping
    training subsets, not an out-of-sample validation error. Confirm that this
    is the intended selection criterion.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the target and the predictor columns.
    target_variable : str
        Name of the response column.
    selected_variables : list
        Names of the predictor columns to include in the model.
    k : int, default 5
        Number of folds passed to ``KFold(n_splits=k)``.

    Returns
    -------
    float
        Arithmetic mean of the per-fold AIC values.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    aic_values = []

    # Only the training indices are needed; the test indices are discarded.
    for train_index, _ in kf.split(data):
        train_data = data.iloc[train_index]
        aic_values.append(
            calculate_aic(train_data[target_variable], train_data[selected_variables])
        )

    return np.mean(aic_values)

In [43]:
def stepwise_regression(data, target_variable, max_features=None, k=5):
    """Forward stepwise selection of OLS predictors ranked by mean k-fold AIC.

    Starting from an empty model, each pass scores every not-yet-selected
    candidate column by calling ``perform_k_fold_cross_validation`` on the
    current selection plus that column. The candidate with the lowest score is
    added if it strictly improves on the best score so far; otherwise the
    search stops. The search also stops when ``max_features`` columns have been
    selected or no candidates remain.

    Candidate columns containing NaN or infinite values are never considered.
    Non-numeric columns are not filtered out here: ``np.isinf`` is applied to
    every NaN-free candidate column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and all candidate predictor columns.
    target_variable : str
        Name of the response column; every other column is a candidate.
    max_features : int or None, default None
        Upper bound on the number of selected predictors. ``None`` means
        "as many as there are candidate columns". A value of 0 or less
        selects nothing.
    k : int, default 5
        Number of folds forwarded to ``perform_k_fold_cross_validation``.

    Returns
    -------
    tuple
        ``(best_model, selected_variables)`` where ``best_model`` is the OLS
        results object fitted on all rows of ``data`` with the selected
        predictors plus a constant, and ``selected_variables`` is the list of
        chosen column names in the order they were added. ``best_model`` is
        ``None`` when no predictor was selected.
    """
    independent_variables = data.columns.drop(target_variable)

    if max_features is None:
        max_features = len(independent_variables)

    # Screen each candidate once, up front: a column's NaN/inf status does not
    # change between passes. dict.fromkeys drops duplicate labels while keeping
    # column order, so candidate order (and tie-breaking) is reproducible
    # instead of depending on set iteration order.
    usable_variables = []
    for column in dict.fromkeys(independent_variables):
        if data[column].isnull().values.any():
            continue
        if np.isinf(data[column].values).any():
            continue
        usable_variables.append(column)

    best_aic = np.inf
    selected_variables = []

    # Checking the cap before each addition also covers max_features <= 0.
    while len(selected_variables) < max_features:
        aic_candidates = [
            (
                feature,
                perform_k_fold_cross_validation(
                    data, target_variable, selected_variables + [feature], k
                ),
            )
            for feature in usable_variables
            if feature not in selected_variables
        ]

        if not aic_candidates:
            break

        # min() returns the first of equally good candidates, i.e. column order.
        best_feature, candidate_aic = min(aic_candidates, key=lambda item: item[1])

        # Stop as soon as the best available addition fails to lower the score.
        if not candidate_aic < best_aic:
            break

        selected_variables.append(best_feature)
        best_aic = candidate_aic

    # Fit the final model once on all rows; intermediate fits are never returned.
    best_model = None
    if selected_variables:
        best_model = sm.OLS(
            data[target_variable], sm.add_constant(data[selected_variables])
        ).fit()

    return best_model, selected_variables

In [44]:
# Example usage: model-selection settings first, then the input data.
target_variable = 'DO'  # Name of the column to predict; replace with your target column
max_features = 6  # Optional cap on how many predictors the final model may contain
k = 5  # Number of folds used when scoring each candidate model

# Load the dataset that holds the target and the candidate predictors
data = pd.read_csv('D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_Input/DO.csv')

In [45]:
# Run the forward selection and report the chosen predictors and fitted model
best_model, selected_features = stepwise_regression(
    data,
    target_variable,
    max_features=max_features,
    k=k,
)
print("Selected features:", selected_features)
print(best_model.summary())

Selected features: ['Tur']
                            OLS Regression Results                            
Dep. Variable:                     DO   R-squared:                       0.649
Model:                            OLS   Adj. R-squared:                  0.624
Method:                 Least Squares   F-statistic:                     25.89
Date:                Sat, 06 May 2023   Prob (F-statistic):           0.000165
Time:                        00:11:47   Log-Likelihood:                 24.586
No. Observations:                  16   AIC:                            -45.17
Df Residuals:                      14   BIC:                            -43.63
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.9572    



In [46]:
# Write the results to a file
# The encoding is set explicitly so the report is written the same way on every
# platform, and feature names outside the platform's default code page cannot
# raise UnicodeEncodeError.
with open('results_DO_Bear.txt', 'w', encoding='utf-8') as f:
    f.write(f"Selected features: {selected_features}\n")
    f.write(str(best_model.summary()))



In [47]:
# Get the coefficients and equation for the multiple linear regression model
coefficients = best_model.params

# Look coefficients up by label rather than by position: integer keys on a
# label-indexed Series are deprecated in recent pandas, and label access does
# not depend on where the constant sits in the parameter vector.
slope_terms = [
    f"{round(coefficients[feature], 4)}*{feature}" for feature in selected_features
]
intercept = round(coefficients['const'], 4)  # 'const' is the column added by sm.add_constant

# The left-hand side follows target_variable, so the text stays correct if the
# notebook is pointed at a different response column.
equation = f"{target_variable} = " + " + ".join(slope_terms + [f"{intercept}"])

In [48]:
# Show the fitted parameter vector followed by the formatted regression equation
print(f"Coefficients: {coefficients}")
print(f"Equation: {equation}")

Coefficients: const    8.957238
Tur     -0.633942
dtype: float64
Equation: DO = -0.6339*Tur + 8.9572


In [49]:
# Predict the target for every row, using the selected predictors plus the
# constant column that the model was fitted with
X = sm.add_constant(data[selected_features])
y_pred = best_model.predict(exog=X)

In [50]:
# Add the predicted values to the original data
# NOTE(review): this mutates `data` in place. stepwise_regression treats every
# column other than the target as a candidate predictor, so re-running the
# feature-selection cell after this one would offer 'Predicted DO' as a feature.
# Reload `data` (or restart and run all) before re-running the selection.
data['Predicted DO'] = y_pred

In [51]:
# Display the original and predicted values of DO (the target column) side by side
print("Original and Predicted DO:")
print(data[[target_variable, 'Predicted DO']])

Original and Predicted DO:
          DO  Predicted DO
0   7.684020      7.667451
1   7.763401      7.772458
2   7.839137      7.801871
3   7.696183      7.624209
4   7.672417      7.654191
5   7.652173      7.645340
6   7.727264      7.745445
7   7.704530      7.633748
8   7.668948      7.664512
9   7.670491      7.780561
10  7.594595      7.650966
11  7.757047      7.693768
12  7.675749      7.643713
13  7.608633      7.595580
14  7.475526      7.557744
15  7.509887      7.568442


In [52]:
# Save the original and predicted DO values to an Excel file (row index omitted)
output_file_path = 'D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_output/DO_Khdakwasala_4.xlsx'
output_columns = [target_variable, 'Predicted DO']
output_data = data[output_columns]
output_data.to_excel(output_file_path, index=False)