In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from sklearn.model_selection import KFold

In [8]:
def calculate_aic(y, x):
    """Fit an ordinary-least-squares model of ``y`` on ``x`` and return its AIC.

    Parameters
    ----------
    y : array-like
        Response values, handed to ``sm.OLS`` as the dependent variable.
    x : array-like or DataFrame
        Predictor values. They are passed through ``sm.add_constant`` before
        fitting, so the caller does not supply an intercept column.

    Returns
    -------
    The ``aic`` attribute of the fitted results object.
    """
    design_matrix = sm.add_constant(x)
    fitted = sm.OLS(y, design_matrix).fit()
    return fitted.aic

In [9]:
def perform_k_fold_cross_validation(data, target_variable, selected_variables, k=10):
    """Average the AIC of an OLS model refit on each training fold of a K-fold split.

    The rows of ``data`` are split with ``KFold(n_splits=k, shuffle=True,
    random_state=42)``. Because the seed is fixed, every call on the same
    ``data`` and ``k`` sees the same folds, which keeps scores for different
    predictor sets comparable.

    NOTE(review): only the *training* part of each split is used. The model is
    fit on it and its in-sample AIC is recorded; the held-out rows are never
    scored. The result is therefore an average in-sample AIC over k
    subsamples, not an out-of-sample error estimate.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the target column and the predictor columns.
    target_variable : str
        Name of the response column in ``data``.
    selected_variables : list
        Column labels used as predictors.
    k : int, default 10
        Number of folds passed to ``KFold`` as ``n_splits``.

    Returns
    -------
    The mean (``np.mean``) of the per-fold AIC values.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    aic_values = []

    # The held-out indices are deliberately ignored (see the note above), so
    # no test-fold frame is materialised.
    for train_index, _ in kf.split(data):
        train_data = data.iloc[train_index]
        aic_values.append(
            calculate_aic(train_data[target_variable], train_data[selected_variables])
        )

    return np.mean(aic_values)

In [10]:
def stepwise_regression(data, target_variable, max_features=None, k=2):
    """Greedy forward selection of OLS predictors, scored by fold-averaged AIC.

    Starting from an empty model, each round scores every not-yet-selected
    column with ``perform_k_fold_cross_validation`` and adds the one with the
    lowest score. Selection stops when the best candidate no longer lowers the
    score, when ``max_features`` columns have been chosen, or when no usable
    candidates remain.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the target column; every other column is treated as
        a candidate predictor.
    target_variable : str
        Name of the response column.
    max_features : int, optional
        Upper bound on the number of selected predictors. ``None`` means "as
        many as there are candidate columns". A value of 0 (or less) selects
        nothing.
    k : int, default 2
        Number of folds forwarded to ``perform_k_fold_cross_validation``.

    Returns
    -------
    (best_model, selected_variables)
        ``best_model`` is the OLS fit of the target on the selected columns
        (plus a constant) using all rows of ``data``, or ``None`` when nothing
        was selected. ``selected_variables`` is the list of chosen column
        labels in the order they were added.
    """
    independent_variables = data.columns.drop(target_variable)

    if max_features is None:
        max_features = len(independent_variables)

    # Whether a column contains NaN/inf does not change between rounds, so
    # screen each candidate once. Iterating the column Index (instead of a
    # set) keeps the candidate order, and therefore tie-breaking between
    # equal scores, identical from run to run.
    usable_variables = [
        feature
        for feature in independent_variables
        if not data[[feature]].isnull().values.any()
        and not np.isinf(data[[feature]].values).any()
    ]

    best_aic = np.inf
    selected_variables = []

    # Checking the limit before each round (rather than after an append)
    # makes max_features=0 select nothing instead of being ignored.
    while len(selected_variables) < max_features:
        remaining_variables = [
            feature for feature in usable_variables if feature not in selected_variables
        ]
        if not remaining_variables:
            break

        aic_candidates = [
            (
                feature,
                perform_k_fold_cross_validation(
                    data, target_variable, selected_variables + [feature], k
                ),
            )
            for feature in remaining_variables
        ]

        # min() returns the first of several equal scores, i.e. the earliest
        # column in the frame.
        best_feature, candidate_aic = min(aic_candidates, key=lambda pair: pair[1])

        # Stop as soon as adding the best candidate does not improve the
        # score (this is also the exit taken when the score is NaN).
        if not candidate_aic < best_aic:
            break

        selected_variables.append(best_feature)
        best_aic = candidate_aic

    # Only the final model is returned, so fit it once after selection ends.
    best_model = None
    if selected_variables:
        best_model = sm.OLS(
            data[target_variable], sm.add_constant(data[selected_variables])
        ).fit()

    return best_model, selected_variables

In [11]:
# Example usage: settings for the stepwise selection run.

# Name of the response column (replace with your target column name).
target_variable = 'Turbidity'

# Optional cap on how many features the model may include.
max_features = 6

# Number of folds for cross-validation.
k = 5

# Load your dataset.
data = pd.read_csv(
    'D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_Input/Turbidity.csv'
)

In [12]:
# Run the selection on the input columns only. A later cell writes a
# 'Predicted Turbidity' column into `data` in place; dropping it here (a no-op
# on a fresh kernel, thanks to errors='ignore') keeps a re-run of this cell
# from offering the model's own prediction as a candidate predictor.
best_model, selected_features = stepwise_regression(
    data.drop(columns=['Predicted Turbidity'], errors='ignore'),
    target_variable,
    max_features,
    k,
)
print("Selected features:", selected_features)

# stepwise_regression returns None for the model when no feature is accepted;
# fail with an explicit message instead of an AttributeError on `.summary()`.
if best_model is None:
    raise ValueError(
        "stepwise_regression selected no features; check the candidate columns "
        "for NaN/inf values and the max_features setting."
    )
print(best_model.summary())

Selected features: ['B2/B1']
                            OLS Regression Results                            
Dep. Variable:              Turbidity   R-squared:                       0.844
Model:                            OLS   Adj. R-squared:                  0.833
Method:                 Least Squares   F-statistic:                     75.70
Date:                Sat, 06 May 2023   Prob (F-statistic):           5.09e-07
Time:                        16:10:21   Log-Likelihood:                 27.235
No. Observations:                  16   AIC:                            -50.47
Df Residuals:                      14   BIC:                            -48.92
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0216  



In [7]:
# Write the selected feature list and the model summary to 'results.txt' in
# the current working directory. The encoding is pinned so the file does not
# depend on the machine's locale default (and a non-ASCII column name cannot
# trigger a UnicodeEncodeError).
with open('results.txt', 'w', encoding='utf-8') as f:
    f.write(f"Selected features: {selected_features}\n")
    f.write(str(best_model.summary()))



In [8]:
# Get the coefficients and equation for the multiple linear regression model.
# `best_model.params` is a Series labelled by column name ('const' for the
# intercept added by sm.add_constant, then one entry per selected feature), so
# values are looked up by label. Integer keys on a string-labelled Series rely
# on a positional fallback that pandas has deprecated.
coefficients = best_model.params
equation = "Turbidity = "
for feature in selected_features:
    coefficient = round(coefficients[feature], 4)
    equation += f"{coefficient}*{feature} + "
# The intercept goes last, matching the "slope terms + intercept" layout.
intercept = round(coefficients["const"], 4)
equation += f"{intercept}"

In [9]:
# Show the fitted parameter Series, then the assembled equation string,
# each prefixed by its heading.
for heading, shown in (("Coefficients:", coefficients), ("Equation:", equation)):
    print(heading, shown)

Coefficients: const    1.021024
B2/B1    1.012028
dtype: float64
Equation: Turbidity = 1.012*B2/B1 + 1.021


In [10]:
# Rebuild the design matrix (selected columns plus a constant) from the full
# dataset and let the fitted model score it.
X = sm.add_constant(data.loc[:, selected_features])
y_pred = best_model.predict(exog=X)

In [11]:
# Add the predicted values to the original data as a 'Predicted Turbidity' column.
# NOTE: this mutates `data` in place. stepwise_regression treats every
# non-target column of the frame it receives as a candidate predictor, so any
# later call that passes `data` unfiltered would offer this prediction column
# as a feature.
data['Predicted Turbidity'] = y_pred

In [12]:
# Print a heading, then the observed target next to the model's fitted
# values, one row per observation.
print("Original and Predicted Turbidity:")
print(data.loc[:, [target_variable, 'Predicted Turbidity']])

Original and Predicted Turbidity:
    Turbidity  Predicted Turbidity
0    2.034548             2.050912
1    1.868908             1.883038
2    1.822511             1.812870
3    2.102760             2.146012
4    2.055466             2.069104
5    2.069427             2.110774
6    1.911519             1.916021
7    2.087713             2.118815
8    2.039185             2.093354
9    1.856125             1.851286
10   2.060553             2.099203
11   1.993036             2.016327
12   2.071995             2.046565
13   2.147920             2.089051
14   2.207605             2.110120
15   2.190728             2.106547


In [13]:
# Export the observed and predicted turbidity columns to an Excel workbook,
# leaving out the row index.
output_file_path = 'D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_Input/turbidity_values_4.xlsx'
output_data = data.loc[:, [target_variable, 'Predicted Turbidity']]
output_data.to_excel(excel_writer=output_file_path, index=False)