In [7]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from sklearn.model_selection import KFold

In [8]:
def calculate_aic(y, x):
    """Fit an ordinary-least-squares model of ``y`` on ``x`` and return its AIC.

    Parameters
    ----------
    y : response values, passed to ``sm.OLS`` as the dependent variable.
    x : predictor values; passed through ``sm.add_constant`` before fitting.

    Returns
    -------
    The ``aic`` attribute of the fitted OLS results.
    """
    design_matrix = sm.add_constant(x)
    fitted = sm.OLS(y, design_matrix).fit()
    return fitted.aic

In [9]:
def perform_k_fold_cross_validation(data, target_variable, selected_variables, k=5):
    """Return the mean training-fold AIC of an OLS model over a k-fold split.

    The rows of ``data`` are split with a shuffled ``KFold`` (fixed
    ``random_state=42``). For each split, an OLS model of ``target_variable``
    on ``selected_variables`` is fitted on the training rows and its AIC is
    recorded via ``calculate_aic``.

    Note that the held-out rows of each split are never scored: the value
    returned is an average of in-sample AICs over k training subsamples, not
    an out-of-sample error estimate.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target and predictor columns.
    target_variable : str
        Name of the response column.
    selected_variables : list
        Names of the predictor columns to include in the model.
    k : int, default 5
        Number of folds passed to ``KFold`` as ``n_splits``.

    Returns
    -------
    Mean of the per-fold AIC values (``np.mean`` of the collected list).
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    aic_values = []

    # Only the training indices are used; the test indices are deliberately
    # ignored (the original code sliced them into an unused variable).
    for train_index, _ in kf.split(data):
        train_data = data.iloc[train_index]
        aic_values.append(
            calculate_aic(train_data[target_variable], train_data[selected_variables])
        )

    return np.mean(aic_values)

In [10]:
def _usable_predictors(data, target_variable):
    """List the non-target columns of ``data`` that may be offered as predictors.

    A column is kept only if it is numeric and contains no NaN and no infinite
    values. The result follows the column order of ``data``.
    """
    usable = []
    for column in data.columns.drop(target_variable):
        values = data[column]
        # np.isinf raises TypeError on non-numeric (object/string) data, so
        # such columns are skipped instead of aborting the whole selection.
        if not pd.api.types.is_numeric_dtype(values):
            continue
        if values.isnull().any():
            continue
        if np.isinf(values.to_numpy(dtype=float)).any():
            continue
        usable.append(column)
    return usable


def stepwise_regression(data, target_variable, max_features=None, k=5):
    """Greedy forward selection of OLS predictors by mean k-fold AIC.

    Starting from an empty model, each round scores every remaining usable
    predictor (see ``_usable_predictors``) by adding it to the current
    selection and calling ``perform_k_fold_cross_validation``. The predictor
    with the lowest score is added if that score is strictly lower than the
    best score so far; otherwise the search stops. The search also stops when
    no predictors remain or ``max_features`` predictors have been selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the target column and the candidate predictor columns.
    target_variable : str
        Name of the response column.
    max_features : int or None, default None
        Maximum number of predictors to select. ``None`` means no limit
        beyond the number of usable predictors. A value below 1 selects
        nothing.
    k : int, default 5
        Number of folds forwarded to ``perform_k_fold_cross_validation``.

    Returns
    -------
    (best_model, selected_variables)
        ``best_model`` is the OLS fit of the target on the selected predictors
        (with ``sm.add_constant``) over all rows of ``data``, or ``None`` when
        nothing was selected. ``selected_variables`` lists the chosen column
        names in the order they were added.
    """
    # Screening each column once is equivalent to re-checking the growing
    # selection every round, because `data` does not change during the search.
    candidates = _usable_predictors(data, target_variable)
    if max_features is None:
        max_features = len(candidates)

    best_aic = np.inf
    selected_variables = []

    # `<` (rather than an `==` check after appending) also honours limits
    # below 1, which previously could never trigger the stop condition.
    while len(selected_variables) < max_features:
        # Keep column order (no set arithmetic) so that ties between equally
        # good candidates are broken deterministically across runs.
        remaining_variables = [c for c in candidates if c not in selected_variables]
        if not remaining_variables:
            break

        aic_candidates = [
            (
                feature,
                perform_k_fold_cross_validation(
                    data, target_variable, selected_variables + [feature], k
                ),
            )
            for feature in remaining_variables
        ]
        best_feature, candidate_aic = min(aic_candidates, key=lambda pair: pair[1])

        # Stop as soon as the best addition no longer improves the score.
        if not candidate_aic < best_aic:
            break

        selected_variables.append(best_feature)
        best_aic = candidate_aic

    # Only the final selection's model is returned, so fit it once at the end.
    best_model = None
    if selected_variables:
        best_model = sm.OLS(
            data[target_variable], sm.add_constant(data[selected_variables])
        ).fit()

    return best_model, selected_variables

In [11]:
# Example usage: analysis settings first, then the input data.
target_variable = 'BDO'  # Name of the response column in the loaded dataset
max_features = 6  # Upper bound on the number of selected predictors (optional)
k = 5  # Number of folds used when scoring candidate models
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
data = pd.read_csv('D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_Input/BDO.csv')

In [12]:
# Run the selection on the original predictors only. A later cell adds a
# 'Predicted BDO' column to `data` in place; dropping it here (when present)
# keeps this cell idempotent on re-run instead of offering the model's own
# predictions as a candidate predictor.
best_model, selected_features = stepwise_regression(
    data.drop(columns=['Predicted BDO'], errors='ignore'),
    target_variable,
    max_features,
    k,
)
print("Selected features:", selected_features)
print(best_model.summary())

Selected features: ['Tur']
                            OLS Regression Results                            
Dep. Variable:                    BDO   R-squared:                       0.644
Model:                            OLS   Adj. R-squared:                  0.618
Method:                 Least Squares   F-statistic:                     25.30
Date:                Sat, 06 May 2023   Prob (F-statistic):           0.000184
Time:                        00:49:29   Log-Likelihood:                 19.354
No. Observations:                  16   AIC:                            -34.71
Df Residuals:                      14   BIC:                            -33.16
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.6707    



In [13]:
# Persist the selected feature list and the fitted model's summary as plain text.
with open('results_BDO_Bear.txt', 'w') as f:
    f.writelines((
        f"Selected features: {selected_features}\n",
        str(best_model.summary()),
    ))



In [14]:
# Build a human-readable equation for the fitted linear model.
# Coefficients are looked up by label rather than by integer key: integer keys
# on a label-indexed Series are deprecated positional access in pandas 2.x, and
# label lookup does not depend on where the intercept sits in `params`.
coefficients = best_model.params
equation = f"{target_variable} = "
for feature in selected_features:
    coefficient = round(coefficients[feature], 4)
    equation += f"{coefficient}*{feature} + "
# 'const' is the name sm.add_constant gives the intercept column.
intercept = round(coefficients['const'], 4)
equation += f"{intercept}"

In [15]:
# Show the fitted parameters, then the assembled equation string, one per line.
print("Coefficients: " + str(coefficients), "Equation: " + str(equation), sep="\n")

Coefficients: const    5.670715
Tur     -0.869048
dtype: float64
Equation: BDO = -0.869*Tur + 5.6707


In [16]:
# Predict the target for every row of `data` with the selected model.
# The predictors go through sm.add_constant before being handed to predict().
X = sm.add_constant(data.loc[:, selected_features])
y_pred = best_model.predict(exog=X)

In [17]:
# Add the predicted values to the original data.
# Note: this mutates `data` in place. Any later code that treats every
# non-target column of `data` as a predictor (as stepwise_regression does)
# must exclude 'Predicted BDO' or reload the CSV first.
data['Predicted BDO'] = y_pred

In [18]:
# Show the observed BDO values next to the model's predictions.
print("Original and Predicted BDO:", data[[target_variable, 'Predicted BDO']], sep="\n")

Original and Predicted BDO:
     BDO  Predicted BDO
0   3.97       3.902595
1   3.93       4.046544
2   4.04       4.086865
3   3.84       3.843315
4   3.92       3.884417
5   3.83       3.872283
6   3.93       4.009513
7   3.79       3.856392
8   3.97       3.898566
9   4.25       4.057653
10  3.83       3.879996
11  3.97       3.938671
12  3.87       3.870052
13  3.80       3.804070
14  3.80       3.752200
15  3.73       3.766867


In [19]:
# Export the observed and predicted BDO values to an Excel workbook,
# without writing the row index.
output_file_path = 'D:/Mid_review_presentation/Thesis_writing/Chapters/Chapter_4/Khadakwasala/Training_output/BDO_Khdakwasala_4.xlsx'
output_data = data.loc[:, [target_variable, 'Predicted BDO']]
output_data.to_excel(output_file_path, index=False)