<a href="https://colab.research.google.com/github/shatlykgurdov/3.1.2/blob/main/3a.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Problem 3a – Model Selection for Y ~ Z1, Z2, Z3, Z4, Z5
# Group 12282 (even) → FE-GWP1_model_selection_2.csv

import pandas as pd
import numpy as np
import itertools
import statsmodels.api as sm

# =========================
# 1. Load the dataset
# =========================
# Read the group's CSV and strip stray leading/trailing whitespace from the
# header so the columns can be addressed by their plain names.
df = pd.read_csv("FE-GWP1_model_selection_2.csv")
df.columns = df.columns.map(str.strip)

# Quick visual sanity check of what was loaded.
print("First 5 rows of dataset:", df.head(), sep="\n")
print("\nColumns in dataset:", list(df.columns))

# Expected columns after cleaning: 'Y', 'Z1', 'Z2', 'Z3', 'Z4', 'Z5'.
# 'Y' is the response; the five 'Z' columns form the candidate predictor pool.
X_all = df[["Z1", "Z2", "Z3", "Z4", "Z5"]]
y = df["Y"]
candidate_predictors = X_all.columns.tolist()

# Helper function to fit OLS model with given predictors
def fit_model(predictors):
    """Fit an OLS regression of the module-level ``y`` on the given columns.

    ``predictors`` is an iterable of column names taken from the module-level
    ``X_all``. A constant column is added with ``sm.add_constant`` before
    fitting. Returns the fitted statsmodels results object.
    """
    design = sm.add_constant(X_all[list(predictors)])
    return sm.OLS(y, design).fit()

# =========================
# 2. Approach 1: All-subsets model selection
#    (Adjusted R², AIC, BIC)
# =========================
# Enumerate every non-empty subset of the candidates, smallest subsets first
# and, within one size, in itertools.combinations order.
subset_sizes = range(1, len(candidate_predictors) + 1)
every_subset = itertools.chain.from_iterable(
    itertools.combinations(candidate_predictors, size) for size in subset_sizes
)

# One row per subset with the three selection criteria of its OLS fit.
results = []
for subset in every_subset:
    fitted = fit_model(subset)
    results.append(
        dict(
            predictors=subset,
            adj_R2=fitted.rsquared_adj,
            AIC=fitted.aic,
            BIC=fitted.bic,
        )
    )

results_df = pd.DataFrame(results)

# Five best models per criterion: adjusted R² is ranked descending,
# AIC and BIC ascending.
best_by_adjR2 = results_df.sort_values(by="adj_R2", ascending=False).iloc[:5]
best_by_AIC = results_df.sort_values(by="AIC").iloc[:5]
best_by_BIC = results_df.sort_values(by="BIC").iloc[:5]

for heading, table in (
    ("\n=== Top 5 models by Adjusted R² ===", best_by_adjR2),
    ("\n=== Top 5 models by AIC ===", best_by_AIC),
    ("\n=== Top 5 models by BIC ===", best_by_BIC),
):
    print(heading)
    print(table)

# The single "best" all-subsets model is the one with the lowest BIC.
best_model_row = best_by_BIC.iloc[0]
best_predictors = list(best_model_row["predictors"])
print("\n>>> Chosen best model by BIC has predictors:", best_predictors)

# Refit that model so its full regression summary can be displayed.
best_model = fit_model(best_predictors)
print("\n=== Summary of best model by BIC ===", best_model.summary(), sep="\n")

# =========================
# 3. Approach 2: Forward stepwise selection (AIC)
# =========================
def forward_stepwise_selection(X, y, verbose=True, criterion="AIC"):
    """Greedy forward selection of OLS predictors by an information criterion.

    Starting from no predictors, each round fits one OLS model (with a
    constant added via ``sm.add_constant``) per remaining column of ``X`` and
    keeps the column whose model has the lowest criterion value. The column
    is added only if that value beats the current score by more than 1e-6;
    otherwise the search stops.

    Parameters
    ----------
    X : DataFrame
        Its columns are the candidate predictors.
    y : array-like
        Response handed to ``sm.OLS``.
    verbose : bool
        When true, print each round's best candidate and the decision taken.
    criterion : {"AIC", "BIC"}
        Which attribute of the fitted model is minimised. Defaults to "AIC",
        which reproduces the behaviour of the original AIC-only version.

    Returns
    -------
    tuple
        ``(selected, score)``: the chosen column names in the order they were
        added, and the criterion value of the last accepted model
        (``np.inf`` when ``X`` has no columns).

    Raises
    ------
    ValueError
        If ``criterion`` is neither "AIC" nor "BIC".
    """
    # Same contract and message as backward_elimination; checked before any
    # model is fitted so a bad argument fails immediately.
    if criterion not in ("AIC", "BIC"):
        raise ValueError("criterion must be 'AIC' or 'BIC'")
    score_attr = criterion.lower()  # fitted results expose .aic / .bic

    remaining = list(X.columns)
    selected = []
    # Infinite starting score: any finite score in the first round counts as
    # an improvement, so the search never compares against an empty model.
    current_score = np.inf

    while remaining:
        scores_with_candidates = []

        for candidate in remaining:
            predictors = selected + [candidate]
            X_candidate = sm.add_constant(X[predictors])
            model = sm.OLS(y, X_candidate).fit()
            scores_with_candidates.append((getattr(model, score_attr), candidate))

        # Lowest score wins; ties fall back to comparing the column names.
        best_new_score, best_candidate = min(scores_with_candidates)

        if verbose:
            print(f"Trying to add {best_candidate}: {criterion} = {best_new_score:.3f}")

        if best_new_score < current_score - 1e-6:  # improvement
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
            if verbose:
                print(f"  -> Added {best_candidate}, new {criterion} = {current_score:.3f}")
        else:
            if verbose:
                print(f"No further {criterion} improvement. Stopping forward selection.")
            break

    return selected, current_score

# Run the forward search on all candidates and report what it picked.
print("\n=== Forward Stepwise Selection (AIC) ===")
forward_result = forward_stepwise_selection(X_all, y, verbose=True)
forward_predictors, forward_aic = forward_result
print("\n>>> Forward stepwise selected predictors:", forward_predictors)
print("Final AIC from forward selection:", forward_aic)

# Refit on the selected columns (plus a constant) to show the full summary.
X_forward = sm.add_constant(X_all[forward_predictors])
forward_model = sm.OLS(endog=y, exog=X_forward).fit()
print("\n=== Summary of forward stepwise model ===", forward_model.summary(), sep="\n")

# =========================
# 4. Approach 3: Backward elimination (BIC)
# =========================
def backward_elimination(X, y, verbose=True, criterion="BIC"):
    """Greedy backward elimination of OLS predictors by an information criterion.

    Starts from the model containing every column of ``X`` (plus a constant
    added via ``sm.add_constant``). Each round refits the model with one
    predictor left out, for every predictor in turn, and drops the one whose
    removal gives the lowest criterion value, provided that value beats the
    current score by more than 1e-6. The search stops when no removal
    improves the score or only one predictor is left.

    Parameters
    ----------
    X : DataFrame
        Its columns form the starting predictor set.
    y : array-like
        Response handed to ``sm.OLS``.
    verbose : bool
        When true, print the starting score and each round's decision.
    criterion : {"AIC", "BIC"}
        Which attribute of the fitted model is minimised.

    Returns
    -------
    tuple
        ``(predictors, best_score, final_model)``: the surviving column
        names, their criterion value, and the fitted OLS results for them.

    Raises
    ------
    ValueError
        If ``criterion`` is neither "AIC" nor "BIC".
    """
    # Validate first so a bad argument fails before any model is fitted.
    if criterion not in ("AIC", "BIC"):
        raise ValueError("criterion must be 'AIC' or 'BIC'")
    score_attr = criterion.lower()  # fitted results expose .aic / .bic

    def _fit(columns):
        # OLS of y on the given columns plus a constant.
        return sm.OLS(y, sm.add_constant(X[columns])).fit()

    # Start with full model
    predictors = list(X.columns)
    best_model = _fit(predictors)
    best_score = getattr(best_model, score_attr)

    if verbose:
        print(f"\nStart backward elimination with full model {predictors}, {criterion} = {best_score:.3f}")

    while len(predictors) > 1:
        trials = []
        for candidate in predictors:
            trial_predictors = [p for p in predictors if p != candidate]
            trial_model = _fit(trial_predictors)
            trials.append(
                (getattr(trial_model, score_attr), candidate, trial_predictors, trial_model)
            )

        # Rank on (score, column name) only; the fitted results objects carried
        # along in each tuple are not orderable.
        best_new_score, worst_predictor, best_predictor_set, trial_model = min(
            trials, key=lambda trial: trial[:2]
        )

        if verbose:
            print(f"Trying to remove {worst_predictor}: {criterion} = {best_new_score:.3f}")

        if best_new_score < best_score - 1e-6:
            predictors = best_predictor_set
            best_score = best_new_score
            # Keep the winning fit so the final model needs no extra refit.
            best_model = trial_model
            if verbose:
                print(f"  -> Removed {worst_predictor}, new {criterion} = {best_score:.3f}")
        else:
            if verbose:
                print("No further improvement. Stopping backward elimination.")
            break

    return predictors, best_score, best_model

# Run the BIC-driven backward search on all candidates and report the outcome.
print("\n=== Backward Elimination (BIC) ===")
backward_result = backward_elimination(X_all, y, verbose=True, criterion="BIC")
backward_predictors, backward_score, backward_model = backward_result
print("\n>>> Backward elimination (BIC) selected predictors:", backward_predictors)
print("Final BIC from backward elimination:", backward_score)
print("\n=== Summary of backward elimination model ===", backward_model.summary(), sep="\n")

# =========================
# 5. Final comparison info
# =========================
# Side-by-side recap of the predictor sets chosen by the three approaches.
print("\n================ FINAL SUMMARY ================")
for label, chosen in (
    ("Best model by BIC (all-subsets) predictors:", best_predictors),
    ("Forward stepwise selected predictors:", forward_predictors),
    ("Backward elimination (BIC) selected predictors:", backward_predictors),
):
    print(label, chosen)
print("Use these results to justify your final chosen model in the report.")
