# Variable selection methods

## Why use variable selection methods?
Selecting the right set of predictor variables is crucial in multiple linear regression (MLR).
- Too many variables → overfitting.
- Too few variables → underfitting.

In [2]:
# Create a small sample CSV (sample_data.csv)
import numpy as np
import pandas as pd
# Seed NumPy's global RNG so the draws below (and therefore the CSV) are the same on every run.
# The draws are order-dependent: reordering the np.random calls changes the generated data.
np.random.seed(42)
n = 200  # number of rows (observations)
X1 = np.random.normal(size=n)
X2 = 0.8 * X1 + np.random.normal(scale=0.2, size=n)  # correlated with X1
X3 = np.random.normal(size=n)
X4 = np.random.normal(size=n)
X5 = np.random.normal(size=n)
# Target: y depends on X1 and X3 (coefficients 3.0 and -2.0) and, more weakly, on X5 (0.5).
# X2 and X4 do not appear in the formula; X2 is related to y only through its dependence on X1.
y = 3.0 * X1 - 2.0 * X3 + 0.5 * X5 + np.random.normal(scale=1.0, size=n)
df = pd.DataFrame({
    'X1': X1, 'X2': X2, 'X3': X3, 'X4': X4, 'X5': X5, 'y': y
})
fn = 'sample_data.csv'
# index=False keeps the row index out of the file, so only the six named columns are written.
df.to_csv(fn, index=False)
print(f'Wrote {fn} with shape', df.shape)

Wrote sample_data.csv with shape (200, 6)


In [3]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import Lasso, Ridge, ElasticNet
import itertools

# Reload the generated dataset from disk, then split it into the response
# column (y) and the predictor frame (X = every other column).
df = pd.read_csv('sample_data.csv')
y = df['y']
X = df.drop(columns='y')
X.head()

Unnamed: 0,X1,X2,X3,X4,X5
0,0.496714,0.468929,-1.594428,0.756989,0.938284
1,-0.138264,0.001545,-0.599375,-0.922165,-0.516045
2,0.647689,0.734761,0.005244,0.869606,0.096121
3,1.52303,1.429184,0.046981,1.355638,-0.462275
4,-0.234153,-0.462857,-0.450065,0.413435,-0.434496


### Forward Selection (based on AIC)

In [4]:
def forward_selection(X, y):
    """Greedy forward selection of OLS predictors by AIC.

    Starting from the intercept-only model, repeatedly adds the single
    remaining column whose inclusion gives the lowest AIC, and stops as soon
    as no addition lowers the AIC of the current model.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response, passed unchanged to ``sm.OLS`` as the endogenous variable.

    Returns
    -------
    list
        Selected column labels in the order they were added. Empty when no
        single predictor improves on the intercept-only model.
    """
    remaining = list(X.columns)
    selected = []
    # Baseline is the intercept-only fit, so the first predictor has to earn
    # its place too (an initial score of +inf would admit it unconditionally).
    intercept_only = X.iloc[:, :0].assign(const=1.0)
    current_score = sm.OLS(y, intercept_only).fit().aic

    while remaining:
        # (aic, label) pairs: min() picks the lowest AIC; exact ties fall back
        # to comparing the labels, as the tuple ordering dictates.
        best_new_score, best_candidate = min(
            (sm.OLS(y, sm.add_constant(X[selected + [candidate]])).fit().aic, candidate)
            for candidate in remaining
        )
        if not best_new_score < current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score
    return selected

# Apply forward selection to the full predictor frame and report the chosen columns.
print('Forward selection result:', forward_selection(X=X, y=y))

Forward selection result: ['X1', 'X3', 'X5']


### Backward Elimination (p-value based)

In [5]:
def backward_elimination(X, y, alpha=0.05):
    """Backward elimination of OLS predictors by p-value.

    Fits OLS on all columns of ``X`` (plus an intercept), then repeatedly
    drops the column with the largest p-value while that p-value exceeds
    ``alpha``, refitting after every removal.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response passed to ``sm.OLS``.
    alpha : float, default 0.05
        A column is removed only if its p-value is strictly greater than this.

    Returns
    -------
    list
        Column labels that remain, in their original order; may be empty.
    """
    kept = list(X.columns)
    while kept:
        fit = sm.OLS(y, sm.add_constant(X[kept])).fit()
        # iloc[1:] skips the first p-value, which is taken to be the
        # intercept that sm.add_constant puts in front of the predictors.
        slope_pvalues = fit.pvalues.iloc[1:]
        weakest = slope_pvalues.idxmax()
        if not slope_pvalues[weakest] > alpha:
            break
        kept.remove(weakest)
    return kept

# Apply backward elimination (default alpha) to the full predictor frame and report what is kept.
print('Backward elimination result:', backward_elimination(X=X, y=y))

Backward elimination result: ['X1', 'X3', 'X5']


### Stepwise Selection (combines forward/backward; AIC + p-values)

In [6]:
def stepwise_selection(X, y, alpha_out=0.05):
    """Stepwise OLS selection: AIC-driven additions, p-value-driven removals.

    Each pass does two things:

    1. Forward half - fit every unselected column on top of the current
       selection and add the one with the lowest AIC, provided that AIC is
       strictly below the reference AIC.
    2. Backward half - refit the current selection and remove the single
       column with the largest p-value if it exceeds ``alpha_out`` (at most
       one removal per pass); the column returns to the candidate pool.

    The loop ends on the first pass in which neither half changes anything.

    The reference AIC is updated only when a column is added; it is not
    recomputed after a removal, so later additions must beat the best AIC
    reached so far.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response passed to ``sm.OLS``.
    alpha_out : float, default 0.05
        Removal threshold: a column is dropped only if its p-value is
        strictly greater than this.

    Returns
    -------
    list
        Selected column labels in the order they were (last) added.
    """
    def fit_ols(columns):
        # OLS of y on the given columns plus an intercept.
        return sm.OLS(y, sm.add_constant(X[columns])).fit()

    pool = list(X.columns)
    chosen = []
    reference_aic = float('inf')

    progressed = True
    while progressed:
        progressed = False

        # Forward half: best single addition by AIC.
        aic_by_candidate = {name: fit_ols(chosen + [name]).aic for name in pool}
        if aic_by_candidate:
            entrant, entrant_aic = min(aic_by_candidate.items(), key=lambda item: item[1])
            if entrant_aic < reference_aic:
                chosen.append(entrant)
                pool.remove(entrant)
                reference_aic = entrant_aic
                progressed = True

        # Backward half: drop the least significant column, if it is above the threshold.
        if chosen:
            # iloc[1:] skips the first p-value, taken to be the intercept
            # that sm.add_constant puts in front of the predictors.
            slope_pvalues = fit_ols(chosen).pvalues.iloc[1:]
            weakest = slope_pvalues.idxmax()
            if slope_pvalues[weakest] > alpha_out:
                chosen.remove(weakest)
                pool.append(weakest)
                progressed = True

    return chosen

# Apply stepwise selection (default alpha_out) to the full predictor frame and report the chosen columns.
print('Stepwise selection result:', stepwise_selection(X=X, y=y))

Stepwise selection result: ['X1', 'X3', 'X5']


### Best Subset Selection (try all combinations, choose by AIC)
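Note: only feasible for a small number of predictors — with $p$ predictors the number of candidate subsets grows as $2^p$, and one model is fitted per subset.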

In [7]:
def best_subset(X, y):
    """Exhaustive best-subset selection of OLS predictors by AIC.

    Fits an OLS model (with intercept) for every non-empty combination of
    the columns of ``X`` and returns the combination with the lowest AIC.
    The intercept-only model is the starting baseline, so an empty list is
    returned when no combination of predictors achieves a lower AIC (or when
    ``X`` has no columns at all).

    One model is fitted per subset, so the cost doubles with every added
    column.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one per column.
    y : array-like
        Response passed to ``sm.OLS``.

    Returns
    -------
    list
        Column labels of the best subset, in the column order of ``X``.
    """
    columns = list(X.columns)
    # Start from the intercept-only model: it is a legitimate candidate, and
    # it guarantees best_features is always a sequence (never None).
    best_features = ()
    best_aic = sm.OLS(y, X.iloc[:, :0].assign(const=1.0)).fit().aic

    for k in range(1, len(columns) + 1):
        for combo in itertools.combinations(columns, k):
            aic = sm.OLS(y, sm.add_constant(X[list(combo)])).fit().aic
            # Strict '<' keeps the earlier (smaller) subset on exact ties.
            if aic < best_aic:
                best_aic = aic
                best_features = combo
    return list(best_features)

# Search every subset of the predictor frame and report the one with the lowest AIC.
print('Best subset result:', best_subset(X=X, y=y))

Best subset result: ['X1', 'X3', 'X5']


### LASSO / Ridge / Elastic Net (regularized regression) — coefficients shown

In [8]:
from sklearn.preprocessing import StandardScaler

# Put the predictors on a common scale before fitting the penalised models,
# so the penalty is applied to comparable coefficients.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_arr = y.values

# The three estimators, each with the penalty strength used in this demo.
lasso = Lasso(alpha=0.1)
ridge = Ridge(alpha=1.0)
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)

# Fit each model in turn and print its coefficients keyed by column name.
for header, estimator in (
    ('LASSO coefficients:', lasso),
    ('Ridge coefficients:', ridge),
    ('ElasticNet coefficients:', elastic),
):
    estimator.fit(X_scaled, y_arr)
    print(header, dict(zip(X.columns, estimator.coef_)))

LASSO coefficients: {'X1': np.float64(2.5948434080769287), 'X2': np.float64(0.0), 'X3': np.float64(-1.9669784652132452), 'X4': np.float64(0.0), 'X5': np.float64(0.48229038223647686)}
Ridge coefficients: {'X1': np.float64(2.5563340568544692), 'X2': np.float64(0.12180052424514573), 'X3': np.float64(-2.0484148433978207), 'X4': np.float64(0.002050395745368879), 'X5': np.float64(0.5878256901861695)}
ElasticNet coefficients: {'X1': np.float64(1.8501130680552267), 'X2': np.float64(0.7328703732138373), 'X3': np.float64(-1.9230862213059166), 'X4': np.float64(0.0), 'X5': np.float64(0.5291241461729258)}


You can tweak dataset size, noise, and alpha/threshold parameters and re-run to see how selection results change.