In [1]:
# Roger H Hayden III
# Johns Hopkins University
# Statistical Models and Regression Module 9 & 10 Assignment
# 03/21/2023

In [2]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd

from sklearn.linear_model import LinearRegression
import sklearn
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [3]:
# Load the assignment data set (Data Table B.2) from the Excel workbook.
# NOTE(review): this is an absolute path on one user's Windows machine, so the
# cell will only run there unless the path is adjusted.
data_path = r'C:\Users\roger\OneDrive\Desktop\Education\Johns Hopkins Courses\Spring 2023\Statistical Models and Regression\Module9&10Assignment_Roger_Hayden\Data Table B.2.xlsx'
df = pd.read_excel(data_path)
df

Unnamed: 0,y,x1,x2,x3,x4,x5
0,271.8,783.35,33.53,40.55,16.66,13.2
1,264.0,748.45,36.5,36.19,16.46,14.11
2,238.8,684.45,34.66,37.31,17.66,15.68
3,230.7,827.8,33.13,32.52,17.5,10.53
4,263.9,909.45,34.6,34.85,16.06,11.96
5,266.5,905.55,35.38,35.89,15.93,12.58
6,229.1,756.0,35.85,33.53,16.6,10.66
7,239.3,769.35,35.68,33.79,16.41,10.85
8,258.0,793.5,35.35,34.72,16.17,11.41
9,257.6,801.65,35.04,35.22,15.92,11.91


In [4]:
# Build the regressor matrix by column name instead of by position, so the
# selection does not silently change if the spreadsheet's column order does.
# The notebook output shows the frame's columns are y, x1, ..., x5, so this
# picks the same five columns the positional slice 1:6 did.
feature_columns = ['x1', 'x2', 'x3', 'x4', 'x5']
X = df[feature_columns]
X

Unnamed: 0,x1,x2,x3,x4,x5
0,783.35,33.53,40.55,16.66,13.2
1,748.45,36.5,36.19,16.46,14.11
2,684.45,34.66,37.31,17.66,15.68
3,827.8,33.13,32.52,17.5,10.53
4,909.45,34.6,34.85,16.06,11.96
5,905.55,35.38,35.89,15.93,12.58
6,756.0,35.85,33.53,16.6,10.66
7,769.35,35.68,33.79,16.41,10.85
8,793.5,35.35,34.72,16.17,11.41
9,801.65,35.04,35.22,15.92,11.91


In [5]:
# Response variable: the 'y' column of the loaded table, as a Series.
y = df.loc[:, 'y']
y

0     271.8
1     264.0
2     238.8
3     230.7
4     263.9
5     266.5
6     229.1
7     239.3
8     258.0
9     257.6
10    267.3
11    267.0
12    196.0
13    278.7
14    272.3
15    267.4
16    254.5
17    224.7
18    181.5
19    227.5
Name: y, dtype: float64

# Part A - Forward Selection

In [6]:
def forward_selection(data, target, significance_level=0.05):
    """Greedy forward stepwise selection of OLS regressors by p-value.

    In each round, one OLS model is fitted per not-yet-selected column, using
    an intercept, the columns selected so far, and that candidate column. The
    candidate whose coefficient has the smallest p-value is added to the
    selection if that p-value is below ``significance_level``; otherwise the
    search stops. The search also stops once every column has been selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate regressors, one per column.
    target : array-like
        Response values, passed to ``sm.OLS`` as the dependent variable.
    significance_level : float, default 0.05
        A candidate enters the model only if its p-value is strictly below
        this threshold.

    Returns
    -------
    list
        Labels of the selected columns, in the order they entered the model.
        Empty if ``data`` has no columns or no candidate is significant.
    """
    # Keep candidates in a list (not a set) so iteration order, and therefore
    # tie-breaking in idxmin, follows the column order of `data`.
    remaining_features = data.columns.tolist()
    best_features = []

    while remaining_features:
        candidate_pvalues = {}
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            candidate_pvalues[new_column] = model.pvalues[new_column]

        # Explicit float dtype avoids the pandas warning raised when a Series
        # is created without data and without a dtype.
        new_pval = pd.Series(candidate_pvalues, dtype=float)
        min_p_value = new_pval.min()

        # Written as "not <" so that a NaN p-value also ends the search.
        if not (min_p_value < significance_level):
            break

        chosen = new_pval.idxmin()
        best_features.append(chosen)
        remaining_features.remove(chosen)

    return best_features

In [7]:
# Run the p-value based forward selection at the 5% entry threshold.
forward_selection(data=X, target=y, significance_level=0.05)

  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)
  new_pval = pd.Series(index=remaining_features)


['x4', 'x3']

In [12]:
# Forward sequential selection with mlxtend, scored by R^2.
# k_features=5 makes the search run until all five regressors are included,
# so the full entry path is recorded. cv=0 turns cross-validation off, so each
# score is the fit on the data the model was trained on.
sfs_forward = SFS(
    estimator=LinearRegression(),
    k_features=5,
    forward=True,
    floating=False,
    scoring='r2',
    cv=0,
)
sfs_forward.fit(X, y)

sfs_forward.k_feature_names_

('x1', 'x2', 'x3', 'x4', 'x5')

In [13]:
# Show the forward search record: a dict keyed by subset size, each entry
# holding the chosen feature indices, their names, and the R^2 score.
sfs_forward.subsets_

{1: {'feature_idx': (3,),
  'cv_scores': array([0.73214942]),
  'avg_score': 0.7321494246942934,
  'feature_names': ('x4',)},
 2: {'feature_idx': (2, 3),
  'cv_scores': array([0.85959238]),
  'avg_score': 0.8595923782383846,
  'feature_names': ('x3', 'x4')},
 3: {'feature_idx': (1, 2, 3),
  'cv_scores': array([0.87145264]),
  'avg_score': 0.8714526449923055,
  'feature_names': ('x2', 'x3', 'x4')},
 4: {'feature_idx': (0, 1, 2, 3),
  'cv_scores': array([0.89302493]),
  'avg_score': 0.8930249252198001,
  'feature_names': ('x1', 'x2', 'x3', 'x4')},
 5: {'feature_idx': (0, 1, 2, 3, 4),
  'cv_scores': array([0.90082642]),
  'avg_score': 0.9008264195987536,
  'feature_names': ('x1', 'x2', 'x3', 'x4', 'x5')}}

# Part B - Backward Selection

In [14]:
# Backward sequential selection with mlxtend, scored by R^2.
# forward=False starts from all regressors and removes one per step;
# k_features=1 makes the elimination run down to a single regressor so the
# full removal path is recorded. cv=0 turns cross-validation off.
sfs_backward = SFS(
    estimator=LinearRegression(),
    k_features=1,
    forward=False,
    floating=False,
    scoring='r2',
    cv=0,
)
sfs_backward.fit(X, y)

sfs_backward.k_feature_names_

('x4',)

In [15]:
# Show the backward elimination record: a dict keyed by subset size, each
# entry holding the retained feature indices, their names, and the R^2 score.
sfs_backward.subsets_

{5: {'feature_idx': (0, 1, 2, 3, 4),
  'cv_scores': array([0.90082642]),
  'avg_score': 0.9008264195987536,
  'feature_names': ('x1', 'x2', 'x3', 'x4', 'x5')},
 4: {'feature_idx': (0, 1, 2, 3),
  'cv_scores': array([0.89302493]),
  'avg_score': 0.8930249252198001,
  'feature_names': ('x1', 'x2', 'x3', 'x4')},
 3: {'feature_idx': (1, 2, 3),
  'cv_scores': array([0.87145264]),
  'avg_score': 0.8714526449923055,
  'feature_names': ('x2', 'x3', 'x4')},
 2: {'feature_idx': (2, 3),
  'cv_scores': array([0.85959238]),
  'avg_score': 0.8595923782383846,
  'feature_names': ('x3', 'x4')},
 1: {'feature_idx': (3,),
  'cv_scores': array([0.73214942]),
  'avg_score': 0.7321494246942934,
  'feature_names': ('x4',)}}