In [10]:
import pandas as pd
import numpy as np

def _zscore(values):
    """Centre on the mean and divide by the square root of np.var."""
    return (values - np.mean(values)) / np.sqrt(np.var(values))


def _mean_normalise(values):
    """Centre on the mean and divide by the max-min range."""
    return (values - np.mean(values)) / (max(values) - min(values))


# Load the raw table; every feature below is derived from it.
data = pd.read_csv('auto-mpg.csv')

# Continuous inputs: acceleration is used as-is, the other three on a log scale.
acc = data['acceleration']
logdisp, loghorse, logweight = (
    np.log(data[column]) for column in ('displacement', 'horsepower', 'weight')
)

# Min-max scaling for acceleration.
acc_low, acc_high = min(acc), max(acc)
scaled_acc = (acc - acc_low) / (acc_high - acc_low)

# Log-displacement and log-weight are z-scored; log-horsepower is
# mean-normalised (centred on the mean, divided by its range).
scaled_disp = _zscore(logdisp)
scaled_horse = _mean_normalise(loghorse)
scaled_weight = _zscore(logweight)

scaled_features = pd.DataFrame({
    'acc': scaled_acc,
    'disp': scaled_disp,
    'horse': scaled_horse,
    'weight': scaled_weight,
})

# One-hot encode the categorical columns, dropping the first level of each.
cyl_dummies, yr_dummies, orig_dummies = (
    pd.get_dummies(data[column], prefix=prefix, drop_first=True)
    for column, prefix in (('cylinders', 'cyl'), ('model year', 'yr'), ('origin', 'orig'))
)

# Final modelling frame: target first, then scaled features, then dummies.
mpg = data['mpg']
data_fin = pd.concat(
    [mpg, scaled_features, cyl_dummies, yr_dummies, orig_dummies],
    axis=1,
)
data_fin.head()

Unnamed: 0,mpg,acc,disp,horse,weight,cyl_4,cyl_5,cyl_6,cyl_8,yr_71,...,yr_75,yr_76,yr_77,yr_78,yr_79,yr_80,yr_81,yr_82,orig_2,orig_3
0,18.0,0.238095,1.125829,0.173727,0.720986,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,0.208333,1.372223,0.32186,0.908047,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,0.178571,1.191999,0.262641,0.651205,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,0.238095,1.10737,0.262641,0.648095,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,0.14881,1.094964,0.219773,0.664652,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Smaller frame: target, scaled acceleration, scaled weight and origin dummies,
# joined side by side.
ols_parts = [mpg, scaled_acc, scaled_weight, orig_dummies]
data_ols = pd.concat(ols_parts, axis='columns')
data_ols.head(3)

Unnamed: 0,mpg,acceleration,weight,orig_2,orig_3
0,18.0,0.238095,0.720986,0,0
1,15.0,0.208333,0.908047,0,0
2,18.0,0.178571,0.651205,0,0


In [17]:
from statsmodels.formula.api import ols

# Regress mpg on every other column of data_fin through the formula interface.
outcome = 'mpg'
predictors = data_fin.drop(columns=outcome)
pred_sum = '+'.join(predictors.columns)
formula = '~'.join([outcome, pred_sum])

model = ols(formula=formula, data=data_fin).fit()
model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.894
Model:,OLS,Adj. R-squared:,0.887
Method:,Least Squares,F-statistic:,141.0
Date:,"Fri, 24 Jun 2022",Prob (F-statistic):,1.4600000000000002e-164
Time:,15:02:41,Log-Likelihood:,-921.93
No. Observations:,392,AIC:,1890.0
Df Residuals:,369,BIC:,1981.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,14.1254,1.727,8.182,0.000,10.730,17.520
acc,-3.9913,1.533,-2.604,0.010,-7.006,-0.977
disp,-1.4917,0.767,-1.946,0.052,-2.999,0.016
horse,-11.2563,2.305,-4.884,0.000,-15.789,-6.724
weight,-2.7265,0.608,-4.485,0.000,-3.922,-1.531
cyl_4,8.1204,1.530,5.307,0.000,5.111,11.130
cyl_5,8.8237,2.262,3.901,0.000,4.376,13.271
cyl_6,7.6660,1.807,4.243,0.000,4.113,11.219
cyl_8,10.0134,1.983,5.050,0.000,6.114,13.912

0,1,2,3
Omnibus:,42.861,Durbin-Watson:,1.711
Prob(Omnibus):,0.0,Jarque-Bera (JB):,108.998
Skew:,0.534,Prob(JB):,2.14e-24
Kurtosis:,5.353,Cond. No.,45.1


In [13]:
import statsmodels.api as sm

def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS

    Each pass first tries to add the single excluded feature with the
    smallest p-value (forward step) and then tries to drop the single
    included feature with the largest p-value (backward step). The loop
    stops on the first pass in which neither step changes the selection.

    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X);
            it is copied, never mutated
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features, in the order they were added
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    # Work on a copy so neither the caller's list nor the shared default changes.
    included = list(initial_list)
    while True:
        changed = False

        # Forward step: fit one model per candidate and record that
        # candidate's p-value. Candidates keep X's column order so ties
        # are broken deterministically (a set difference would not).
        excluded = [column for column in X.columns if column not in included]
        # Explicit float dtype: a dtype-less empty Series is deprecated and
        # becomes object dtype on recent pandas, which breaks idxmin().
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when no candidates remain -> comparison is False
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # Backward step: refit on the current selection and drop the worst
        # feature if it is no longer significant. Nothing to test when the
        # selection is empty.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Select the feature p-values by label rather than by position so
            # the intercept is skipped wherever add_constant placed it.
            pvalues = model.pvalues.loc[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                # idxmax() yields the feature *label*; argmax() yields an
                # integer position, which list.remove() cannot find.
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break
    return included

In [18]:
# Run the forward-backward selection over the candidate predictors of mpg,
# printing each inclusion/exclusion as it happens.
result = stepwise_selection(X=predictors, y=data_fin['mpg'], verbose=True)
print('resulting features:', result, sep='\n')



Add  weight                         with p-value 1.16293e-107
Add  yr_80                          with p-value 7.52653e-18
Add  yr_82                          with p-value 1.28402e-16
Add  yr_81                          with p-value 1.12539e-14
Add  yr_79                          with p-value 9.69425e-12
Add  horse                          with p-value 2.03073e-06
Add  cyl_6                          with p-value 3.35234e-05
Add  yr_78                          with p-value 6.1539e-06
Add  yr_77                          with p-value 7.59073e-05
Add  yr_73                          with p-value 0.00998426
resulting features:
['weight', 'yr_80', 'yr_82', 'yr_81', 'yr_79', 'horse', 'cyl_6', 'yr_78', 'yr_77', 'yr_73']


In [19]:
# Features to keep for the reduced model, listed once and reused below.
selected_features = ['weight', 'yr_80', 'yr_82', 'yr_81', 'yr_79', 'horse', 'cyl_6', 'yr_78', 'yr_77', 'yr_73']

outcome = 'mpg'
predictors = data_fin[selected_features]
pred_sum = '+'.join(predictors.columns)
formula = '~'.join((outcome, pred_sum))

# Refit OLS on the reduced feature set and show the regression table.
model = ols(formula=formula, data=data_fin).fit()
model.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.878
Model:,OLS,Adj. R-squared:,0.875
Method:,Least Squares,F-statistic:,274.5
Date:,"Fri, 24 Jun 2022",Prob (F-statistic):,2.59e-167
Time:,15:04:12,Log-Likelihood:,-948.68
No. Observations:,392,AIC:,1919.0
Df Residuals:,381,BIC:,1963.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,21.7577,0.227,95.674,0.000,21.311,22.205
weight,-3.9348,0.308,-12.766,0.000,-4.541,-3.329
yr_80,8.3765,0.593,14.135,0.000,7.211,9.542
yr_82,6.9604,0.560,12.427,0.000,5.859,8.062
yr_81,5.7061,0.576,9.911,0.000,4.574,6.838
yr_79,4.1337,0.557,7.426,0.000,3.039,5.228
horse,-8.4346,1.454,-5.799,0.000,-11.294,-5.575
cyl_6,-1.7441,0.369,-4.731,0.000,-2.469,-1.019
yr_78,2.3922,0.509,4.702,0.000,1.392,3.393

0,1,2,3
Omnibus:,22.134,Durbin-Watson:,1.691
Prob(Omnibus):,0.0,Jarque-Bera (JB):,50.005
Skew:,0.262,Prob(JB):,1.39e-11
Kurtosis:,4.669,Cond. No.,11.3


In [1]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# The target has to be the mpg column itself: `outcome` only holds the column
# *name* (the string 'mpg'), which cannot be split row-wise with `predictors`.
X_train, X_test, y_train, y_test = train_test_split(
    predictors, data_fin['mpg'], test_size=0.30, random_state=1
)

NameError: name 'predictors' is not defined

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Build the estimator in this cell: no visible cell defines `linearregression`,
# so using the bare name would raise NameError on a fresh kernel.
linearregression = LinearRegression()
# 10-fold cross-validation on the training split; with no `scoring` argument
# the estimator's default scorer is used.
cv_Score11 = cross_val_score(linearregression, X_train, y_train, cv=10)