In [419]:
pip install mlxtend

Note: you may need to restart the kernel to use updated packages.


In [420]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector

In [421]:
df = pd.read_csv("Carseats.csv")

In [422]:
df['Sales'] = df['Sales'].astype(int)

In [423]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11,111,48,16,260,83,Good,65,10,Yes,Yes
2,10,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4,141,64,3,340,128,Bad,38,13,Yes,No


In [424]:
df = df.drop(columns = ['ShelveLoc', 'Urban', 'US'])

In [425]:
df.shape

(400, 8)

In [426]:
df.isnull().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
Age            0
Education      0
dtype: int64

In [427]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education


In [428]:
X = df[['CompPrice', 'Income', 'Population', 'Advertising', 'Price', 'Age', 'Education']]
y = df['Sales']

In [429]:
X.shape

(400, 7)

In [430]:
X.head()

Unnamed: 0,CompPrice,Income,Population,Advertising,Price,Age,Education
0,138,73,276,11,120,42,17
1,111,48,260,16,83,65,10
2,113,35,269,10,80,59,12
3,117,100,466,4,97,55,14
4,141,64,340,3,128,38,13


In [431]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [432]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

In [433]:
from sklearn.ensemble import RandomForestClassifier

In [434]:
X_train.shape

(280, 7)

In [435]:
forward_feature_selection = SequentialFeatureSelector(RandomForestClassifier(n_jobs=-1),
                                                     k_features= 6,
                                                     forward=True,
                                                     floating=False,
                                                     verbose=2,
                                                     scoring= "accuracy",
                                                     cv= 5).fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.8s finished

[2024-06-21 16:16:44] Features: 1/6 -- score: 0.14285714285714285[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.0s finished

[2024-06-21 16:16:45] Features: 2/6 -- score: 0.13214285714285717[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished

[2024-06-21 16:16:45] Features: 3/6 -- score: 0.13214285714285715[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [436]:
forward_feature_selection.k_feature_idx_

(0, 1, 3, 4, 5, 6)

In [437]:
forward_feature_selection.k_feature_names_

('CompPrice', 'Income', 'Advertising', 'Price', 'Age', 'Education')

In [438]:
forward_feature_selection.k_score_

0.1285714285714286

In [439]:
pd.DataFrame.from_dict(forward_feature_selection.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(3,)","[0.14285714285714285, 0.16071428571428573, 0.1...",0.142857,"(Advertising,)",0.032458,0.025254,0.012627
2,"(0, 3)","[0.08928571428571429, 0.05357142857142857, 0.1...",0.132143,"(CompPrice, Advertising)",0.069008,0.05369,0.026845
3,"(0, 3, 4)","[0.07142857142857142, 0.125, 0.214285714285714...",0.132143,"(CompPrice, Advertising, Price)",0.060898,0.04738,0.02369
4,"(0, 3, 4, 6)","[0.17857142857142858, 0.10714285714285714, 0.1...",0.15,"(CompPrice, Advertising, Price, Education)",0.062604,0.048708,0.024354
5,"(0, 3, 4, 5, 6)","[0.10714285714285714, 0.14285714285714285, 0.1...",0.135714,"(CompPrice, Advertising, Price, Age, Education)",0.023406,0.018211,0.009105
6,"(0, 1, 3, 4, 5, 6)","[0.16071428571428573, 0.10714285714285714, 0.1...",0.128571,"(CompPrice, Income, Advertising, Price, Age, E...",0.036723,0.028571,0.014286


In [440]:
forward_feature_selection = SequentialFeatureSelector(RandomForestClassifier(n_jobs=-1),
                                                     k_features= (1,7),
                                                     forward=True,
                                                     floating=False,
                                                     verbose=2,
                                                     scoring= "accuracy",
                                                     cv= 5).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.1s finished

[2024-06-21 16:16:48] Features: 1/7 -- score: 0.1392857142857143[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.0s finished

[2024-06-21 16:16:49] Features: 2/7 -- score: 0.1392857142857143[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished

[2024-06-21 16:16:50] Features: 3/7 -- score: 0.13571428571428573[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1

In [441]:
forward_feature_selection.k_feature_names_

('Advertising',)

In [442]:
forward_feature_selection.k_score_

0.1392857142857143

In [443]:
pd.DataFrame.from_dict(forward_feature_selection.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(3,)","[0.14285714285714285, 0.14285714285714285, 0.1...",0.139286,"(Advertising,)",0.030449,0.02369,0.011845
2,"(0, 3)","[0.125, 0.05357142857142857, 0.196428571428571...",0.139286,"(CompPrice, Advertising)",0.063936,0.049744,0.024872
3,"(0, 3, 4)","[0.10714285714285714, 0.10714285714285714, 0.1...",0.135714,"(CompPrice, Advertising, Price)",0.034351,0.026726,0.013363
4,"(0, 3, 4, 6)","[0.14285714285714285, 0.08928571428571429, 0.1...",0.135714,"(CompPrice, Advertising, Price, Education)",0.04726,0.03677,0.018385
5,"(0, 2, 3, 4, 6)","[0.16071428571428573, 0.08928571428571429, 0.1...",0.139286,"(CompPrice, Population, Advertising, Price, Ed...",0.044505,0.034626,0.017313
6,"(0, 1, 2, 3, 4, 6)","[0.17857142857142858, 0.125, 0.196428571428571...",0.132143,"(CompPrice, Income, Population, Advertising, P...",0.062604,0.048708,0.024354
7,"(0, 1, 2, 3, 4, 5, 6)","[0.10714285714285714, 0.05357142857142857, 0.1...",0.1,"(CompPrice, Income, Population, Advertising, P...",0.034351,0.026726,0.013363


In [444]:
import statsmodels.api as sm

In [445]:
X = sm.add_constant(X_train)
model = sm.OLS(y_train, X)
model_fit = model.fit()
print(model_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.528
Model:                            OLS   Adj. R-squared:                  0.516
Method:                 Least Squares   F-statistic:                     43.54
Date:                Fri, 21 Jun 2024   Prob (F-statistic):           5.60e-41
Time:                        16:16:52   Log-Likelihood:                -588.10
No. Observations:                 280   AIC:                             1192.
Df Residuals:                     272   BIC:                             1221.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           6.2947      1.358      4.636      