In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

# Read the Carseats data; the first CSV column is used as the row index.
CARSEATS_CSV = "./data/Carseats.csv"
carseat = pd.read_csv(CARSEATS_CSV, index_col=0)
# Preview the first rows as the cell output.
carseat.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
1,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
2,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
3,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
4,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
5,4.15,141,64,3,340,128,Bad,38,13,Yes,No


### (a) Fit a multiple regression model to predict Sales using Price, Urban, and US

In [13]:
# Encode the Yes/No columns Urban and US as 1/0 indicators for the regression.
# The dtype guard keeps this cell safe to re-run: passing an already-encoded
# (numeric) column through the Yes/No table again would map every value to NaN.
YES_NO_CODES = {"Yes": 1, "No": 0}
for column in ("Urban", "US"):
    if not pd.api.types.is_numeric_dtype(carseat[column]):
        carseat[column] = carseat[column].map(YES_NO_CODES)

In [14]:
# Least-squares fit of Sales on Price and the two encoded indicator columns.
predictors = carseat[["Price", "Urban", "US"]]
response = carseat["Sales"]
model = LinearRegression()
model.fit(predictors, response)

### (b) Provide an interpretation of each coefficient in the model

In [15]:
# Pull the fitted parameters out of the scikit-learn estimator. The
# coefficients are in the column order used for fitting: Price, Urban, US.
coef = model.coef_
intercept = model.intercept_

for label, value in zip(("Price", "Urban", "US"), coef):
    print(f"{label} coefficient: {value}")
print(f"Intercept: {intercept}")

Price coefficient: -0.0544588491775822
Urban coefficient: -0.021916150814140667
US coefficient: 1.2005726977941165
Intercept: 13.043468936764896


### (c) Write out the model in equation form, being careful to handle the qualitative variables properly

In [18]:
# Show the model three ways: symbolic, with the predictor names, and with the
# fitted values (rounded to two decimals) substituted for the betas.
equation_lines = (
    "Y = B0 + B1X1 + B2X2 + B3X3",
    "Y = B0 + B1*Price + B2*Urban + B3*US",
    "Y = {:.2f} + {:.2f}*X1 + {:.2f}*X2 + {:.2f}*X3".format(intercept, *coef),
)
for line in equation_lines:
    print(line)

Y = B0 + B1X1 + B2X2 + B3X3
Y = B0 + B1*Price + B2*Urban + B3*US
Y = 13.04 + -0.05*X1 + -0.02*X2 + 1.20*X3


### (d) For which of the predictors can you reject the null hypothesis H0 : βj = 0?

In [23]:
# Refit the three-predictor model with statsmodels, whose summary table reports
# a standard error, t-statistic and p-value for every coefficient.
# `sm` (statsmodels.api) comes from the notebook's import cell.
X = sm.add_constant(carseat[["Price", "Urban", "US"]])  # prepend the intercept column
y = carseat["Sales"]

# NOTE: from here on `model` is the statsmodels results object, no longer the
# scikit-learn estimator fitted earlier, so `model.coef_` is not available.
model = sm.OLS(y, X).fit()
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.239
Model:                            OLS   Adj. R-squared:                  0.234
Method:                 Least Squares   F-statistic:                     41.52
Date:                Mon, 27 Mar 2023   Prob (F-statistic):           2.39e-23
Time:                        16:21:23   Log-Likelihood:                -927.66
No. Observations:                 400   AIC:                             1863.
Df Residuals:                     396   BIC:                             1879.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.0435      0.651     20.036      0.0

P>|t| for Price and US is below 0.05 in both cases, so we can reject the null hypothesis for both of them.

For Urban, however, P>|t| is about 0.94 — far above 0.05 — so we cannot reject the null hypothesis for it.

This means that Price and US are significant predictors of Sales, but Urban is not.

### (e) On the basis of your response to the previous question, fit a smaller model that only uses the predictors for which there is evidence of association with the outcome

In [26]:
# Fit the reduced model: Sales on Price and US only, with Urban dropped.
# `sm` (statsmodels.api) comes from the notebook's import cell.
X = sm.add_constant(carseat[["Price", "US"]])  # prepend the intercept column
y = carseat["Sales"]

model = sm.OLS(y, X).fit()
# In-sample predictions on the design matrix; not read by any cell visible
# here, kept in case a later cell uses them.
predictions = model.predict(X)
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.239
Model:                            OLS   Adj. R-squared:                  0.235
Method:                 Least Squares   F-statistic:                     62.43
Date:                Mon, 27 Mar 2023   Prob (F-statistic):           2.66e-24
Time:                        16:30:10   Log-Likelihood:                -927.66
No. Observations:                 400   AIC:                             1861.
Df Residuals:                     397   BIC:                             1873.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.0308      0.631     20.652      0.0

Compared with the previous model, the new model has the same R-squared (0.239) but a lower AIC (1861 vs. 1863) and a lower BIC (1873 vs. 1879), so by these criteria the smaller model is better: dropping Urban removes a parameter without any loss of fit.

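* $\mathrm{AIC} = n\log(\mathrm{RSS}/n) + 2p$
* $\mathrm{BIC} = n\log(\mathrm{RSS}/n) + p\log(n)$
* Here $n$ is the number of observations, $p$ the number of estimated parameters, and $\mathrm{RSS}$ the residual sum of squares. Both expressions hold up to an additive constant that depends only on $n$, so it cancels when the two models are compared.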