In [1]:
import pandas as pd

In [2]:
# Load the advertising data set; the path is relative to the notebook's working directory.
ADVERTISING_CSV = '../Data Sets/Advertising.csv'
advertising_data = pd.read_csv(ADVERTISING_CSV)

In [4]:
# Preview the first five rows to check the loaded columns (TV, radio, newspaper, sales).
advertising_data.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


$$
\text{Sales} = \beta_0 + \beta_1 \times \text{TV} + \beta_2 \times \text{Radio} + \beta_3 \times \text{Newspaper} + \epsilon
$$

In [7]:
# Feature matrix: every column except the response 'sales'.
# For this data set that leaves TV, radio and newspaper, i.e. the same frame as
# advertising_data[['TV', 'radio', 'newspaper']].
X = advertising_data.drop(columns='sales')

In [8]:
# Preview the feature matrix; 'sales' must no longer be among the columns.
X.head(n=5)

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [9]:
# Response vector: the 'sales' column as a Series, sharing the row index of X.
y = advertising_data.loc[:, 'sales']

In [10]:
# Preview the first five response values.
y.head(n=5)

0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
Name: sales, dtype: float64

Multiple Linear Regression using statsmodels

In [11]:
import statsmodels.api as sm

In [12]:
# sm.OLS fits exactly the columns it is given, so the intercept has to be supplied
# explicitly: add_constant puts a 'const' column of ones in front of the features.
X_sm = sm.add_constant(X, prepend=True)

In [14]:
# Preview the design matrix; 'const' should be the first column and equal to 1.0.
X_sm.head(n=5)

Unnamed: 0,const,TV,radio,newspaper
0,1.0,230.1,37.8,69.2
1,1.0,44.5,39.3,45.1
2,1.0,17.2,45.9,69.3
3,1.0,151.5,41.3,58.5
4,1.0,180.8,10.8,58.4


In [15]:
# Ordinary least squares of sales (endog) on the intercept plus the three
# advertising features (exog); model_sm holds the fitted results object.
ols_spec = sm.OLS(endog=y, exog=X_sm)
model_sm = ols_spec.fit()

In [16]:
# Plain-text regression report: fit statistics followed by the coefficient table.
summary_sm = model_sm.summary()
print(summary_sm)

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Fri, 28 Mar 2025   Prob (F-statistic):           1.58e-96
Time:                        11:24:25   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.0

In [17]:
import numpy as np

In [18]:
# Residual standard error: square root of the residual mean square,
# where mse_resid is statsmodels' name for ssr / df_resid.
RSE = np.sqrt(model_sm.mse_resid)

In [19]:
# Show the residual standard error (same units as 'sales').
RSE

1.685510373414744

In [21]:
# Three hypothetical advertising budgets to score with the fitted models.
# One row per scenario; values are listed in the column order TV, radio, newspaper,
# which matches the feature columns used for training.
unseen_X = pd.DataFrame(
    np.array([
        [200, 50, 15],
        [150, 75, 30],
        [300, 100, 45],
    ]),
    columns=['TV', 'radio', 'newspaper'],
)

In [22]:
# Show the new observations that will be passed to both models.
unseen_X

Unnamed: 0,TV,radio,newspaper
0,200,50,15
1,150,75,30
2,300,100,45


In [24]:
# Predict sales for the new observations with the fitted statsmodels OLS model.
# The model was trained on a design matrix with a leading 'const' column, so the
# same column must be added here.
# has_constant='add' forces that column to be added. With the default ('skip'),
# add_constant returns its input unchanged whenever it finds an already-constant
# column -- e.g. when predicting a single row, or when every row shares one
# feature value -- and predict() then fails because the intercept is missing.
predictions_sm = model_sm.predict(sm.add_constant(unseen_X, has_constant='add'))

In [25]:
# Show the statsmodels predictions: a Series indexed like unseen_X.
predictions_sm

0    21.502757
1    23.912213
2    35.474598
dtype: float64

In [27]:
# Label the predictions so the concatenated table gets a meaningful column header,
# then show each set of inputs next to its predicted sales (aligned on the row index).
predictions_sm = predictions_sm.rename('Predictions')
pd.concat(objs=[unseen_X, predictions_sm], axis='columns')

Unnamed: 0,TV,radio,newspaper,Predictions
0,200,50,15,21.502757
1,150,75,30,23.912213
2,300,100,45,35.474598


Multiple Linear Regression using scikit-learn

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
# Unfitted scikit-learn linear model. Unlike sm.OLS it estimates the intercept
# itself (fit_intercept=True is the default), so no constant column is needed.
model_sk = LinearRegression(fit_intercept=True)

In [30]:
# Fit on the raw feature matrix (no 'const' column) and the sales response.
model_sk.fit(X=X, y=y)

In [31]:
# Estimated intercept; it agrees with the 'const' coefficient (2.9389) reported
# in the statsmodels summary.
model_sk.intercept_

2.9388893694594085

In [32]:
# Estimated slope coefficients, one per feature, in the column order of X
# (TV, radio, newspaper).
model_sk.coef_

array([ 0.04576465,  0.18853002, -0.00103749])

In [33]:
# Predict sales for the new observations; the columns of unseen_X match the
# training features, and no constant column is required for scikit-learn.
predictions_sk = model_sk.predict(X=unseen_X)

In [36]:
# Check what predict() returned: a plain NumPy array, so it carries no index or
# name and is wrapped in a Series before being joined to unseen_X.
type(predictions_sk)

numpy.ndarray

In [38]:
# Wrap the NumPy predictions in a named Series that carries unseen_X's index.
# pd.concat(axis=1) aligns on index labels, so a Series with a default 0..n-1 index
# would be mismatched (producing NaN rows) if unseen_X ever used any other index.
# np.asarray drops any index already attached, which keeps this cell safe to re-run
# after predictions_sk has been converted to a Series.
predictions_sk = pd.Series(
    np.asarray(predictions_sk), index=unseen_X.index, name='Predictions'
)
# Show each set of inputs next to its predicted sales.
pd.concat([unseen_X, predictions_sk], axis=1)

Unnamed: 0,TV,radio,newspaper,Predictions
0,200,50,15,21.502757
1,150,75,30,23.912213
2,300,100,45,35.474598
