In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv("../data_sets/Advertising.csv", index_col=0)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [3]:
number_of_samples, number_of_features = (
    df.shape[0],
    df.shape[1] - 1,
    # -1 because Sales is label and not a feature

)
number_of_samples, number_of_features

(200, 3)

In [4]:
df.shape

(200, 4)

In [5]:
x, y = df.drop("Sales", axis="columns"), df["Sales"]
x.head()

Unnamed: 0,TV,Radio,Newspaper
1,230.1,37.8,69.2
2,44.5,39.3,45.1
3,17.2,45.9,69.3
4,151.5,41.3,58.5
5,180.8,10.8,58.4


In [6]:
y.head()

1    22.1
2    10.4
3     9.3
4    18.5
5    12.9
Name: Sales, dtype: float64

## SKlearn - typical steps
1. train|test split, sometimes train|val|test split
2. scaling sometimes required
    -  min-max scaling
    - Standardization
    - ...
    - Scale the training data
    - Scale test data to the training data --> avoiding data leakage
3. Fit algorithm to training data- model training
4. Predict test data
5. Evaluate

### Train|test split


In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((140, 3), (60, 3), (140,), (60,))

### Feature scaling
Normalisation (min-max feature scaling)
- $X' = \frac{X-X_{\min}}{X_{max}-X_{min}}$

Feature standardization
- $X' = \frac{X-\mu}{\sigma}$


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate a scaler instance
scaler= MinMaxScaler()
scaler.fit(x_train) #Important- use this for training data

scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

print(f"{scaled_x_train.min()=}")
print(f"{scaled_x_train.max()=}")
print(f"{scaled_x_test.min()=}")
print(f"{scaled_x_test.max()=}")

scaled_x_train.min()=0.0
scaled_x_train.max()=1.0
scaled_x_test.min()=0.005964214711729622
scaled_x_test.max()=1.1302186878727631


In [10]:
scaled_x_train.shape, scaled_x_test.shape

((140, 3), (60, 3))

### Linear regression
#### Ordinary least squares

In [12]:
from sklearn.linear_model import LinearRegression

model_OLS = LinearRegression()
model_OLS.fit(scaled_x_train, y_train) # fit on training data 
print(f"Parameters{model_OLS.coef_}") #beta1, beta2, beta3
print(f"intercept{model_OLS.intercept_}") #beta0


Parameters[13.02832938  9.88465985  0.69237469]
intercept2.741855324852823


### Stochastic gradient descent 

In [16]:
from sklearn.linear_model import SGDRegressor

model_SGD= SGDRegressor(loss="squared_error", max_iter= 10000)
model_SGD.fit(scaled_x_train, y_train)
print(print(f"Parameters{model_SGD.coef_}")) #beta1, beta2, beta3
print(f"intercept{model_SGD.intercept_}") #beta0


Parameters[11.95472216  8.99283203  1.33445173]
None
intercept[3.58924989]


### Manual prediction


In [17]:
test_sample_features = scaled_x_test[0].reshape(1,-1)
test_sample_label = y_test.values[0]
test_sample_features, test_sample_label

(array([[0.54988164, 0.63709677, 0.52286282]]), 16.9)

In [19]:
test_sample_features.shape

(1, 3)

In [20]:
model_OLS.predict(test_sample_features)[0]



16.565396297434837

In [21]:
model_SGD.predict(test_sample_features)[0]


16.58997155436136

In [22]:
x_test.iloc[0].to_numpy()

array([163.3,  31.6,  52.9])

### Evaluation

In [26]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# 1. predict on test data

y_pred_OLS = model_OLS.predict(scaled_x_test)
y_pred_SGD= model_SGD.predict(scaled_x_test)
y_pred_OLS[:5]

array([16.5653963 , 21.18822792, 21.55107058, 10.88923816, 22.20231988])

In [28]:
y_pred_SGD[:5]

array([16.58997155, 20.80677227, 21.10252838, 11.31979109, 21.38987547])

In [29]:
y_test[:5].values

array([16.9, 22.4, 21.4,  7.3, 24.7])

In [30]:
mae_OLS = mean_absolute_error(y_test, y_pred_OLS)
mae_SGD = mean_absolute_error(y_test, y_pred_SGD)

mae_OLS = mean_squared_error(y_test, y_pred_OLS)
mae_SGD = mean_squared_error(y_test, y_pred_SGD)

rmse_OLS = np.sqrt(mae_OLS)
rmse_SGD = np.sqrt(mae_SGD)

print(f"{mae_OLS=:.4f} \t\t {mae_OLS=:.4f} \t {rmse_OLS=:.4f}")
print(f"{mae_SGD=:.4f} \t\t {mae_SGD=:.4f} \t {rmse_SGD=:.4f}")



mae_OLS=3.7968 		 mae_OLS=3.7968 	 rmse_OLS=1.9485
mae_SGD=4.0867 		 mae_SGD=4.0867 	 rmse_SGD=2.0216
