In [None]:
import os
# Switch the working directory to the current user's notebook folder on scratch.
user = os.environ.get('USER')
os.chdir('/scratch/cd82/{}/notebooks'.format(user))

## Linear Regression - Multiple Linear Regression (MLR)
In this section:
- We look at models with more than a single independent/predictor variable.
- We will use a train-test split of the data so as to test our model.
- We also look at how to select the best predictor variables using t-test scores and their p-values
- We will look at automated methods of reducing the number of predictor variables using *Regularisation*.

#### The multiple linear regression equation:

<div style="border: 0px solid black; padding: 2px; margin: 0;">
$$
{y} = {\beta}_{0} + {x}_{1} {\beta}_{1} + {x}_{2} {\beta}_{2}  + ... + {x}_{n} {\beta}_{n}  + {\epsilon}
$$
</div>

Where:
- ${y}$ is the dependent variable that we're trying to predict
- ${x}_{1}$, ${x}_{2}$, ..., ${x}_{n}$ are the independent variables
- ${\beta}_{0}$ is the y-intercept (bias)
- ${\beta}_{1}$, ${\beta}_{2}$, ..., ${\beta}_{n}$ are the coefficients (weights) for each feature
- ${\epsilon}$ is the error term
<br>
**In matrix form**:

<div style="border: 0px solid black; padding: 2px; margin: 0;">
$$
\mathbf{y} = \mathbf{X} \boldsymbol{\beta} + \boldsymbol{\epsilon}
$$
</div>
    
$\mathbf{y}$ &emsp;&emsp;A (column) vector of responses<br>
$\mathbf{X}$&emsp;&emsp; A matrix of independent variables  
                          (row for each sample, column for each independent variable)<br>
$\boldsymbol{\beta}$&emsp;&emsp; A vector of coefficients.<br>
 
#### Equation to solve for ${\beta}$:
<div style="border: 0px solid black; padding: 2px; margin: 0;">
$$
\boldsymbol{\beta} = (\mathbf{X}^T \mathbf{X})^{-1}\mathbf{X}^T \mathbf{y}
$$
</div>

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt

### Generate data
Again, our example is going to use synthetic data i.e. data derived from a known distribution.  
This time we will use the Scikit-Learn function ```make_regression```

In [None]:
from sklearn.datasets import make_regression

# Synthetic regression problem: 200 samples and 10 features, of which only 5
# are informative, with noise=5 and a true intercept (bias) of 100.
X, y = make_regression(n_samples=200, n_features=10, n_informative=5,
                       noise=5, bias=100.0, random_state=42)

# Add a constant column so that statsmodels can estimate the intercept.
X_int = sm.add_constant(X)

# One box-and-whisker per column of the design matrix (constant included).
X_df = pd.DataFrame(X_int)

fig, ax = plt.subplots(figsize=(8, 4))
X_df.boxplot(ax=ax)
ax.set_title('Synthetic MLR data')
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('Values')
ax.grid(True)
plt.show()


##### Plot the Y data

In [None]:
# Wrap the response vector in a single-column frame so pandas can plot it.
y_df = pd.DataFrame({'y': y})

# Box-and-whisker plot of the response values.
fig, ax = plt.subplots(figsize=(4, 4))
y_df.boxplot(ax=ax)
ax.set_title('Y data')
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('Values')
ax.grid(True)
plt.show()

#### Splitting our input data
We will split our dataset into a *training* set and a *testing* set.
- This helps prevent our models from being 'over-fit' by the training data.
- The held-out (test) data means we can validate our model with data that is external to the data the model was trained on.
- It helps to produce models that are transferable to new data.

Over-fitting is not such a problem in linear regression methods; however, it becomes more important in more complex machine learning methods.

In [None]:
# Hold back 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_int, y, test_size=0.2, random_state=42)

# Report the size of the held-out portion.
for _label, _held_out in (('X_test', X_test), ('y_test', y_test)):
    print(f'{_label} shape: ', _held_out.shape)

#### MLR regression using ```statsmodels```

In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

# Ordinary least squares on the training split (X_train already carries the constant column).
model = sm.OLS(endog=y_train, exog=X_train)
results_mlr = model.fit()

# R-squared of the fit on the training data, pulled out of the results object.
r_squared = results_mlr.rsquared
print('R sqrd (extracted):', r_squared)

# Full regression table: coefficients, standard errors, t-statistics, p-values, etc.
print(results_mlr.summary())


N.B. High absolute t-statistics and small p-values indicate the predictor variable is significant - i.e. the variable is contributing to the predictive power of the model.

In [None]:
# Predict the held-out responses with the fitted OLS model.
y_pred_mlr = results_mlr.predict(exog=X_test)

In [None]:
# Evaluate the model on the held-out test data.
# The mean of the *test* responses gives a sense of scale for the error figures
# below (it was previously computed from y_train by mistake).
y_test_ave = np.average(y_test)
mse = mean_squared_error(y_test, y_pred_mlr)
# The square root of the MSE is the root mean squared error, in the units of y.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_mlr)

print(f"Average Y value: {y_test_ave}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")

## Visualisation

Slightly modified the above code to add `residuals` and model `coefficients`.

In [None]:
# Test-set residuals: observed minus predicted.
residuals = y_test - y_pred_mlr

# Test-set error metrics.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_mlr)
r2 = r2_score(y_true=y_test, y_pred=y_pred_mlr)

# Tabulate each fitted coefficient next to its p-value.
coefficients = results_mlr.params
summary_table = pd.DataFrame(
    {'Coefficient': coefficients, 'P-Value': results_mlr.pvalues}
)
print(summary_table)

In [None]:
# Two stacked panels: coefficient bar chart (top) and residual plot (bottom).
fig, axs = plt.subplots(2, 1, figsize=(8, 10))

# --- Coefficients plot ---
bars = axs[0].bar(summary_table.index, summary_table['Coefficient'], color='skyblue')

# Annotate every bar with its p-value. The label sits just beyond the tip of
# the bar: above positive bars, below negative ones, so it never overlaps the bar.
for bar, p_value in zip(bars, summary_table['P-Value']):
    height = bar.get_height()
    axs[0].text(bar.get_x() + bar.get_width() / 2, height,
                f'{p_value:.2e}', ha='center',
                va='bottom' if height >= 0 else 'top')

# Title and labels are set once (earlier duplicate calls were being overwritten).
axs[0].set_title('Coefficients with P-Values')
axs[0].set_xlabel('Features')
axs[0].set_ylabel('Coefficients')

# --- Residuals vs Predictions plot ---
axs[1].scatter(y_pred_mlr, residuals)
# Zero reference line: residuals should scatter randomly around it.
axs[1].axhline(0, color='grey', linestyle='--', linewidth=1)

axs[1].set_title('Residuals vs Predictions')
axs[1].set_xlabel('Predictions')
axs[1].set_ylabel('Residuals')

# Show the plots
plt.tight_layout()
plt.show()


The first plot is a bar chart of feature coefficients. This gives a good view of the effect each feature has on the prediction. A feature whose coefficient has a larger absolute value has a larger effect than a feature whose coefficient is closer to zero (provided the features are on comparable scales).

The second plot is a scatter plot of residuals versus predictions. Ideally, the residuals should be randomly spread around the centerline. If there are any patterns in the residuals, it signals that the model could be improved by including non-linear terms or interaction terms.

###  Regularisation
Regularisation methods use differing penalisation functions, based around 'distance' calculations of the coefficients.

#### Vector norms
Some common distance metrics (vector norms) are listed here:
<div style="border: 0px solid black; padding: 2px; margin: 0;">
L1 norm (Manhattan distance or Taxicab distance): 
$$
L_1 = ||\mathbf{v}||_1 =  \sum_{i=1}^{n} |v_i|
$$
</div>

<div style="border: 0px solid black; padding: 2px; margin: 0;">
L2 norm (Euclidean norm): 
$$
L_2 = ||\mathbf{v}||_2 =  \sqrt{\sum_{i=1}^{n} v_i^2}
$$
</div>

<div style="border: 0px solid black; padding: 2px; margin: 0;">  
L infinity norm:
$$
L_\infty = ||\mathbf{v}||_\infty =  \max_i|v_i|
$$

</div>

## Lasso (*L1*) Regression
Cost Function = RSS + $\alpha$ × (sum of absolute values of coefficients)
$$
\text{Cost Function} = \sum_{i=1}^{n}(y_i - \hat{y}_i )^2 + \alpha L_1
$$

Where:
- $\alpha$ (alpha) is the regularization parameter.

A higher value of $\alpha$ increases the penalty on large coefficients.  
  
Lasso regression has no closed-form solution, so it needs to be solved using an iterative (gradient-descent-type) algorithm.


```python
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])
```


```python
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(penalty="l1")
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])
```

## Ridge (*L2*) Regression
Cost Function = RSS + $\alpha$ × (sum of squared values of coefficients, i.e. $L_2^2$)   
(N.B. the removal of the square root)  

$$
\text{Cost Function} = \sum_{i=1}^{n}(y_i - \hat{y}_i )^2 + \alpha L_2^2
$$
Where:
- $\alpha$ (alpha) is the regularization parameter

A higher value of $\alpha$ increases the penalty on large coefficients.

Ridge regression has a closed form.
To solve for ${\beta}$:
<div style="border: 0px solid black; padding: 2px; margin: 0;">
$$
\boldsymbol{\beta} = (\mathbf{X}^T \mathbf{X} + \alpha \mathbf{A})^{-1} \mathbf{X}^T \mathbf{y}
$$
</div>
  
<div style="border: 0px solid black; padding: 2px; margin: 0;">
$$
\mathbf{A} = \begin{pmatrix}
0 & 0 & 0 & \cdots & 0 \\
0 & 1 & 0 & \cdots & 0 \\
0 & 0 & 1 & \cdots & 0 \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
0 & 0 & 0 & \cdots & 1
\end{pmatrix}
$$
</div>
In $\mathbf{A}$ the top left entry is 0 so we do not penalise the offset (intercept) value.
  
```python
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])
```

```python
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(penalty="l2")
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])


```
## Elasticnet (*L1 & L2*) Regression
$$
\text{Cost Function} = \sum_{i=1}^{n}(y_i - \hat{y}_i )^2 + {r}\alpha L_1 + {(1 - r)}\alpha L_2^2
$$

In long form: 
$$
\text{Cost Function} = \sum_{i=1}^{n}(y_i - \hat{y}_i )^2 + {r}\alpha \sum_{j=1}^{p} |\beta_j| + {(1 - r)}\alpha \sum_{j=1}^{p} \beta_j^2
$$
   
Where:
- ${r}$ is the mix ratio of $L_1$ to $L_2$
- ${p}$ is the number of coefficients ${\beta}_{j}$
  
```python
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])

# or using statsmodels
import statsmodels.api as sm
# L1_wt == 1.0 then it is Lasso
# L1_wt == 0.0 then it is Ridge regression
lasso_model = sm.OLS(y, X).fit_regularized(method='elastic_net', 
                                         alpha=0.1, L1_wt=1.0)

ridge_model = sm.OLS(y, X).fit_regularized(method='elastic_net', 
                                         alpha=0.1, L1_wt=0.0)
 ```
  

#### Selection of coefficients in the original dataset using Lasso

In [None]:
# L1-penalised (Lasso) fit via statsmodels' elastic-net solver (L1_wt=1.0 -> pure Lasso).
# A scalar alpha would apply the same penalty to every column of X_train,
# including column 0, the constant added by sm.add_constant. The intercept
# should not be shrunk, so pass one penalty weight per coefficient with a
# weight of 0 for the constant and 1.0 for every real predictor.
lasso_alpha = np.full(X_train.shape[1], 1.0)
lasso_alpha[0] = 0.0

lasso_model = sm.OLS(y_train, X_train).fit_regularized(method='elastic_net',
                                                       alpha=lasso_alpha, L1_wt=1.0)

# Count how many coefficients survived the L1 penalty (this includes the intercept).
non_zero_coefficients = np.sum(lasso_model.params != 0)
print(f"Number of non-zero coefficients: {non_zero_coefficients}")


In [None]:
# Score the Lasso fit on the held-out test split.
y_test_pred = lasso_model.predict(X_test)

# Compute both metrics against the same (truth, prediction) pair.
mse, r2 = (metric(y_test, y_test_pred) for metric in (mean_squared_error, r2_score))
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


In [None]:
# Column positions whose Lasso coefficient was not shrunk to exactly zero.
non_zero_indices = np.flatnonzero(lasso_model.params)

# Keep only those columns of the training design matrix.
selected_predictors = X_train[:, non_zero_indices]

# Plain (unpenalised) OLS on the reduced set of predictors.
refit_model = sm.OLS(y_train, selected_predictors).fit()

print("Selected predictors:", non_zero_indices)
print("Refitted model summary:", refit_model.summary())


#### Finding the best values in the regularisation
As we now have various hyperparameters to choose, the modelling task gets a little more laborious. That is where GridSearchCV comes in.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

# Candidate regularisation strengths to cross-validate over.
param_grid = {
    'alpha': [0.1, 0.25, 0.5, 0.75, 1.0, 5.0, 10.0, 20.0],
}

# Initialize the Lasso model
lasso = Lasso()

# 5-fold cross-validated grid search, scored by R^2.
grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid,
                           cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score (r^2): ", grid_search.best_score_)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the whole training set with the best alpha; no extra fit is needed.
best_lasso = grid_search.best_estimator_

# Report which coefficients of *this* model are non-zero (previously the indices
# were taken from the unrelated statsmodels `lasso_model` by mistake).
# Indices refer to columns of X_train; column 0 is the constant from
# sm.add_constant, while sklearn's Lasso fits its own intercept (best_lasso.intercept_).
coefficients = best_lasso.coef_
non_zero_indices = np.where(coefficients != 0)[0]
print("non_zero_indices coefficients:", non_zero_indices)


# Make predictions on the test set
y_pred_gridcv = best_lasso.predict(X_test)

# print("Predictions on test data: ", y_pred_gridcv)
