In [None]:
import os
# Login name taken from the USER environment variable.
# NOTE(review): os.getenv returns None when USER is unset, in which case the
# path below contains the literal text 'None'.
user = os.getenv('USER')
# Make this user's notebooks directory under /scratch/cd82 the working directory.
os.chdir(f'/scratch/cd82/{user}/notebooks')

## Linear Regression - Cross Validation
Cross validation is a method for testing a model using all of the available data. It repeatedly splits the data into a training set and a test set, fits a model on each training set, and produces test metrics for each of those models. The test metrics are then used to assess the quality of the model and to confirm (or otherwise) that the result does not depend on which samples were chosen for training. 
  
There are many different ways to split the data and Scikit-Learn has a broad range of methods. These include:
- KFold
- GroupKFold
- ShuffleSplit
- StratifiedKFold
- StratifiedGroupKFold
- GroupShuffleSplit
- StratifiedShuffleSplit
- TimeSeriesSplit

An example of how k-fold cross validation (with k = 5) can split a data set is shown here:
  
<img src="./grid_search_cross_validation.png" alt="Sample Image" width="800" height="600">


ref:
[sklearn.model_selection.cross_validate.html](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# --- Synthetic data: a straight line with Gaussian noise ---
N = 30  # sample count

# Reproducible randomness: seed sequence -> generator instance
seed_seq = np.random.SeedSequence(42)
rng = np.random.default_rng(seed_seq)

# Standard-normal noise, drawn as N values and stored as an (N, 1) column
rndg = rng.normal(loc=0.0, scale=1, size=N).reshape((N, 1))
print('rndg shape: ', rndg.shape)

# X: N evenly spaced values from start_x to start_x + range_x, as a column
start_x = 2.0
range_x = 2.0
X = np.linspace(start_x, start_x + range_x, num=N).reshape((N, 1))
print('X shape: ', X.shape)

pc_rand = 0.75  # multiplier applied to the noise that is added to y

# y: line with slope slope_y, plus the scaled noise
offset_y = 6.0
slope_y = 4.0
add_offset = (start_x * slope_y) + offset_y  # value of the line at X == start_x

y = add_offset + slope_y * (X - start_x)   # noise-free line
y = y + (pc_rand * rndg)                   # add the scaled noise
print('y shape: ',y.shape)



#### Choices for test metrics
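The models fitted on each fold during cross validation can be scored using different metrics. Error metrics use a `neg_` prefix because scikit-learn negates them so that a higher score is always better. Choices are: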
|Metric|`scoring` value|
|-------------------------------------|------------------------------|
|$R^2$                                 | 'r2'                         |
|Mean Absolute Error (MAE)            | 'neg_mean_absolute_error'    |
|Mean Squared Error (MSE)             | 'neg_mean_squared_error'     |
|Root Mean Squared Error (RMSE)       | 'neg_root_mean_squared_error'|
|Mean Absolute Percentage Error (MAPE)| 'neg_mean_absolute_percentage_error'|

In [None]:
# Score a LinearRegression with cross-validation, using R^2 as the metric.
# cv=5 asks for 5 folds; per the scikit-learn docs an integer cv uses K-fold
# splitting without shuffling for a regressor.
model = LinearRegression()
scores = cross_val_score(model, X,y, cv=5, scoring='r2')

# `scores` holds one R^2 value per fold
print("Cross-validation scores: ",scores)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# The shuffle parameter is important if data has some internal structure.
# random_state fixes the shuffle so the folds are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Unshuffled alternative, kept for experimentation:
# kf = KFold(n_splits=5, shuffle=False)

print(kf)

# Show which sample indices are in the train and test split of each fold
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

# Fit and score the model on every fold, collecting both metrics for the
# training split and the test split.
results = cross_validate(model, X, y, cv=kf,
                         scoring=('r2', 'neg_mean_squared_error'),
                         return_train_score=True)

print("Train r^2 scores:", results['train_r2'])
print("Test r^2 scores:", results['test_r2'])

# The 'neg_mean_squared_error' scorer returns the NEGATED mean squared error
# (so that larger is better). Flip the sign back so the numbers printed under
# the "mse" label are actual, non-negative MSE values.
print("Train mse scores:", -results['train_neg_mean_squared_error'])
print("Test mse scores:", -results['test_neg_mean_squared_error'])


#### Respect groupings in the dataset
It is possible to respect groupings of samples in cross-validation by using the `GroupKFold` cross-validator, which ensures that the same group never appears in both the training set and the test set of a fold.

In [None]:
from sklearn.model_selection import GroupKFold

# Assign every sample to a group: each run of 5 consecutive samples shares
# one group id (0, 0, 0, 0, 0, 1, 1, ...).
groups = [i // 5 for i in range(N)]
print('Groups = ', groups)
# NOTE(review): the shuffle / random_state arguments of GroupKFold are only
# accepted by recent scikit-learn releases (believed to be 1.6 and later) —
# confirm against the installed version.
group_kfold = GroupKFold(n_splits=4, shuffle=True, random_state=42)

# Perform cross-validation; `groups` is handed to the splitter so that it can
# split by group rather than by individual sample.
results = cross_validate(model, X, y, cv=group_kfold, 
                         groups=groups,
                         scoring=('r2', 'neg_mean_squared_error'),
                         return_train_score=True)

# Show which sample indices are in the train and test split of each fold
for i, (train_index, test_index) in enumerate(group_kfold.split(X, groups=groups)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

# Only the R^2 scores are printed; the negated-MSE scores requested above are
# also available in `results`.
print("Train scores:", results['train_r2'])
print("Test scores:", results['test_r2'])



#### Detect outliers in the data


In [None]:
from sklearn.model_selection import cross_val_predict
from scipy.stats import zscore

# Plant an artificial outlier: overwrite sample 17 with sample 16's value + 3.0.
# NOTE: this changes y in place, so every cell run after this one sees the
# modified data.
y[17] = y[16] + 3.0

# Unshuffled alternative, kept for experimentation:
# kf = KFold(n_splits=5, shuffle=False, random_state=None )
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Cross-validated predictions: per the cross_val_predict documentation, each
# sample is predicted by a model fitted on the folds that do not contain it.
y_pred = cross_val_predict(model, X, y, cv=kf)

# Show which sample indices are in the train and test split of each fold
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

# Calculate residuals (observed value minus cross-validated prediction)
residuals = y - y_pred

# Detect outliers using z-score (standardised by mean and sd)
z_scores = zscore(residuals)

# This test finds data points that are more than 3 standard 
# deviations from the mean indicating they are probable outliers.
# [0] keeps the first index array returned by np.where (the row positions).
outliers = np.where(np.abs(z_scores) > 3)[0]

print("Outliers detected at indices:", outliers)


In [None]:
# Evaluate the model
# Compare the observed y with y_pred, the cross_val_predict() output computed
# in the outlier-detection cell.
# NOTE(review): both metrics are computed once over all pooled predictions,
# which is not the same calculation as averaging the per-fold scores reported
# by cross_validate().
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Fit one LinearRegression on the complete data set (no cross-validation).
# If the cells were run in order, y already contains the modified y[17] value
# set in the outlier-detection cell.
model_skl = LinearRegression()
model_skl.fit(X, y)

In [None]:
# Predictions from the model that was fitted on the complete data set
y_pred_skl = model_skl.predict(X)

# Draw everything through the current Axes explicitly instead of the pyplot
# state-machine helpers; artists are added in the same order as before.
ax = plt.gca()

# Fit on the full data set
ax.plot(X, y_pred_skl,
        color='red', linewidth=1,
        label='Regression line (SKLearn)')

# Observed samples
ax.scatter(X, y, s=10, color='blue', label='Actual data')

# y_pred comes from cross_val_predict(), so this line joins the
# cross-validated prediction of each sample in X order.
ax.plot(X, y_pred,
        color='black', linewidth=1,
        label='Regression line (cross_val_predict())')

# Samples flagged by the z-score test, drawn over the blue points
ax.scatter(X[outliers], y[outliers], s=10, color='red', label='Outlier data')

ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Simple Linear Regression')
ax.legend()
plt.show()