## Step 0: Tiny Dataset

In [1]:
import numpy as np

In [2]:
# Toy regression dataset: six samples, one feature each.

# Feature matrix: one row per sample, a single feature column.
X = np.array([[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]])

# Observed targets (y_actual): roughly y = 2x with a little noise added.
y = np.array([2.2, 3.9, 6.1, 7.8, 10.2, 11.7])

In [10]:
# Sanity-check the layouts: X is 2-D (one row per sample, one feature column)
# and y is 1-D. The descriptive text is hard-coded for this 6-sample dataset.
print(X.shape, "2D matrix : 6 rows, 1 column")
print(y.shape, "  1D vector : 6 elements")

(6, 1) 2D matrix : 6 rows, 1 column
(6,)   1D vector : 6 elements


### DataFrame

In [11]:
import pandas as pd

In [25]:
# Place the targets next to the features as a second column.
# column_stack promotes the 1-D y to a column itself, so no manual reshape is needed.
data = np.column_stack((X, y))
print(data)

[[ 1.   2.2]
 [ 2.   3.9]
 [ 3.   6.1]
 [ 4.   7.8]
 [ 5.  10.2]
 [ 6.  11.7]]


In [21]:
# Preview y as a single column: -1 lets numpy infer the row count, giving shape (6, 1).
y.reshape(-1, 1)

array([[ 2.2],
       [ 3.9],
       [ 6.1],
       [ 7.8],
       [10.2],
       [11.7]])

In [26]:
# Wrap the stacked array in a DataFrame with named columns for readable display.
df = pd.DataFrame(data, columns=["X", "y"])
df.head()  # shows the first five rows only

Unnamed: 0,X,y
0,1.0,2.2
1,2.0,3.9
2,3.0,6.1
3,4.0,7.8
4,5.0,10.2


## Step 1: Linear Regression

### Linear Regression model

In [40]:
from sklearn.linear_model import LinearRegression

In [41]:
# Linear regression estimator with default settings; the k-fold loop below
# refits this same instance on every training split.
model = LinearRegression()

### Mean squared error (MSE)

In [51]:
from sklearn.metrics import mean_squared_error

## Step 2: Make k folds (indices only, optionally shuffled)

In [47]:
def make_kfolds_indices(n_samples, k = 3, shuffle=False, seed = 42):
    """Partition the sample indices ``0 .. n_samples - 1`` into ``k`` folds.

    Parameters
    ----------
    n_samples : int
        Total number of samples to split.
    k : int, default 3
        Number of folds to produce. Must be at least 1.
    shuffle : bool, default False
        If True, randomly permute the indices before splitting; otherwise the
        folds are consecutive runs of the original order.
    seed : int, default 42
        Seed for the random generator. Only used when ``shuffle`` is True, so
        a shuffled split is reproducible.

    Returns
    -------
    list of numpy.ndarray
        ``k`` index arrays that together contain every index exactly once.
        When ``n_samples`` is not a multiple of ``k``, the first
        ``n_samples % k`` folds hold one extra index.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    indices_arr = np.arange(n_samples)

    if shuffle:
        # Seeded generator keeps the shuffled split reproducible across runs.
        rng = np.random.default_rng(seed)
        rng.shuffle(indices_arr)

    # array_split (unlike split) tolerates n_samples % k != 0 by making the
    # leading folds one element longer, giving the "nearly equal" chunks.
    kfolds = np.array_split(indices_arr, k)

    return kfolds

In [49]:
# Number of folds used for the cross-validation below.
k = 3

In [62]:
# Build the folds in original sample order (no shuffling) so the splits are
# easy to follow by eye.
kfold_indices = make_kfolds_indices(len(y), k = k, shuffle = False)
print(kfold_indices)

[array([0, 1]), array([2, 3]), array([4, 5])]


### K-fold loop

In [94]:
# Display the feature matrix that the fold indices will select rows from.
X

array([[1.],
       [2.],
       [3.],
       [4.],
       [5.],
       [6.]])

In [125]:
# K-fold cross-validation: each fold is held out once for validation while the
# model is refit on all remaining folds.
kfolds_mse_array = []  # validation MSE of each fold, in fold order

for i in range(k):
    # Fold i is the hold-out set for this round.
    val_indices = kfold_indices[i]
    print("Validation indices : ", val_indices)

    # Every other fold contributes to the training set.
    train_folds = [kfold_indices[j] for j in range(k) if j != i]
    train_indices = np.concatenate(train_folds)
    print("Train indices : ", train_indices)

    # Keep the original row positions as the index so the printed frames show
    # which samples landed in each split.
    df_train = pd.DataFrame(
        {"X": X[train_indices].flatten(), "y": y[train_indices]},
        index=train_indices,
    )
    df_val = pd.DataFrame(
        {"X": X[val_indices].flatten(), "y": y[val_indices]},
        index=val_indices,
    )

    print("\ndf train \n", df_train.head())
    print("\ndf val \n", df_val.head())

    # Fit on the training folds only, then predict the held-out fold.
    model.fit(df_train[["X"]], df_train["y"])
    y_actual = df_val["y"]
    y_pred = model.predict(df_val[["X"]])

    # Score this fold and record it.
    mse_score = mean_squared_error(y_actual, y_pred)
    print(f"MSE SCORE = {mse_score}")
    kfolds_mse_array.append(mse_score)

    print("\n\n")


Validation indices :  [0 1]
Train indices :  [2 3 4 5]

df train 
      X     y
2  3.0   6.1
3  4.0   7.8
4  5.0  10.2
5  6.0  11.7

df val 
      X    y
0  1.0  2.2
1  2.0  3.9
MSE SCORE = 0.0316999999999996



Validation indices :  [2 3]
Train indices :  [0 1 4 5]

df train 
      X     y
0  1.0   2.2
1  2.0   3.9
4  5.0  10.2
5  6.0  11.7

df val 
      X    y
2  3.0  6.1
3  4.0  7.8
MSE SCORE = 0.018494809688581252



Validation indices :  [4 5]
Train indices :  [0 1 2 3]

df train 
      X    y
0  1.0  2.2
1  2.0  3.9
2  3.0  6.1
3  4.0  7.8

df val 
      X     y
4  5.0  10.2
5  6.0  11.7
MSE SCORE = 0.10249999999999874





In [118]:
# Per-fold validation MSE values collected by the k-fold loop.
print(kfolds_mse_array)

[0.0316999999999996, 0.018494809688581252, 0.10249999999999874]


In [121]:
# Cross-validation estimate: the mean of the per-fold validation MSEs.
avg_kfolds_mse = np.mean(kfolds_mse_array)
print(avg_kfolds_mse)

0.0508982698961932
