In [49]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

def generate_data():
    np.random.seed(33)
    x = np.random.uniform(0, 1, 10)
    x.sort()
    noise = np.random.normal(0, np.sqrt(0.25), 10)
    y = x**2 + 0.1 * x + noise
    return x, y

def trig_func(x, m):
    Xtri = np.column_stack([np.cos(2 * np.pi * i * x) for i in range(1, m + 1)])
    Xtri = np.column_stack([np.ones(x.shape), Xtri])
    return Xtri

def poly_func(x, m):
    Xpoly = np.column_stack([x**i for i in range(m + 1)])
    Xpoly = np.column_stack([np.ones(x.shape), Xpoly])
    return Xpoly

def schwarz_criterion(DoF, n, Remp):
    p = DoF / n
    return (1 + p * ((1-p) ** -1) * np.log(n)) * Remp

def calculate_nrms(y_true, y_pred, normalization="mean"):
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = np.sqrt(mse)
    if normalization == "range":
        scale = np.max(y_true) - np.min(y_true)
    elif normalization == "mean":
        scale = np.mean(y_true)
    elif normalization == "std":
        scale = np.std(y_true)
    else:
        raise ValueError("Unknown normalization method")
    nrms = rmse / scale
    return nrms


def train(x, y, func):
    kf = KFold(n_splits=5)
  
    folds = range(0,5)
    opt_ms = []
    pred_accs = []
    nrms = []

    for j, (train_index, test_index) in enumerate(kf.split(x)):
        X_train, X_test = x[train_index], y[test_index]
        y_train, y_test = y[train_index], y[test_index]
        complexities = range(1, 7)

        Est_r = []
        emps = []

        for m in complexities:
            model = LinearRegression()
            model.fit(func(X_train, m), y_train)
            y_pred = model.predict(func(X_train, m))
            emp =  np.mean((y_train - y_pred) ** 2)
            emps.append(emp)
            Est_r.append(schwarz_criterion(m+1, sample_size, emp))
        
        Est_r_formatted = ', '.join([f"{value:.6f}" for value in Est_r])
        emps_formatted = ', '.join([f"{value:.6f}" for value in emps])
        print(f"Fold {j+1}: Estimated Errors (Rs)  -> {Est_r_formatted}")
        print(f"Fold {j+1}: Mse  -> {emps_formatted}")
    
        opt_m_index = np.argmin(Est_r)
        opt_m = opt_m_index + 1
        print(f"opt m = {opt_m}")
        opt_ms.append(opt_m)

        opt_model = LinearRegression()
        opt_model.fit(func(X_train, opt_m ), y_train)
        y_pred_opt = opt_model.predict(func(X_test, opt_m))
        mse = np.mean((y_test - y_pred_opt) ** 2)
        nrm = calculate_nrms(y_test, y_pred_opt)
        nrms.append(nrm)
        pred_accs.append(mse)
        # print(mse)
        
    pd.set_option('display.float_format', '{:.6f}'.format)
    results_df = pd.DataFrame({'fold': folds,
                                'optimal m': opt_ms,
                                'Prediction accuracy(MSE)': pred_accs,
                                'NRMS': nrms})
    print(np.mean(pred_accs))
    
    print(results_df)
            
if __name__ == "__main__":
    sample_size = 8

    x, y = generate_data()
    train(x, y, trig_func)
    print("\n\n\n")
    train(x, y, poly_func)

Fold 1: Estimated Errors (Rs)  -> 0.474718, 0.238205, 0.316756, 0.451414, 0.341583, 0.578385
Fold 1: Mse  -> 0.280376, 0.105979, 0.102862, 0.101084, 0.047191, 0.037181
opt m = 2
Fold 2: Estimated Errors (Rs)  -> 0.496962, 0.554809, 0.705598, 0.892777, 1.398401, 0.044841
Fold 2: Mse  -> 0.293514, 0.246838, 0.229132, 0.199917, 0.193194, 0.002883
opt m = 6
Fold 3: Estimated Errors (Rs)  -> 0.292626, 0.350852, 0.433446, 0.628226, 0.912884, 0.340425
Fold 3: Mse  -> 0.172830, 0.156096, 0.140755, 0.140677, 0.126118, 0.021884
opt m = 1
Fold 4: Estimated Errors (Rs)  -> 0.411378, 0.368631, 0.441462, 0.634569, 0.937692, 0.650529
Fold 4: Mse  -> 0.242967, 0.164006, 0.143358, 0.142097, 0.129546, 0.041818
opt m = 2
Fold 5: Estimated Errors (Rs)  -> 0.237259, 0.284802, 0.294948, 0.373933, 0.338964, 0.505308
Fold 5: Mse  -> 0.140129, 0.126710, 0.095780, 0.083734, 0.046829, 0.032483
opt m = 1
2.3181972769483083
   fold  optimal m  Prediction accuracy(MSE)       NRMS
0     0          2                 