In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [82]:
relevant_cols = ['YearStart','YearEnd','Data_Value','Low_Confidence_Limit','High_Confidence_Limit','Sample_Size']

df = pd.read_csv('Nutrition.csv')
df = df[relevant_cols]
df = df.dropna()

df.head()

Unnamed: 0,YearStart,YearEnd,Data_Value,Low_Confidence_Limit,High_Confidence_Limit,Sample_Size
0,2011,2011,32.0,30.5,33.5,7304.0
1,2011,2011,32.3,29.9,34.7,2581.0
2,2011,2011,31.8,30.0,33.6,4723.0
3,2011,2011,33.6,29.9,37.6,1153.0
4,2011,2011,32.8,30.2,35.6,2402.0


In [83]:
from sklearn.preprocessing import StandardScaler

Y = df['Data_Value']
X = df.drop(['Data_Value'], axis=1)

scaler = StandardScaler()

X = scaler.fit_transform(X)

#### 🎢 Ridge Regression with hyperparameter tuning

In [84]:
from sklearn.linear_model import Ridge

# Learning Rate as hyperparameter
params = [{'alpha': [0.01, 0.1, 0.5, 1, 5, 10, 50, 100]}]

grid = GridSearchCV(Ridge(), params, cv=5, scoring='r2')
grid.fit(X, Y)

results = []

for param in grid.cv_results_['params']:
    ridge = Ridge(alpha=param['alpha'])
    ridge.fit(X, Y)

    Y_pred = ridge.predict(X)

    # Metrics
    mse = mean_squared_error(Y, Y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(Y, Y_pred)
    r2 = r2_score(Y, Y_pred)
    r2_adj = 1 - (1 - r2) * (len(Y) - 1) / (len(Y) - X.shape[1] - 1)

    results.append({
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'r2_adj': r2_adj
    })

result_df = pd.DataFrame(results)
result_df

Unnamed: 0,mse,rmse,mae,r2,r2_adj
0,0.127853,0.357565,0.189465,0.998782,0.998782
1,0.127853,0.357565,0.189461,0.998782,0.998782
2,0.127853,0.357565,0.189444,0.998782,0.998782
3,0.127853,0.357565,0.189422,0.998782,0.998782
4,0.127853,0.357565,0.189252,0.998782,0.998782
5,0.127854,0.357567,0.189039,0.998782,0.998782
6,0.127889,0.357615,0.187363,0.998782,0.998782
7,0.127997,0.357767,0.185336,0.998781,0.998781


### 🏇 Lasso Regression

In [85]:
from sklearn.linear_model import Lasso

# Learning Rate as hyperparameter
params = [{'alpha': [0.01, 0.1, 0.5, 1, 5, 10, 50, 100]}]

grid = GridSearchCV(Lasso(), params, cv=5, scoring='r2')
grid.fit(X, Y)

results = []

for param in grid.cv_results_['params']:
    lasso = Lasso(alpha=param['alpha'])
    lasso.fit(X, Y)

    Y_pred = lasso.predict(X)

    # Metrics
    mse = mean_squared_error(Y, Y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(Y, Y_pred)
    r2 = r2_score(Y, Y_pred)
    r2_adj = 1 - (1 - r2) * (len(Y) - 1) / (len(Y) - X.shape[1] - 1)

    results.append({
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2,
        'r2_adj': r2_adj
    })

result_df = pd.DataFrame(results)
result_df

Unnamed: 0,mse,rmse,mae,r2,r2_adj
0,0.128103,0.357914,0.186526,0.99878,0.99878
1,0.139721,0.373792,0.171366,0.998669,0.998669
2,0.405515,0.636801,0.42816,0.996138,0.996138
3,1.23647,1.111967,0.833293,0.988224,0.988223
4,27.831921,5.275597,4.120034,0.734933,0.734905
5,104.999513,10.246927,8.009558,0.0,-0.000103
6,104.999513,10.246927,8.009558,0.0,-0.000103
7,104.999513,10.246927,8.009558,0.0,-0.000103


### 🍊 Linear Regression

In [86]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test =  train_test_split(X, Y, test_size=0.3, random_state=4)

reg = LinearRegression()
reg.fit(x_train, y_train)

y_pred = reg.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
r2_adj = 1 - (1 - r2) * (len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1)
rmse = np.sqrt(mse)

result_df = pd.DataFrame([{
    'mse': mse,
    'rmse': rmse,
    'mae': mae,
    'r2': r2,
    'r2_adj': r2_adj
}]);
result_df

Unnamed: 0,mse,rmse,mae,r2,r2_adj
0,0.133454,0.365313,0.190558,0.998735,0.998735
