In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from statsmodels.nonparametric.smoothers_lowess import lowess

Read data

In [2]:
# Load the dataset from a CSV file resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer="hw1.EX2.csv")

In [3]:
# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,X,Y
0,84.87882,37.9
1,306.5947,42.2
2,561.9845,47.3
3,561.9845,54.8
4,390.5684,43.1


In [4]:
# Single predictor column "X" and response column "Y", both kept as pandas Series.
X, y = data["X"], data["Y"]

Split the data into train (60%), dev (20%) and test (20%) sets

In [6]:
# Two-stage split with a fixed seed so the partition is reproducible:
#   1) hold out 20% of all rows as the test set;
#   2) hold out 25% of the remainder (= 20% of all rows) as the dev set.
split_seed = 42

X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=split_seed,
)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev,
    y_train_dev,
    test_size=0.25,
    random_state=split_seed,
)

Polynomial regression: search for the best polynomial degree

In [12]:
# Search polynomial degrees 1..20 and keep the pipeline with the lowest
# mean squared error on the dev set.

# scikit-learn estimators need 2-D inputs; build the column vectors once
# instead of on every iteration.
X_train_col = np.array(X_train).reshape(-1, 1)
X_dev_col = np.array(X_dev).reshape(-1, 1)

best_degree, best_mse, best_poly_model = None, np.inf, None

for degree in range(1, 21):
    candidate = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    candidate.fit(X_train_col, y_train)
    dev_mse = mean_squared_error(y_dev, candidate.predict(X_dev_col))

    # Strict "<" keeps the lowest degree when several degrees tie.
    if dev_mse < best_mse:
        best_degree, best_mse, best_poly_model = degree, dev_mse, candidate

print(f"Best polynomial degree: {best_degree}")
print(f"Best mse: {best_mse}")
print(f"Best best_model: {best_poly_model}")

Best polynomial degree: 4
Best mse: 63.1960520531872
Best best_model: Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=4)),
                ('linearregression', LinearRegression())])


In [14]:
# Training-set MSE of the selected polynomial pipeline, for comparison with
# the dev-set MSE printed by the degree search.
train_pred_poly = best_poly_model.predict(np.array(X_train).reshape(-1, 1))
mean_squared_error(y_train, train_pred_poly)


92.70965985046952

Normalization

In [17]:
# Learn the standardization statistics from the training rows only, then
# apply that same transformation to both the training and the dev rows.
train_col = np.array(X_train).reshape(-1, 1)
dev_col = np.array(X_dev).reshape(-1, 1)

scaler = StandardScaler()
scaler.fit(train_col)
X_train_std = scaler.transform(train_col)
X_dev_std = scaler.transform(dev_col)

Add regularization: compare three regularized models (Lasso, Ridge, ElasticNet)

In [None]:
# Compare three cross-validated regularized regressors on degree-5 polynomial
# features and keep the one with the lowest dev-set MSE.
#
# The StandardScaler is a step of the pipeline rather than an external
# transform. The selected pipeline is later refit and evaluated by calling
# .fit()/.predict() on raw X values; with the scaler inside the pipeline those
# calls apply the same standardization that was used during model selection.
# Previously the pipeline was selected on pre-standardized inputs but refit on
# raw ones, so the evaluated model did not match the selected one.
regularization_models = [LassoCV, RidgeCV, ElasticNetCV]
best_reg_model = None
best_reg_mse = np.inf
best_reg_model_name = ""

# scikit-learn expects 2-D feature arrays; reshape once outside the loop.
X_train_2d = np.array(X_train).reshape(-1, 1)
X_dev_2d = np.array(X_dev).reshape(-1, 1)

for reg_model in regularization_models:
    # Scale first, then expand to polynomial terms (same order of operations
    # as standardizing the inputs before feeding them to the pipeline).
    model = make_pipeline(StandardScaler(), PolynomialFeatures(5), reg_model())
    model.fit(X_train_2d, y_train)
    y_pred = model.predict(X_dev_2d)
    mse = mean_squared_error(y_dev, y_pred)

    if mse < best_reg_mse:
        best_reg_mse = mse
        best_reg_model = model
        # make_pipeline names each step after its lower-cased class name, so
        # this name can be used to look the estimator up in named_steps.
        best_reg_model_name = reg_model.__name__

In [21]:
# make_pipeline registers each step under its lower-cased class name; use that
# key to fetch the winning estimator and show the alpha chosen by its CV.
reg_step_name = best_reg_model_name.lower()
best_reg_model.named_steps[reg_step_name].alpha_

10.0

In [20]:
# Class name of the regularized estimator that achieved the lowest dev-set MSE.
best_reg_model_name

'RidgeCV'

LOWESS model from the statsmodels package

In [25]:
# Grid-search the LOWESS smoothing fraction over 20 evenly spaced values in
# [0.05, 1] and keep the one with the lowest dev-set MSE.
best_frac, best_lowess_mse, best_lowess = None, np.inf, None

for frac in np.linspace(0.05, 1, 20):
    smoothed = lowess(y_train, X_train, frac=frac)

    # The smoother is only evaluated at the training x values, so dev
    # predictions are obtained by linear interpolation between them:
    # column 0 supplies the x-coordinates, column 1 the fitted values.
    fit_x, fit_y = smoothed[:, 0], smoothed[:, 1]
    dev_pred = np.interp(X_dev, fit_x, fit_y)
    dev_mse = mean_squared_error(y_dev, dev_pred)

    if dev_mse < best_lowess_mse:
        best_frac, best_lowess_mse, best_lowess = frac, dev_mse, smoothed

In [26]:
# LOWESS smoothing fraction that gave the lowest dev-set MSE in the search.
best_frac

0.15

Final training on the combined train + dev set and evaluation on the test set

In [28]:
# Re-fit the existing scaler on the combined train + dev rows (this overwrites
# the statistics it learned earlier) and apply it to those rows and to the
# test rows.
train_dev_col = np.array(X_train_dev).reshape(-1, 1)
test_col = np.array(X_test).reshape(-1, 1)

scaler.fit(train_dev_col)
X_train_dev_std = scaler.transform(train_dev_col)
X_test_std = scaler.transform(test_col)

In [31]:
# Refit each selected model on the combined train + dev rows.
# Both pipelines receive the raw X_train_dev values as a 2-D column.
X_train_dev_col = np.array(X_train_dev).reshape(-1, 1)

best_poly_model.fit(X_train_dev_col, y_train_dev)
best_reg_model.fit(X_train_dev_col, y_train_dev)

# LOWESS has no fit/predict interface: recompute the smoothed curve on the
# combined rows using the selected smoothing fraction.
best_lowess_full = lowess(y_train_dev, X_train_dev, frac=best_frac)

In [37]:
# Test-set predictions from the three refit models.
X_test_col = np.array(X_test).reshape(-1, 1)

y_test_pred_poly = best_poly_model.predict(X_test_col)
y_test_pred_reg = best_reg_model.predict(X_test_col)

# LOWESS predictions are linearly interpolated from the smoothed curve:
# column 0 holds its x-coordinates, column 1 the fitted values.
lowess_x, lowess_fit = best_lowess_full[:, 0], best_lowess_full[:, 1]
y_test_pred_lowess = np.interp(X_test, lowess_x, lowess_fit)

In [38]:
# Test-set mean squared error for each model, computed in the order
# polynomial, regularized, LOWESS.
mse_test_poly, mse_test_reg, mse_test_lowess = (
    mean_squared_error(y_test, test_pred)
    for test_pred in (y_test_pred_poly, y_test_pred_reg, y_test_pred_lowess)
)

In [34]:
# Test-set MSE of the refit polynomial regression pipeline.
mse_test_poly

59.67511516428468

In [36]:
# Test-set MSE of the interpolated LOWESS predictions.
mse_test_lowess

50.02827183326719

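Of the results shown above, LOWESS has the lowest test MSE (about 50.0, versus about 59.7 for polynomial regression), so LOWESS is the best of the two. The test MSE of the regularized model (`mse_test_reg`) is computed but not displayed.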