<font face="Chalkboard" color="darkgreen" size=6pt> LOWESS - Multidimensional Features and Real Data Applications</font>

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120

In [None]:
import numpy as np
import pandas as pd
from math import ceil
from scipy import linalg
from scipy.interpolate import interp1d
from scipy.linalg import lstsq
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
from sklearn.model_selection import train_test_split as tts, KFold
from sklearn.metrics import mean_squared_error as mse
from scipy.interpolate import interp1d, griddata, LinearNDInterpolator, NearestNDInterpolator
import statsmodels.api as sm
from math import ceil
from IPython.display import Image
from IPython.display import display
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed the
# old names, so use whichever spelling this installation provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

# the following line(s) are necessary if you want to make SKlearn compliant functions
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

## Local Inferences

Locally, in neighborhoods defined by using the weights, we can make predictions by using regularized linear regression. 

In [None]:
# Module-level linear model that Lowess.predict refits for every query point.
# A lightly regularised Ridge is the active choice; the commented lines are alternatives.
#lm = LinearRegression()
lm = Ridge(alpha=0.001)
# lm = ElasticNet(alpha=0.01,max_iter=1e7)
# Shared feature standardiser used by validation_function and the quick tests further down.
scale = StandardScaler()

## Kernel Definitions


In [None]:
# Gaussian Kernel
def Gaussian(x):
  """Standard-normal density of the distance from the origin, truncated beyond 4.

  A 1-D array is treated element-wise (distance = |x|); any other array is treated as
  one point per row and the Euclidean norm over axis 1 is used. Distances greater
  than 4 receive weight 0.
  """
  if x.ndim == 1:
    dist = np.abs(x)
  else:
    dist = np.sqrt((x**2).sum(axis=1))
  norm_const = 1/(np.sqrt(2*np.pi))
  density = norm_const*np.exp(-1/2*dist**2)
  return np.where(dist>4, 0, density)

In [None]:
# Tricubic Kernel
def Tricubic(x):
  """Tricube weight 70/81 * (1 - d^3)^3 of the distance d from the origin.

  For a 1-D array d is |x| element-wise; otherwise d is the Euclidean norm of each row
  (axis 1). Distances greater than 1 receive weight 0.
  """
  dist = np.abs(x) if x.ndim == 1 else np.sqrt((x**2).sum(axis=1))
  inside_value = 70/81*(1-dist**3)**3
  return np.where(dist>1, 0, inside_value)

In [None]:
# Epanechnikov Kernel
def Epanechnikov(x):
  """Epanechnikov weight 3/4 * (1 - d^2) of the distance d from the origin.

  For a 1-D array d is |x| element-wise; otherwise d is the Euclidean norm of each row
  (axis 1). Distances greater than 1 receive weight 0.
  """
  if x.ndim == 1:
    dist = np.abs(x)
  else:
    squared = x**2
    dist = np.sqrt(squared.sum(axis=1))
  parabola = 3/4*(1-dist**2)
  return np.where(dist>1, 0, parabola)

In [None]:
# Quartic Kernel
def Quartic(x):
  """Quartic (biweight) weight 15/16 * (1 - d^2)^2 of the distance d from the origin.

  For a 1-D array d is |x| element-wise; otherwise d is the Euclidean norm of each row
  (axis 1). Distances greater than 1 receive weight 0.
  """
  one_dimensional = x.ndim == 1
  dist = np.abs(x) if one_dimensional else np.sqrt((x**2).sum(axis=1))
  biweight = 15/16*(1-dist**2)**2
  return np.where(dist>1, 0, biweight)

## Useful Functions

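Big Idea: we need to accommodate new data points from a test set. The local models can only be fitted on the train set, so the weights for a new point are computed from its distances to the training observations.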

In [None]:
def kernel_function(xi,x0,kern, tau): 
    """Apply `kern` to the offsets xi - x0 after dividing them by the bandwidth 2*tau."""
    scaled_offset = (xi - x0)/(2*tau)
    return kern(scaled_offset)

In [None]:
def weights_matrix(x,x_new,kern,tau):
  """Kernel weights of the observations in x relative to the query point(s) x_new.

  A scalar x_new yields the result of a single kernel_function call; otherwise one such
  result is computed per entry of x_new and the results are stacked into an array whose
  row i holds the weights for x_new[i].
  """
  if not np.isscalar(x_new):
    per_query = [kernel_function(x,x_new[i],kern,tau) for i in range(len(x_new))]
    return np.array(per_query)
  return kernel_function(x,x_new,kern,tau)

## Scikit-Learn Compliant Functions

Main Idea: we want to define a model regressor that can be used as model.fit/model.predict, and that also allows sklearn GridSearchCV for tuning hyperparameters.

`self` refers to the instance of the class. Through `self` a method can access the attributes and other methods of that instance; in `__init__` it is used to store the constructor arguments as attributes of the object.

In [None]:
class Lowess(RegressorMixin, BaseEstimator):
    """Locally weighted linear regression with a scikit-learn style fit/predict API.

    For every query point the training rows of x and y are multiplied by their kernel
    weights and the module-level estimator `lm` is refit on the weighted data; the
    prediction is `lm` evaluated at the (unweighted) query point.

    Inheriting from BaseEstimator supplies get_params/set_params (needed by GridSearchCV
    and clone); RegressorMixin supplies score and marks the class as a regressor.

    Parameters
    ----------
    kernel : callable
        Maps an array of scaled offsets to non-negative weights (e.g. Gaussian, Tricubic).
    tau : float
        Bandwidth; offsets are divided by 2*tau before the kernel is applied.
    """
    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        """Memorise the training sample (all regression work happens in predict).

        x and y are stored as given; predict calls .reshape on them, so they are
        expected to be NumPy arrays. Returns self, following the scikit-learn convention.
        """
        self.xtrain_ = x
        self.yhat_ = y
        return self

    def predict(self, x_new):
        """Predict at x_new, which may be a scalar or an array of query points.

        Returns a single number for a scalar x_new, otherwise a 1-D array with one
        prediction per entry/row of x_new.
        """
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_

        w = weights_matrix(x,x_new,self.kernel,self.tau)

        one_dim = len(x.shape)==1
        features = x.reshape(-1,1) if one_dim else x
        targets = y.reshape(-1,1)

        def local_prediction(weights, query):
            # Scaling row j by weights[j] equals np.diag(weights).dot(...) without
            # materialising an n-by-n matrix for every query point.
            column = weights.reshape(-1,1)
            lm.fit(column*features, column*targets)
            # ravel copes with estimators that return (1,1) as well as (1,) predictions
            return np.ravel(lm.predict(query))[0]

        if np.isscalar(x_new):
            # The original stored this result in `yest` but returned the unbound
            # `yest_test`; return the value directly.
            return local_prediction(w, [[x_new]])

        n = len(x_new)
        yest_test = np.zeros(n)
        # Looping through all query points
        for i in range(n):
            if one_dim:
                query = x_new[i].reshape(-1,1)
            else:
                query = x_new[i].reshape(1,-1)
            yest_test[i] = local_prediction(w[i,:], query)
        return yest_test

## Other Variants of Locally Weighted Regression

In [None]:
def lw_reg(X, y, xnew, kernel, tau, intercept, ridge=0.001):
    """Locally weighted ridge regression fitted at the training points, then interpolated.

    A separate weighted least-squares problem (with an L2 penalty) is solved at every
    training observation; the resulting fitted values are interpolated to `xnew`.

    Parameters
    ----------
    X : ndarray, shape (n,) or (n, p)
        Training features.
    y : ndarray, shape (n,) or (n, 1)
        Training responses.
    xnew : ndarray
        Points at which predictions are wanted.
    kernel : callable
        Receives the (n, p) array of scaled offsets and returns n weights.
    tau : float
        Bandwidth; the kernel is evaluated as K((x - x[i]) / (2*tau)).
    intercept : bool
        If true, a column of ones is prepended to the design matrix.
    ridge : float, default 0.001
        L2 penalty added to the diagonal of the normal equations (the intercept column
        is penalised too).

    Returns
    -------
    ndarray
        Interpolated predictions at `xnew`. With more than one feature, points outside
        the convex hull of X cannot be linearly interpolated; they fall back to the raw
        response of the nearest training point.
    """
    n = len(X) # the number of observations
    yest = np.zeros(n)

    if len(y.shape)==1: # here we make column vectors
        y = y.reshape(-1,1)

    if len(X.shape)==1:
        X = X.reshape(-1,1)

    if intercept:
        X1 = np.column_stack([np.ones((len(X),1)),X])
    else:
        X1 = X

    # w[i, j] is the weight of observation j in the local problem centred on observation i
    w = np.array([kernel((X - X[i])/(2*tau)) for i in range(n)])
    penalty = ridge*np.eye(X1.shape[1])

    # Looping through all X-points
    for i in range(n):
        # Column-wise scaling is the same product as X1.T.dot(np.diag(w[:,i])) but avoids
        # allocating an n-by-n diagonal matrix on every iteration.
        XtW = np.transpose(X1)*w[:,i]
        b = XtW.dot(y)
        A = XtW.dot(X1) + penalty
        beta = lstsq(A, b)[0]
        # the product has shape (1,); extract the scalar explicitly
        yest[i] = np.dot(X1[i],beta).item()

    if X.shape[1]==1:
        f = interp1d(X.flatten(),yest,fill_value='extrapolate')
    else:
        f = LinearNDInterpolator(X, yest)
    output = f(xnew) # NaN where a point of xnew lies outside the convex hull of X
    missing = np.isnan(output)
    if missing.any():
        g = NearestNDInterpolator(X,y.ravel())
        output[missing] = g(xnew[missing])
    return output

In [None]:
class Lowess_interp(RegressorMixin, BaseEstimator):
    """Scikit-learn style wrapper around lw_reg.

    The local regressions are solved at the training points and the fitted values are
    interpolated to the query points (see lw_reg). Inheriting from BaseEstimator supplies
    get_params/set_params, which GridSearchCV and clone rely on; RegressorMixin supplies
    score and marks the class as a regressor.

    Parameters
    ----------
    kernel : callable
        Weight function passed through to lw_reg.
    tau : float
        Bandwidth passed through to lw_reg.
    intercept : bool
        Whether lw_reg adds a column of ones to the design matrix.
    """
    def __init__(self, kernel = Gaussian, tau=0.05,intercept=True):
        self.kernel = kernel
        self.tau = tau
        self.intercept = intercept

    def fit(self, x, y):
        """Memorise the training sample; returns self per the scikit-learn convention."""
        self.xtrain_ = x
        self.yhat_ = y
        return self

    def predict(self, x_new):
        """Return lw_reg predictions at x_new using the stored training sample."""
        check_is_fitted(self)
        return lw_reg(self.xtrain_, self.yhat_, x_new,
                      self.kernel, self.tau, self.intercept)

In [None]:
# Check that the class exposes get_params: scikit-learn utilities such as GridSearchCV
# require it, and it is provided by sklearn.base.BaseEstimator.
Lowess_interp.get_params

AttributeError: ignored

## Validation Functions

In [None]:
def validation_function(x,y,model):
  """Mean test MSE of `model` over a seeded, shuffled 10-fold cross-validation.

  Within every fold the module-level scaler `scale` is fitted on the training rows only
  and then applied to the held-out rows, so no test information leaks into the scaling.
  `model` is refit in place on each fold.
  """
  folds = KFold(n_splits=10,shuffle=True,random_state=123)
  fold_errors = []
  for train_idx, test_idx in folds.split(x):
    x_fit = scale.fit_transform(x[train_idx])
    x_eval = scale.transform(x[test_idx])
    model.fit(x_fit, y[train_idx])
    predictions = model.predict(x_eval)
    fold_errors.append(mse(y[test_idx], predictions))
  return np.mean(fold_errors)

## <font face='menlo' size=6pt> Real Data Applications</font>

### 1. "Cars" dataset

In [None]:
# Load the cars data. The relative path presumably points into a mounted Google Drive
# (Colab); adjust it when running elsewhere.
data = pd.read_csv('drive/MyDrive/Data Sets/cars.csv')

In [None]:
# Show the loaded frame as a quick sanity check of its columns and size.
data

Unnamed: 0,MPG,CYL,ENG,WGT
0,18.0,8,307.0,3504
1,15.0,8,350.0,3693
2,18.0,8,318.0,3436
3,16.0,8,304.0,3433
4,17.0,8,302.0,3449
...,...,...,...,...
387,27.0,4,140.0,2790
388,44.0,4,97.0,2130
389,32.0,4,135.0,2295
390,28.0,4,120.0,2625


In [None]:
# Features: every column from CYL through WGT (label slicing is inclusive); target: MPG.
x = data.loc[:,'CYL':'WGT'].values
y = data['MPG'].values

## Quick Testing if Functions Work (one Train/Test split)

In [None]:
# Single seeded 75/25 train/test split used only for the smoke tests below.
xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.25,random_state=123)

In [None]:
# Smoke test of the interpolating variant: the scaler is fitted on the training features
# only and reused for the test features; the cell value is the test MSE.
model = Lowess_interp(tau=0.18,kernel=Gaussian,intercept=True)
model.fit(scale.fit_transform(xtrain),ytrain)
mse(ytest,model.predict(scale.transform(xtest)))

18.963826841449798

In [None]:
# Same smoke test for the Lowess class, which refits the local model at every test point.
model = Lowess(tau=0.18,kernel=Gaussian)
model.fit(scale.fit_transform(xtrain),ytrain)
mse(ytest,model.predict(scale.transform(xtest)))

19.613929192498134

## K-Fold Cross-validations

In [None]:
# 10-fold cross-validated MSE of the interpolating variant with an Epanechnikov kernel.
model = Lowess_interp(tau=1.0,kernel=Epanechnikov,intercept=True)
validation_function(x,y,model)

17.313488463407424

In [None]:
# 10-fold cross-validated MSE of the Lowess class with a Gaussian kernel.
model = Lowess(tau=0.18,kernel=Gaussian)
validation_function(x,y,model)

17.719292615426422

In [None]:
# Score the random-forest baseline with the same seeded 10-fold protocol as the LOWESS
# models: an error measured on the data the forest was trained on is optimistically biased
# and not comparable with the cross-validated numbers above. The forest is seeded so the
# result is reproducible.
model = RandomForestRegressor(n_estimators=100,max_depth=3,random_state=123)
validation_function(x,y,model)

13.907730990686453

### 2. "Concrete" dataset

In [None]:
# Load the concrete data; note that this rebinds `data`, replacing the cars frame.
data = pd.read_csv('drive/MyDrive/Data Sets/concrete.csv')

In [None]:
# Features: every column from 'cement' through 'age' (inclusive); target: 'strength'.
x = data.loc[:,'cement':'age'].values
y = data['strength'].values

In [None]:
# 10-fold cross-validated MSE of the interpolating variant on the concrete data.
model = Lowess_interp(tau=1.0,kernel=Epanechnikov,intercept=True)
validation_function(x,y,model)