In [None]:
# Import Libraries

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import scipy.stats as stats 
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy.interpolate import interp1d, RegularGridInterpolator, griddata, LinearNDInterpolator, NearestNDInterpolator
from math import ceil
from scipy import linalg
from sklearn.linear_model import LinearRegression as LR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [None]:
# get data

# Directory that holds the three CSV files; change this single line to relocate the data.
DATA_DIR = '/Users/rebeccawagner/Documents/GitHub/Data 441/Data'

cars_data = pd.read_csv(f'{DATA_DIR}/cars.csv')
concrete_data = pd.read_csv(f'{DATA_DIR}/concrete.csv')
housing_data = pd.read_csv(f'{DATA_DIR}/housing.csv')

# Feature matrix / target for each data set.
cars_x = cars_data[['WGT','CYL','ENG']]
cars_y = cars_data['MPG'].values

concrete_x = concrete_data[['cement','slag','ash','water','superplastic','coarseagg','fineagg','age']]
concrete_y = concrete_data['strength']

# Select the columns from the frame: this used to be a bare nested list of
# column names, so housing_x held strings instead of data.
housing_x = housing_data[['crime','residential','industrial','river','nox','rooms','older','distance','highway','tax','ptratio','lstat']]
housing_y = housing_data[['cmedv']]

1. Implement the Gradient Boosting algorithm with user-defined choices for Regressor_1 and Regressor_2.


In [None]:
def gradient_boosting(xtrain, ytrain, xtest, ytest, regressor_1, regressor_2):
    """Two-stage boosting: fit ``regressor_1`` on the targets, then ``regressor_2`` on its residuals.

    Parameters
    ----------
    xtrain, ytrain : training features and targets.
    xtest, ytest : held-out features and targets used for scoring.
    regressor_1, regressor_2 : objects exposing ``fit(X, y)`` and ``predict(X)``.
        Both are fitted in place.

    Returns
    -------
    Mean squared error, on the test set, of the sum of both regressors' predictions.
    """
    # Stage 1: base learner on the raw targets.
    regressor_1.fit(xtrain, ytrain)
    residuals = ytrain - regressor_1.predict(xtrain)

    # Stage 2: the second learner models what stage 1 left unexplained.
    regressor_2.fit(xtrain, residuals)

    # Boosted prediction is the sum of both stages.
    y_pred = regressor_1.predict(xtest) + regressor_2.predict(xtest)
    return mse(ytest, y_pred)

In [None]:
# Hold out 30% of the cars data; the fixed seed keeps the split (and the score below) reproducible.
xtrain, xtest, ytrain, ytest = tts(cars_x, cars_y, test_size = .3, random_state = 123)

In [None]:
# Boost a linear regression with a decision tree fitted to its residuals; the cell shows the test MSE.
gradient_boosting(xtrain, ytrain, xtest, ytest, LR(), DTR())

2. Test the Boosted Locally Weighted Regressor with different choices of data (such as "cars.csv", "concrete.csv" and "housing.csv") and different choices of kernel, such as Gaussian, Tricubic, Epanechnikov and Quartic.

In [None]:
# Kernel Choices

# Gaussian Kernel
def Gaussian(x):
  """Standard normal density of the distance, truncated to zero beyond 4.

  A 1-D input is read as a vector of (signed) distances; for a 2-D input
  each row is first reduced to its Euclidean norm.
  """
  is_vector = len(x.shape) == 1
  radius = np.abs(x) if is_vector else np.sqrt(np.sum(x**2, axis=1))
  norm_const = 1/(np.sqrt(2*np.pi))
  density = norm_const*np.exp(-1/2*radius**2)
  # everything further away than 4 gets zero weight
  return np.where(radius > 4, 0, density)

# Tricubic Kernel
def Tricubic(x):
  """Tricubic kernel: 70/81 * (1 - d**3)**3 for d <= 1 and zero otherwise.

  ``d`` is the Euclidean norm of each row of ``x``; a 1-D input is treated
  as a column of scalar observations.
  """
  rows = x.reshape(-1,1) if len(x.shape) == 1 else x
  radius = np.sqrt(np.sum(rows**2, axis=1))
  inside = 70/81*(1 - radius**3)**3
  return np.where(radius > 1, 0, inside)

# Quartic Kernel
def Quartic(x):
  """Quartic (biweight) kernel: 15/16 * (1 - d**2)**2 for d <= 1, else zero.

  ``d`` is the row-wise Euclidean norm of ``x``; a flat vector is first
  turned into a single-column matrix.
  """
  if len(x.shape) == 1:
    x = x.reshape(-1,1)
  squared_norm = np.sum(x**2, axis=1)
  d = np.sqrt(squared_norm)
  weight = 15/16*(1 - d**2)**2
  outside_support = d > 1
  return np.where(outside_support, 0, weight)

# Epanechnikov Kernel
def Epanechnikov(x):
  """Epanechnikov kernel: 3/4 * (1 - d**2) for d <= 1 and zero outside.

  ``d`` is the Euclidean norm of every row of ``x``; 1-D input is reshaped
  so that each entry is its own row.
  """
  points = x if len(x.shape) != 1 else x.reshape(-1,1)
  d = np.sqrt(np.sum(points**2, axis=1))
  parabola = 3/4*(1 - d**2)
  return np.where(d > 1, 0, parabola)

In [None]:
# function that computes the Euclidean distance between every observation in u and every observation in v
def dist(u,v):
  """Pairwise Euclidean distances between the rows of ``v`` and the rows of ``u``.

  Returns an array of shape (len(v), len(u)) whose entry [i, j] is the
  distance from v[i] to u[j]. A 1-D ``v`` is treated as one observation
  (a single row).
  """
  if len(v.shape) == 1:
    v = v.reshape(1,-1)  # one observation -> single-row matrix
  rows = []
  for i in range(len(v)):
    diff = u - v[i]
    rows.append(np.sqrt(np.sum(diff**2, axis=1)))
  return np.array(rows)

In [None]:
# definition of multidimensional LOWESS

def lw_ag_md(x, y, xnew,f=2/3,iter=3, intercept=True):
  """Multidimensional robust locally weighted linear regression (LOWESS).

  A weighted, lightly ridge-regularised linear model is solved at every
  training point using tricubic neighbourhood weights. The fit is then
  re-weighted with bisquare robustness weights computed from the residuals,
  for ``iter`` passes in total. Predictions at ``xnew`` are obtained by
  interpolating the fitted training values.

  Parameters
  ----------
  x : array-like, shape (n,) or (n, p)
      Training features.
  y : array-like, shape (n,) or (n, 1)
      Training targets.
  xnew : array-like
      Points to predict at, with the same number of columns as ``x``.
  f : float
      Fraction of the n training points that sets each local bandwidth
      (the bandwidth is the distance to neighbour number ``ceil(f * n)``).
  iter : int
      Number of fitting passes; robustness weights are refreshed after each.
  intercept : bool
      If True, a column of ones is added to the local design matrix.

  Returns
  -------
  numpy.ndarray
      Predictions for ``xnew``. Points the linear interpolator cannot cover
      (NaN) fall back to the fitted value of the nearest training point.
  """
  # Accept DataFrames / Series / lists: everything below indexes rows positionally.
  x = np.asarray(x)
  y = np.asarray(y)
  xnew = np.asarray(xnew)

  n = len(x)
  r = int(ceil(f * n))
  yest = np.zeros(n)

  if len(y.shape)==1: # here we make column vectors
    y = y.reshape(-1,1)

  if len(x.shape)==1:
    x = x.reshape(-1,1)

  # The design matrix gets its own name: x itself must stay the raw feature
  # matrix because the distances and the final interpolation are computed on it.
  if intercept:
    x1 = np.column_stack([np.ones((len(x),1)),x])
  else:
    x1 = x

  # Pairwise distances are computed once and reused for bandwidths and weights.
  d = dist(x,x)
  # Clamp the neighbour index so that f = 1 (r == n) does not run off the end.
  k = min(r, n - 1)
  h = np.sort(d, axis=1)[:, k]
  # NOTE(review): w[i, j] = d[i, j] / h[j], so row i is scaled by the
  # neighbours' bandwidths rather than by h[i]; kept as in the original --
  # confirm this is intended.
  w = np.clip(d / h, 0.0, 1.0)
  w = (1 - w ** 3) ** 3

  # Looping through all X-points
  delta = np.ones(n)
  for iteration in range(iter):
    for i in range(n):
      # The product of the two diagonal weight matrices is the diagonal of the
      # element-wise product, so a weight vector is enough.
      wts = delta * w[i,:]
      xw = x1 * wts[:, None]
      b = xw.T.dot(y)
      A = xw.T.dot(x1)
      A = A + 0.0001*np.eye(x1.shape[1]) # small L2 regularization keeps the system solvable
      beta = linalg.solve(A, b)
      yest[i] = np.dot(x1[i],beta.ravel())

    residuals = y.ravel() - yest
    s = np.median(np.abs(residuals))
    if s == 0:
      # At least half of the residuals are exactly zero; re-weighting would
      # divide by zero, so keep the current fit.
      break

    delta = np.clip(residuals / (6.0 * s), -1, 1)
    delta = (1 - delta ** 2) ** 2

  # Predictions for xnew are interpolated from the fitted training values.
  if x.shape[1]==1:
    interp = interp1d(x.flatten(),yest,fill_value='extrapolate')
    output = interp(xnew)
  else:
    output = np.zeros(len(xnew))
    # PCA cannot extract more components than there are features.
    n_comp = min(3, x.shape[1])
    for i in range(len(xnew)):
      ind = np.argsort(np.sqrt(np.sum((x-xnew[i])**2,axis=1)))[:r]
      pca = PCA(n_components=n_comp)
      x_pca = pca.fit_transform(x[ind])
      tri = Delaunay(x_pca,qhull_options='QJ Pp')
      interp = LinearNDInterpolator(tri,yest[ind])
      # The interpolator returns a length-1 array; it is NaN when xnew[i]
      # lies outside the convex hull of its neighbours.
      output[i] = np.ravel(interp(pca.transform(xnew[i].reshape(1,-1))))[0]

  missing = np.isnan(output)
  if missing.any():
    g = NearestNDInterpolator(x,yest.ravel())
    output[missing] = g(xnew[missing])
  return output

In [None]:
class Lowess_AG_MD:
    """Estimator-style wrapper around ``lw_ag_md`` (fit / predict / get_params / set_params).

    Parameters
    ----------
    f : value forwarded to ``lw_ag_md`` as ``f``.
    iter : value forwarded to ``lw_ag_md`` as ``iter``.
    intercept : value forwarded to ``lw_ag_md`` as ``intercept``.
    """
    def __init__(self, f = 1/10, iter = 3,intercept=True):
        self.f = f
        self.iter = iter
        self.intercept = intercept

    def fit(self, x, y):
        """Store the training data; the regression itself runs inside ``predict``.

        Returns ``self`` so that ``model = Lowess_AG_MD().fit(x, y)`` works.
        """
        # Stored as arrays so that DataFrames / Series are accepted as well.
        self.xtrain_ = np.asarray(x)
        # Despite its name this attribute holds the training targets.
        self.yhat_ = np.asarray(y)
        return self

    def predict(self, x_new):
        """Run ``lw_ag_md`` on the stored training data and return its output for ``x_new``."""
        check_is_fitted(self)
        return lw_ag_md(self.xtrain_, self.yhat_, np.asarray(x_new),
                        self.f, self.iter, self.intercept)

    def get_params(self, deep=True):
        """Return the parameters "f", "iter" and "intercept" as a dict."""
        return {"f": self.f, "iter": self.iter,"intercept":self.intercept}

    def set_params(self, **parameters):
        """Set each given keyword as an attribute and return ``self``."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [None]:
# Fresh 70/30 split of the cars data. `.values` hands plain NumPy arrays to the
# Lowess regressor, whose lw_ag_md indexes rows positionally (x[i], xnew[i]);
# the fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = tts(cars_x.values, cars_y, test_size = .3, random_state = 123)

gradient_boosting(xtrain, ytrain, xtest, ytest, Lowess_AG_MD(), DTR())

In [None]:
# Lowess regressor with its default settings (f=1/10, iter=3, intercept=True).
model1 = Lowess_AG_MD()

In [None]:
# Store the current training split on the model.
model1.fit(xtrain,ytrain)