In [1]:
import pandas as pd
import numpy as np
from numpy import linalg as LA
import matplotlib.font_manager
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d  
import seaborn as sns  #A statistical plotting library
from sklearn.cluster import KMeans
from math import comb
from sklearn.preprocessing import PolynomialFeatures
#from kneed import KneeLocator
from mayavi import mlab
np.random.seed(42)

In [2]:
DF = pd.read_csv('function1_2d.csv')
DF.head()

Unnamed: 0.1,Unnamed: 0,x1,x2,y
0,0,-13.475524,-15.229121,140554.303333
1,1,-15.23083,-12.205868,98205.871542
2,2,5.762633,-12.758111,54090.524314
3,3,8.805995,6.38734,9342.283121
4,4,15.734299,3.268716,61518.311357


In [3]:
#Defining a class to process and divide the dataframe in to the req. numpy form.
class pro_split_df:
    def __init__(self,df,batch_size):
        self.df   = df                 #The original dataframe.
        self.batch_size = batch_size   #Number of samples to be choosen from the dataframe.
        
        
    #A function to convert dataframe in to numpy array.
    #Training data set input vectors is stored in a matrix of dim Nxd. d -> no. of features, N -> No. of examples
    def df_to_np(self,dat_fr):
        In_df = dat_fr.loc[:,['x1','x2']]    #Choosing the columns belonging to the input feature vectors.
        L_df  = dat_fr.loc[:,['y']]          #Choosing the columns belonging to the labels.
        return In_df.to_numpy(),L_df.to_numpy()
    
    #A function for Random row selection of required batch size of a Pandas dataframe
    def Rand_Choose(self):
        return self.df.sample(n = self.batch_size, random_state=42)
    
    
    def df_split(self,df):
        #Splits the array in to 70,20,10.
        return np.split(df,[int(.7*len(df)), int(.8*len(df))])
    
    #A function to split the batches in to training and data.   
    def tr_val_test_split(self):
        dat_fr = self.Rand_Choose()        #Randomly choosing batchsize number of samples from the org. dataframe.
        tr_df,ts_df,val_df = self.df_split(dat_fr) #Splits the datframe in to train, val and test.
        x_tr,y_tr = self.df_to_np(tr_df)      #conversion to numpy
        x_val,y_val = self.df_to_np(val_df)
        x_test,y_test = self.df_to_np(ts_df)
        return x_tr,x_val,x_test,y_tr,y_val,y_test
        
    

In [4]:
#Dividing the datset in to batches of 50,200 and 500.
#N=50
batch50 = pro_split_df(DF,50)  #An instace of class sep_data.
x_tr50,x_val50,x_test50,y_tr50,y_val50,y_test50 = batch50.tr_val_test_split()

#N=200
batch200 = pro_split_df(DF,200)  #An instace of class sep_data.
x_tr200,x_va200,x_test200,y_tr200,y_val200,y_test200 = batch200.tr_val_test_split()

#N=500
batch500 = pro_split_df(DF,500)  #An instace of class sep_data.
x_tr500,x_val500,x_test500,y_tr500,y_val500,y_test500 = batch500.tr_val_test_split()


In [21]:
#Defining a class that given our choice of basis function returns the optimal paramters.
#Polynomial and Gaussian Basis for the regression task at hand is employed.

class Do_Regression:
    #This class will return the optimal paramters by using the basis function of our choice.
    def __init__(self,X,y,Gb_Hp,Pb_Hp,basisname):
        self.X = X  #Input vectors of training dataset as a numpy array Nxd.
        self.y = y
        self.Gb_Hp = Gb_Hp
        self.D = None   #An hyperparameter which detrmines the spread of gaussian basis functions.
        self.s = None  #An hyperparameter which detrmines the spread of gaussian basis functions.
        self.Mean_Mat = None     #The mean vector array-(D-1)xd. Each row represents a mean vector.
        self.Pb_Hp = Pb_Hp  #List of hyper
        self.Erms = None
        self.phi = None     #Attribute used to store the design matrix used for testing
        self.pred =None     #Attribute to store the prediction values
        self.basisname = basisname
        self.hp_comb_gbf = None  #An attribute for the combination of hyperparameters.
        
           
    #KMeans implementation for finding the mean of the various basis functions initially.
    #Number of clusters eqauls the number of gaussian basis functions being used.
    def K_Clustering(self):
        #Dictionary of the arguments for scikit.KMeans
        KMeans_args = {
        "init" :"random",
        "n_init" : 10,
        "max_iter" : 300,
        "random_state" : 0,
        } 
        k = self.D - 1   #Number of clusters equals D-1.
        kmeans =  KMeans(n_clusters = k , **KMeans_args)
        kmeans.fit(self.X)
        labels = kmeans.labels_
        #The mean vector matrix is stored 
        self.Mean_Mat = kmeans.cluster_centers_
        
        
    def Gaussian_hyperparamter_comb(self):
        hp_list_gbf = list(self.Gb_Hp.values()) # a list of all possible combinations of hyperparameters
        self.hp_comb_gbf = [(d,s) for d in hp_list_gbf[0] for s in hp_list_gbf[1]]
        
 
    def Gaussian_DesMat(self):
        #Returns the output design matrix made of gbf of the input x.
        phi_gauss = np.zeros((len(self.X), self.D))
        #Finding the mean of clusters,
        self.K_Clustering()   #Calling this function will update the attribute associated with Mean Matrix.
        
        ones_vector = np.ones((len(self.X))) #To be added to the first column of the matrix
        phi_gauss[:,0] = ones_vector
        #Components of the matrix.
        c1 = np.reshape(np.einsum('ij -> i', self.X**2),(len(self.X),-1)) #A Nx1 array. N->No of examples in training
        c2 = np.einsum('ij -> i', self.Mean_Mat**2)                       #A D-1 array.
        c3 = np.einsum('ij,jk -> ik', self.X,(self.Mean_Mat.T))                           #A Nx(D-1) array.
        expo = -(c1+c2-(2*c3))/2*self.s*self.s
        phi_gauss[:,1:] = np.exp(expo)
        return phi_gauss
       
    
    def Poly_DesMat(self):
        #Returns the output design matrix made of gbf of the input x and degree m.
        d = len(x[0])
        #m is the maximum degree of monomials being used for polynomial basis function.
        poly = PolynomialFeatures(degree = self.m)  #Using the inbuilt function from sklearn.
        phi_poly = poly.fit_transform(self.X)
        
        
        #Plotting for better understanding
#         phi_flat = phi_poly.flatten()
#         xarr = np.arange(1,phi_flat.size+1,1)
#         plt.scatter(xarr,phi_flat)
#         plt.show()
        return phi_poly
    
    def get_design_mat(self):
        if self.basisname == 'Gaussian':
            PHI = self.Gaussian_DesMat()   #Updates the design matrix with gaussian basis functions.
        if self.basisname == 'Polynomial':
            PHI = self.Poly_DesMat()        #Updates the design matrix with polynomial basis functions.
#         if basisname == 'Linear':
#             PHI = self.X
        return PHI    
    
    def LinearRegressor(self,lam):  
        #Takes as input the labels of the training data and the hyperparamter lambda
        erms_tr ={}
        W = {}
        if self.basisname == 'Gaussian':
            self.Gaussian_hyperparamter_comb()  #Updates the attribute for the combination of hyperparameters.
            for (dim,sigma) in self.hp_comb_gbf:
                self.D,self.s = dim,sigma        #Updates the attribute for hyperparameters of gbf
                PHI = self.get_design_mat()
                w = np.linalg.inv(PHI.T@PHI + lam*np.identity(len(PHI.T)))@(PHI.T@self.y)
                #Error for training data
                erms_tr[str(dim)+' '+ str(sigma)] = self.erms(w) 
                W[str(dim)+ ' ' + str(sigma)] = w
        if self.basisname == 'Polynomial':
            for degree in Pb_Hp:
                self.m = degree
                PHI = self.get_design_mat()
                w = np.linalg.inv(PHI.T@PHI + lam*np.identity(len(PHI.T)))@(PHI.T@self.y)
                W['degree'] = w
        return W,erms_tr
    
    
    def erms(self,w):
        self.phi = self.get_design_mat()
        self.pred = self.phi@w
        error_arr = self.pred - self.y
        return LA.norm(error_arr)/np.sqrt(len(self.y))
    
    
    def test_set_error(self,W):  #W is the optimal parameters estimated from training
        #Run this function only after training and on testing on validation and test dataset.
        W_test = {}
        self.Gaussian_hyperparamter_comb()
        for (dim,sigma) in self.hp_comb_gbf:
            self.D,self.s = dim,sigma 
            w = W[str(dim)+' '+ str(sigma)]
            W_test[str(dim)+' '+ str(sigma)] = self.erms(w)
        return W_test

In [22]:
#Hyperparamters for gaussian and polynomial.
Gb_Hp = {'Dim': [6,9,25,35,50], 'sigma' : [0.01,0.1,1,10]}
Pb_Hp = [2,3,6,9] 
do_reg = Do_Regression(x_tr500,y_tr500,Gb_Hp,Pb_Hp,'Gaussian')  #An instance for the regression class.

In [24]:
#Optimal Parameters estimated using training data of batch 50(N=35) and lambda = 0 using gbf
W_opt,erms_tr_data = do_reg.LinearRegressor(0) 
# mus= do_reg.Mean_Mat  #Centers after clustering.
# #Plotting the cluster centers
# plt.figure(figsize =(8,6))
# plt.scatter(x_tr50[:,0],x_tr50[:,1],c='g',label='training data') #Input vector
# plt.scatter(mus[:,0],mus[:,1],c='r', label = 'cluster centers')  #Cluster centers
# plt.xlabel(r'$x1$')
# plt.ylabel(r'$x2$')
# plt.legend()
# plt.title('Training Dataset with {} cluster centers'.format(Gb_Hp['Dim']-1))
# plt.show()


In [25]:
#Error for various sigma and dimensions of the training data.

erms_tr_data

{'6 0.01': 10872.227348499611,
 '6 0.1': 14606.186808332766,
 '6 1': 36645.92907689616,
 '6 10': 37083.62187569195,
 '9 0.01': 9825.05260525228,
 '9 0.1': 9287.13112013742,
 '9 1': 36304.04763645538,
 '9 10': 37059.10802018388,
 '25 0.01': 167816.46133443833,
 '25 0.1': 2123.588206526122,
 '25 1': 31930.87472312734,
 '25 10': 11277857.476872137,
 '35 0.01': 2507060.169039422,
 '35 0.1': 388.79706998297445,
 '35 1': 25997.154195725885,
 '35 10': 126190350.62860964,
 '50 0.01': 52204.30141142701,
 '50 0.1': 256.1429635181774,
 '50 1': 22140.15496762211,
 '50 10': 11281825.607481595}

In [27]:
#Optimal Parameters after training
W_opt['6 0.01'].shape

(6, 1)

In [20]:
#Validation dataset error estimation.
Gb_Hp = {'Dim': [6,9,25,35,50], 'sigma' : [0.01,0.1,1,10]}
val_reg = Do_Regression(x_val500,y_val500,Gb_Hp,Pb_Hp,'Gaussian')
erms_val_data = val_reg.test_set_error(W_opt)
# erms_val_df = pd.DataFrame(list(erms_val_data.items()),columns = ['Degree and Sigma','Erms'])
erms_val_data

{'6 0.01': 2150559.3759162137,
 '6 0.1': 48372.71645765625,
 '6 1': 43890.581322829545,
 '6 10': 58153.547018613586,
 '9 0.01': 363543185.40609735,
 '9 0.1': 58187.318603779364,
 '9 1': 43061.33583203358,
 '9 10': 12545422256157.57,
 '25 0.01': 3051216504.327322,
 '25 0.1': 1817241.6859147162,
 '25 1': 54833.57768594001,
 '25 10': 4.261456604327142e+33,
 '35 0.01': 3760321791.278151,
 '35 0.1': 6818336.303514117,
 '35 1': 66733.39404213506,
 '35 10': 3.959192142244023e+35,
 '50 0.01': 3515235806.9307137,
 '50 0.1': 308448688.22739655,
 '50 1': 64646.70270686437,
 '50 10': 5.240631818694538e+47}

In [11]:
# #Predictions on the training dataset using in-built functions.
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# from sklearn.base import BaseEstimator, TransformerMixin





# #Polynomial basis Regression
# poly_model = make_pipeline(PolynomialFeatures(6),LinearRegression())
# pred=poly_model.fit(x_tr50,y_tr50).predict(x_tr50)
# erms = erms_model(pred,y_tr50)





In [12]:
%%latex
As can be seen from the above plots, varying the values of sigma greatly varies the points. If many points are
reduced then this might lead to an overfitting even in the case of 3 dimensions(3 parameters) problem as 
the number of parameters available will match the number of points.

<IPython.core.display.Latex object>