In [1]:
from sklearn.metrics.pairwise import pairwise_distances
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold

from model import *

In [2]:
# Read each of the multi var datasets
sets = []
for i in range(1,5):
    if i <= 2:
        data = pd.read_csv(f"http://www.cs.iit.edu/~agam/cs584/share/mvar-set{i}.dat",sep=' ',skiprows=5, names=['x1','x2','y']).reset_index(drop=True)
    else:
        data = data = pd.read_csv(f"http://www.cs.iit.edu/~agam/cs584/share/mvar-set{i}.dat",sep=' ',skiprows=5, names=['x1','x2','x3','x4','x5','y']).reset_index(drop=True)
    sets.append(data)

In [3]:
# Map features to higher dimensional space using combinations of features and different degrees
# Perform linear regression on the mapping and KFold cross validation to choose that right mapping for each dataset
kf = KFold(n_splits=5)
degrees = [2,3,4,5]
best_models = []
print("Performing 5-fold cross validation across 4 different polynomial mappings:")
for i, data in enumerate(sets):
    # Take all columns except the last one
    X = data.iloc[:,:-1].values
    y = data.y.values
    average_test_errors = []
    average_train_errors = []
    for degree in degrees:
        train_errors = []
        test_errors = []
        # Start KFold for current degree of polynomial
        for train_index, test_index in kf.split(X):
            # Split to train and test for this fold
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            # Create polynomial features
            poly_reg = PolynomialFeatures(degree=degree)
            X_train_poly = poly_reg.fit_transform(X_train)
            X_test_poly = poly_reg.fit_transform(X_test)
            
            # Train LR
            lr = LinearRegressionExplicit()
            lr.fit(X_train_poly,y_train)
            
            # Predict and error
            train_pred = lr.predict(X_train_poly)
            test_pred = lr.predict(X_test_poly)
            
            # Save the MSE of this fold
            train_errors.append(regression_error(train_pred,y_train))
            test_errors.append(regression_error(test_pred,y_test))
        
        # Average the MSE across 5 fold for each mapping
        average_test_errors.append(np.average(test_errors))
        average_train_errors.append(np.average(train_errors))
        
    print(f"Dataset {i+1}")
    for i, (average_train_error, average_test_error) in enumerate(zip(average_train_errors,average_test_errors)):
        print(f"\tDegree {degrees[i]}:")
        print(f"\t\tAverage training MSE: {average_train_error}")
        print(f"\t\tAverage testing MSE: {average_test_error}")
    
    print(f"\tPolynomial of degree {degrees[np.argmin(average_test_errors)]} gives smallest test error\n")

Performing 5-fold cross validation across 4 different polynomial mappings:
Dataset 1
	Degree 2:
		Average training MSE: 0.25808761488858145
		Average testing MSE: 0.2597781514068387
	Degree 3:
		Average training MSE: 0.25729910888102253
		Average testing MSE: 0.26088043683043083
	Degree 4:
		Average training MSE: 0.25606766217360927
		Average testing MSE: 0.2603542361948251
	Degree 5:
		Average training MSE: 0.25563148328271373
		Average testing MSE: 0.2617786043747527
	Polynomial of degree 2 gives smallest test error

Dataset 2
	Degree 2:
		Average training MSE: 0.019900958935770385
		Average testing MSE: 0.019986308503465566
	Degree 3:
		Average training MSE: 0.01029385848336197
		Average testing MSE: 0.010392009061652323
	Degree 4:
		Average training MSE: 0.010282023257025765
		Average testing MSE: 0.010409804351559427
	Degree 5:
		Average training MSE: 0.00466532923572765
		Average testing MSE: 0.004788226767267469
	Polynomial of degree 5 gives smallest test error

Dataset 3
	Degre

In [4]:
# Choosing mapping of degree 2, solve the regression problem using explicit solution vs iterative

for i, data in enumerate(sets):
    # Take all columns except the last one
    X = data.iloc[:,:-1].values
    y = data.y.values
    train_errors_explicit = []
    test_errors_explicit = []
    train_errors_iterative = []
    test_errors_iterative = []
    
    # Start KFold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Create polynomial features
        poly_reg = PolynomialFeatures(degree=2)
        X_train_poly = poly_reg.fit_transform(X_train)
        X_test_poly = poly_reg.fit_transform(X_test)

        # LR explicit
        # Train
        lr_explicit = LinearRegressionExplicit()
        lr_explicit.fit(X_train_poly,y_train)
        
        # Predict
        train_pred = lr_explicit.predict(X_train_poly)
        test_pred = lr_explicit.predict(X_test_poly)
        
        # MSE of fold
        train_errors_explicit.append(regression_error(train_pred,y_train))
        train_errors_explicit.append(regression_error(test_pred,y_test))
        
        # LR iterative
        # Train
        if i <= 1:
            learning_rate = 1e-4
        else:
            learning_rate = 1e-6
        lr_iterative = LinearRegressionIterative()
        lr_iterative.fit(X_train_poly,y_train,10,learning_rate=learning_rate,verbose=0)
        
        # Predict
        train_pred = lr_iterative.predict(X_train_poly)
        test_pred = lr_iterative.predict(X_test_poly)
        
        # MSE of fold
        train_errors_iterative.append(regression_error(train_pred,y_train))
        test_errors_iterative.append(regression_error(test_pred,y_test))
    
    print(f"Dataset {i+1}:")
    print(f"\tIterative:")
    print(f"\t\tAverage train MSE: {np.average(train_errors_iterative)}")
    print(f"\t\tAverage test MSE: {np.average(test_errors_iterative)}")
    print(f"\tExplicit:")
    print(f"\t\tAverage train MSE: {np.average(train_errors_explicit)}")
    print(f"\t\tAverage test MSE: {np.average(train_errors_explicit)}")

Dataset 1:
	Iterative:
		Average train MSE: 0.2903976393701412
		Average test MSE: 0.292034172422746
	Explicit:
		Average train MSE: 0.25893288314771007
		Average test MSE: 0.25893288314771007
Dataset 2:
	Iterative:
		Average train MSE: 0.01991017797793271
		Average test MSE: 0.019979081072201088
	Explicit:
		Average train MSE: 0.019943633719617975
		Average test MSE: 0.019943633719617975
Dataset 3:
	Iterative:
		Average train MSE: 0.9897733390127728
		Average test MSE: 0.9904899848411304
	Explicit:
		Average train MSE: 0.25078420598776296
		Average test MSE: 0.25078420598776296
Dataset 4:
	Iterative:
		Average train MSE: 0.004144630842162844
		Average test MSE: 0.004145749526527948
	Explicit:
		Average train MSE: 0.0038875023603957394
		Average test MSE: 0.0038875023603957394
