In [1]:
import numpy as np
import matplotlib 
matplotlib.use('nbagg')
import matplotlib.pyplot as plt
import random
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [2]:
# Read the tab-delimited training and testing files from the working directory
train_data = pd.read_csv("training_data.txt", delimiter="\t")
test_data = pd.read_csv("testing_data.txt", delimiter="\t")

In [3]:
# Augment both datasets, in place, with the same non-linear features built from x1 and x2.
# Columns are appended in a fixed order (x1_sq, x2_sq, x1_x2, mod_diff, mod_sum) because
# the regression helper selects features by column position.
for frame in (train_data, test_data):
    feat1, feat2 = frame['x1'], frame['x2']
    frame['x1_sq'] = feat1 ** 2
    frame['x2_sq'] = feat2 ** 2
    frame['x1_x2'] = feat1 * feat2
    frame['mod_diff'] = np.absolute(feat1 - feat2)
    frame['mod_sum'] = np.absolute(feat1 + feat2)

In [4]:
# Boolean masks flagging the response column 'y' among each dataset's columns
target_mask_train = train_data.columns == 'y'
target_mask_test = test_data.columns == 'y'

# .loc slices by index label and includes both endpoints: labels 0..24 form the
# training split, labels 25 onward form the validation split
X_train, y_train = train_data.loc[0:24, ~target_mask_train], train_data.loc[0:24, target_mask_train]
X_val, y_val = train_data.loc[25:, ~target_mask_train], train_data.loc[25:, target_mask_train]

# The testing file is used in full as the test split
X_test, y_test = test_data.loc[:, ~target_mask_test], test_data.loc[:, target_mask_test]

In [16]:
# Prepend an intercept column of ones to the train, validation and test design matrices.
# The membership guard makes this cell safe to re-run: a frame that already has the
# "ones" column is left untouched, so the column can never be duplicated (a duplicate
# would shift every feature one position to the right and make X^T X singular).
for design_matrix in (X_train, X_val, X_test):
    if "ones" not in design_matrix.columns:
        # A scalar is broadcast down the column, giving a float64 column of 1.0
        design_matrix.insert(0, "ones", 1.0)

In [46]:
# Helper that fits a least-squares model on one dataset and scores its sign errors on another
def pred_error_gen_design_matrix(X1,y1,X2,y2,k):
    """
    Fit a linear least-squares model on (X1, y1) using columns 0..k of the design
    matrix, then return the fraction of rows in (X2, y2) whose prediction has the
    opposite sign to the response.

    Inputs-
    X1: design matrix used for fitting (DataFrame or array, N_fit x (d+1)); it is
        expected to already contain the ones column as column 0
    y1: response values for the rows of X1 (DataFrame or array, N_fit x 1 or N_fit)
    X2: design matrix on which the error is computed (DataFrame or array,
        N_eval x (d+1)), with the same column layout as X1
    y2: response values for the rows of X2 (DataFrame or array, N_eval x 1 or N_eval)
    k: number of features to use; columns 0..k (the ones column plus the first k
       features) are selected from both design matrices

    Output-
    Fraction of rows of X2 for which prediction * response < 0. Rows whose product
    is exactly zero are not counted as errors.
    """
    # np.asarray accepts DataFrames and plain arrays alike; slice the columns once
    fit_features = np.asarray(X1)[:, 0:k+1]
    fit_response = np.asarray(y1).reshape(-1, 1)
    eval_features = np.asarray(X2)[:, 0:k+1]
    eval_response = np.asarray(y2).reshape(-1)

    # Solve the least-squares problem directly instead of inverting X^T X: this gives
    # the same weights for a full-rank design, is numerically more stable, and returns
    # the minimum-norm solution instead of raising when the design is rank deficient.
    weights, *_ = np.linalg.lstsq(fit_features, fit_response, rcond=None)

    # A point is misclassified when prediction and response have opposite signs
    y_pred = (eval_features @ weights).reshape(-1)
    misclassified = (y_pred * eval_response) < 0
    pred_error = np.sum(misclassified) / eval_features.shape[0]

    return pred_error

In [50]:
# Candidate feature counts to evaluate
k = np.arange(3,8)

# One error per candidate k for every (fit set -> scoring set) pairing
train_val_error_list = [pred_error_gen_design_matrix(X_train, y_train, X_val, y_val, n_features) for n_features in k]
train_test_error_list = [pred_error_gen_design_matrix(X_train, y_train, X_test, y_test, n_features) for n_features in k]
val_train_error_list = [pred_error_gen_design_matrix(X_val, y_val, X_train, y_train, n_features) for n_features in k]
val_test_error_list = [pred_error_gen_design_matrix(X_val, y_val, X_test, y_test, n_features) for n_features in k]

# Report, for each pairing, the k with the lowest error (the first such k on ties)
for message, errors in (
    ("Minimum validation error after learning the model from the training set is found for k = ", train_val_error_list),
    ("Minimum test error after learning the model from the training set is found for k = ", train_test_error_list),
    ("Minimum error on the training set after learning the model from the validation set is found for k = ", val_train_error_list),
    ("Minimum test error after learning the model from the validation set is found for k = ", val_test_error_list),
):
    print(message, k[errors.index(min(errors))])

Minimum validation error after learning the model from the training set is found for k =  6
Minimum test error after learning the model from the training set is found for k =  7
Minimum error on the training set after learning the model from the validation set is found for k =  6
Minimum test error after learning the model from the validation set is found for k =  6


In [56]:
# Errors on the validation split for k = 3..7, with the model fitted on the training split
train_val_error_list

[0.3, 0.5, 0.2, 0.0, 0.1]

In [57]:
# Errors on the test set for k = 3..7, with the model fitted on the training split
train_test_error_list

[0.42, 0.416, 0.184, 0.084, 0.072]

In [58]:
# Errors on the training split for k = 3..7, with the model fitted on the validation split
val_train_error_list

[0.28, 0.36, 0.2, 0.08, 0.12]

In [59]:
# Errors on the test set for k = 3..7, with the model fitted on the validation split
val_test_error_list

[0.396, 0.388, 0.284, 0.192, 0.196]

From the above error vectors we can draw the following conclusions:
<br>
- <font size="3">When we train the model on the training set and compute the error on the validation set, which has only 10 points, we cannot get a reliable estimate of Eout. This is reflected in the fact that, although the validation error for k = 6 features is 0, the error on the test set for the same k is 0.084. More generally, the errors on the test set do not align with the ones observed on the validation set; this is expected, since an estimate based on so few points has high variance.</font>  
<br>  
- <font size="3">When we reverse the roles of the training and validation sets, we observe that the Eout values for the more complex feature sets are higher than in the usual training scenario, but the validation window (which in this case is the original training set) tracks Eout fairly closely. This is because the number of validation examples has increased (25 instead of 10), which considerably reduces the variance of the Eout estimate; however, since the number of training examples has correspondingly decreased, the model being evaluated is itself worse, so we are accurately estimating a poorer quantity.</font>