## **Import Libraries**

In [None]:
#Math libs
from math import sqrt
from scipy import stats
import os
# Data Science libs
import numpy as np
import pandas as pd
# Graphics libs
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Timers
!pip install pytictoc
from pytictoc import TicToc

## **Import Data**

In [None]:
# Load the credit dataset from a hard-coded absolute path into a DataFrame.
# NOTE(review): '/content/...' looks like a Colab upload location — confirm the file is there before running.
df = pd.read_csv('/content/Credit_N400_p9.csv')

In [None]:
# Sanity-check the import by displaying the first three rows
df.head(3)

## **Data Preprocessing**

In [None]:
# Encode the categorical string values as 0/1 indicators
# (Male/No -> 0, Female/Yes -> 1); the mapping is applied to every column of df.
df = df.replace({'Male': 0, 'Female':1, 'No': 0, 'Yes': 1})
df.head(3)

In [None]:
# Separate the predictors from the response (the last column of df).
# The frame is converted once, with an explicit float dtype, so X and Y are
# numeric arrays even if the 0/1 replacement left object-typed columns behind.
# A column that still holds non-numeric values raises here instead of failing
# later inside the model code.
data_matrix = df.to_numpy(dtype=float)
X = data_matrix[:, :-1]
Y = data_matrix[:, -1]
print('Convert dataframe to numpy array:', X.shape, Y.shape)

## **Set Global Variables**

In [None]:
# Global settings shared by the cells below
# Nine candidate values for the tuning parameter λ
λ  = [1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6]

# Six candidate values for the mixing parameter α
# (later cells treat α = 0 as lasso and α = 1 as ridge)
α =  [0, 0.2, 0.4, 0.6, 0.8, 1]

# Number of cross-validation folds
k = 5

# Number of coordinate-descent sweeps run for each fit
n_iters = 1000 # iterations

# log10 of every λ value
λ_log = np.log10(λ) 

# Print progress messages when True
verbose = True

# Predictor matrix handed to the shuffling step
X_p = X

# Response vector handed to the shuffling step
Y_p  = Y

## **Instantiate Data**

In [None]:
# Shuffle the rows of the predictors and the response together
def randomize_data(X_p, Y_p):
  """Return row-shuffled copies of ``X_p`` and ``Y_p`` with each pair kept aligned.

  The response is appended to the predictors as one extra column so that a
  single in-place shuffle of the combined matrix permutes both the same way.
  The combined matrix is a new array, so the inputs are not modified.

  Returns a tuple ``(predictors, response)``: every column but the last, and
  the last column, of the shuffled matrix.
  """
  response_column = Y_p[:, None]
  stacked = np.concatenate([X_p, response_column], axis=1)
  np.random.shuffle(stacked)
  predictors = stacked[:, :-1]
  response = stacked[:, -1]
  return predictors, response

In [None]:
# Shuffle the rows once; x and y are the arrays the cross-validation cells below work on
x, y = randomize_data(X_p, Y_p)

In [None]:
# X1 = number of samples (rows), X2 = number of features (columns) of the shuffled predictors
X1 = x.shape[0]
X2 = x.shape[1]

In [None]:
# Coefficient storage: one length-X2 coefficient vector per (fold, λ, α) combination
𝛽 = np.zeros([k, len(λ), len(α), X2])

In [None]:
# Cross-validation score storage: one score per (fold, λ, α) combination
CV = np.zeros([k, len(λ), len(α)])

In [None]:
# Size of each validation fold and the row index at which each of the k folds starts
test_x = X1 // k 
# Exactly k start indices; when X1 is not a multiple of k the leftover rows at
# the end are never held out (they stay in every training split).
test_i = [i_fold * test_x for i_fold in range(k)]

if verbose:
    print('Implementing {} training by {} test validation samples for each of the {} CV folds.'.format(
        X1 - test_x, test_x, k)
    )

## **Implement Functions**

In [None]:
# Standardize X
def standardize(x, mean_x, std_x):
  """Center ``x`` by ``mean_x`` and scale it by ``std_x``.

  Parameters
  ----------
  x : array of values to transform.
  mean_x : value(s) subtracted from ``x`` (broadcast against it).
  std_x : value(s) the centered data is divided by (broadcast against it).

  Returns
  -------
  ``(x - mean_x) / std_x``. Where ``std_x`` is exactly zero the divisor is
  replaced by 1, so those entries are centered only and no nan/inf appears.
  """
  std_x = np.asarray(std_x, dtype=float)
  # A zero spread means the feature is constant; dividing would give nan or inf.
  safe_std = np.where(std_x == 0, 1.0, std_x)
  return (x - mean_x) / safe_std

In [None]:
# Center the response variable
def centerResponses(y, mean):
  """Return ``y`` shifted down by ``mean`` (that is, ``y - mean``)."""
  centered = y - mean
  return centered

In [None]:
# Predict responses for x
def predict(x):
  """Predict responses for the predictors ``x``.

  Reads the module-level ``mean_x``, ``std_x`` and ``𝛽x`` at call time:
  ``x`` is standardized with ``mean_x``/``std_x`` and the result is
  matrix-multiplied by ``𝛽x``.
  """
  x_scaled = standardize(x, mean_x, std_x)
  y_pred = np.matmul(x_scaled, 𝛽x)
  return y_pred

### **Coordinate Descent Algorithm**

In [None]:
# One full sweep of coordinate descent for the elastic net
def CoordinateDescent(x, y, 𝛽x, sum_sq, lamb, alpha):
  """Update every coefficient once, in column order, and return the coefficients.

  Parameters
  ----------
  x : 2-D array of predictors, one column per coefficient.
  y : response as a column, i.e. shape (n, 1) as the callers in this notebook pass it.
  𝛽x : 2-D coefficient array with a single column; it is updated in place.
  sum_sq : indexable with one entry per column of ``x``; ``sum_sq[j]`` is
      added to ``lamb * alpha`` in the denominator of update ``j`` (expected
      to be the sum of squares of column ``j``).
  lamb : penalty strength.
  alpha : mixing weight. ``lamb * (1 - alpha) / 2`` is the soft threshold and
      ``lamb * alpha`` the extra denominator term, so ``alpha = 0`` only
      thresholds and ``alpha = 1`` only shrinks.

  Returns
  -------
  The same ``𝛽x`` object that was passed in, after the sweep.
  """
  # Use the width of x itself rather than a notebook-level feature count.
  for j in range(x.shape[1]):
    column = x[:, j]

    # Residual with coefficient j's own contribution added back in
    partial_residual = y - np.matmul(x, 𝛽x) + (column * 𝛽x[j])[:, None]

    # Inner product of column j with that partial residual
    a_j = np.matmul(column.T, partial_residual)[0]

    # Soft-threshold, then scale
    shrunk = np.absolute(a_j) - lamb * (1 - alpha) / 2
    shrunk = shrunk if shrunk >= 0 else 0
    𝛽x[j, 0] = np.sign(a_j) * shrunk / (sum_sq[j] + lamb * alpha)

  return 𝛽x

### **Elastic Net Cross Validation Algorithm**

In [None]:
def elasticNetCV(X, Y, λ , α, k, n_iters, verbose = True):
  """Grid-search elastic-net fits over ``λ`` x ``α`` with k-fold cross-validation.

  For every (lambda, alpha) pair and every fold, the training rows are
  standardized / centered with their own statistics, a coefficient vector is
  fitted with ``n_iters`` sweeps of ``CoordinateDescent`` from a random start,
  and the mean squared error on the held-out rows is recorded.

  Parameters
  ----------
  X : 2-D array of predictors (rows are samples).
  Y : 1-D array of responses, one per row of ``X``.
  λ : sequence of penalty strengths to try.
  α : sequence of mixing weights to try.
  k : number of folds. Folds are consecutive blocks of ``len(X) // k`` rows;
      leftover rows at the end are never held out.
  n_iters : number of coordinate-descent sweeps per fit.
  verbose : when true, print the fold-averaged score of each pair.

  Side effects
  ------------
  Writes the scores into the module-level array ``CV[fold, i_lambda, i_alpha]``
  and the coefficients into ``𝛽[fold, i_lambda, i_alpha]``.

  Returns
  -------
  The tuple ``(CV, 𝛽)`` — the same module-level arrays that were filled.
  """
  n_samples, n_features = X.shape
  fold_size = n_samples // k

  for i_lambda, lamb in enumerate(λ): # loop through λ lambda
    for i_alpha, alpha in enumerate(α): # loop through α
      for i_fold in range(k): # loop through folds
        # Split the data into the held-out block and the remaining training rows
        start = i_fold * fold_size
        stop = start + fold_size
        held_out = np.arange(start, stop)

        x_test = X[start:stop]
        y_test = Y[start:stop]
        x_train = np.delete(X, held_out, axis = 0)
        y_train = np.delete(Y, held_out, axis = 0)

        # Standardize x and center y using training-fold statistics only
        mean_x, std_x = np.mean(x_train, 0), np.std(x_train, 0)
        mean_res = np.mean(y_train)

        x_train = standardize(x_train, mean_x, std_x)
        x_test = standardize(x_test, mean_x, std_x)
        y_train = centerResponses(y_train, mean_res)[:, None]
        y_test = centerResponses(y_test, mean_res)[:, None]

        # Per-feature sum of squares: one denominator term per coefficient
        sum_sq = np.sum(x_train ** 2, 0)

        # Random starting coefficients for this fit
        𝛽x = np.random.uniform(low = -1, high = 1, size = (n_features, 1))

        for _ in range(n_iters):
          𝛽x = CoordinateDescent(x_train, y_train, 𝛽x, sum_sq, lamb, alpha)

        # Mean squared error on the held-out rows
        y_hat = np.matmul(x_test, 𝛽x)
        mse_score = np.mean((y_test - y_hat) ** 2)

        # Store the score and the coefficient vector for this combination
        CV[i_fold, i_lambda, i_alpha] = mse_score
        𝛽[i_fold, i_lambda, i_alpha] = 𝛽x[:, 0]

      # All folds done for this pair: report the fold-averaged score
      if verbose:
        print('lambda:{}; alpha:{}; CV MSE:{}'.format(
            lamb, alpha, np.mean(CV[:, i_lambda, i_alpha])))

  return CV, 𝛽
                

In [None]:
# Run the grid search on the shuffled arrays (x, y), honouring the global verbose flag
en = elasticNetCV(x, y, λ , α, k, n_iters, verbose = verbose)

In [None]:
# Cross-validate every (lambda, alpha) pair over the k folds, filling CV and 𝛽
for i_lambda, lamb in enumerate(λ): # loop through λ lambda
    for i_alpha, alpha in enumerate(α): # loop through α
        for i_fold, i_test in zip(range(k), test_i): # loop through folds

            # Split the data into the held-out block and the training rows
            fold_rows = np.arange(i_test, i_test + test_x)
            x_test = x[i_test:i_test + test_x]
            y_test = y[i_test:i_test + test_x]
            x_train = np.delete(x, fold_rows, axis = 0)
            y_train = np.delete(y, fold_rows, axis = 0)

            # Standardize x and center y using training-fold statistics only
            mean_x, std_x = np.mean(x_train, 0), np.std(x_train, 0)
            mean_res = np.mean(y_train)

            x_train = standardize(x_train, mean_x, std_x)
            x_test = standardize(x_test, mean_x, std_x)
            y_train = centerResponses(y_train, mean_res)[:, None]
            y_test = centerResponses(y_test, mean_res)[:, None]

            # Per-feature sum of squares: CoordinateDescent indexes it by coefficient
            sum_sq = np.sum(x_train ** 2, 0)

            # Random starting coefficients for this fit
            𝛽x = np.random.uniform(low = -1, high = 1, size = (X2, 1))

            # Run n_iters sweeps of coordinate descent
            for i_iter in range(n_iters):
                𝛽x = CoordinateDescent(x_train, y_train, 𝛽x, sum_sq, lamb, alpha)

            # Mean squared error on the held-out rows
            y_hat = np.matmul(x_test, 𝛽x)
            mse_score = np.mean((y_test - y_hat) ** 2)

            # Store the score and the coefficient vector for this combination
            CV[i_fold, i_lambda, i_alpha] = mse_score
            𝛽[i_fold, i_lambda, i_alpha] = 𝛽x[:, 0]

        # All folds done for this pair: report the fold-averaged score
        if verbose:
            print('lambda:{}; alpha:{}; CV MSE:{}'.format(
                lamb, alpha, np.mean(CV[:, i_lambda, i_alpha])))


############# Retrain on entire dataset with optimal lambda and alpha #############
# Find the (lambda, alpha) pair with the lowest fold-averaged score
cv_mean = np.mean(CV, 0)
best_λ_ind, best_alpha_ind = np.where(cv_mean == np.amin(cv_mean))

best_λ = λ[best_λ_ind[0]]
best_alpha = α[best_alpha_ind[0]]


# Standardize the features and center the responses of the full dataset.
# NOTE: this rebinds x and y to their standardized / centered versions (y also
# becomes a column), so re-run the shuffling cell before running this cell again.
mean_x, std_x = np.mean(x, 0), np.std(x, 0)
x = standardize(x, mean_x, std_x)
y = centerResponses(y, np.mean(y))[:, None]


# Per-feature sum of squares on the entire dataset
sum_sq = np.sum(x ** 2, 0)

# Random starting coefficients for the final fit
𝛽x = np.random.uniform(low = -1, high = 1, size = (X2, 1))

# Run n_iters sweeps with the selected lambda and alpha
for i_iter in range(n_iters):
    𝛽x = CoordinateDescent(x, y, 𝛽x, sum_sq, best_λ, best_alpha)
        


## **Deliverable 1**

In [None]:
# Observe the coefficient values as a function of lambda for each alpha:
# plot the coefficient vectors averaged over the folds, one figure per alpha.

sns.set_theme()
sns.set_style("darkgrid", {"grid.color": ".5", "grid.linestyle": ":" })
𝛽μ  = np.mean(𝛽,0)  # average over the fold axis -> indexed [i_lambda, i_alpha, i_feature]
ŷ = df.columns      # column names, used as legend labels for the coefficients
count = 0
t = TicToc()  # create instance of class
t.tic()       # start the timer so the toc() calls below have a reference point

for i_alpha, alpha in enumerate(α):
    count += 1 
    t.toc()
    # One figure per alpha (a second bare plt.figure() would only add an empty figure)
    plt.figure(figsize=(16, 10), dpi=70)
    print('Tuning parameter converged at = #{c} λ {} at alpha{α}\n'.format(np.log10(λ), c=count,  α=alpha)) 
    # Iterate over the feature axis (the last one), not the alpha axis,
    # so every coefficient gets its own curve.
    for i_beta in range(𝛽μ.shape[-1]):
        plt.plot( np.log10(λ), 𝛽μ[:, i_alpha, i_beta], label = ŷ[i_beta])
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 'upper right', title = 'Features')
    plt.xlabel('λ Tuning Params')
    plt.ylabel('Coefficient Values')
    plt.title('Alpha Value: {}'.format(alpha))
    plt.show()
    

In [None]:
# Dump the raw score array, indexed [fold, i_lambda, i_alpha]
print(CV)

### **Output for Deliverable 2**

In [None]:
# Observe the CV score over the values of lambda, one curve per alpha,
# with error bars of one standard error across the k folds.
plt.figure(figsize=(16, 10), dpi=70)
for i_alpha, alpha in enumerate(α):
    fold_scores = CV[..., i_alpha]  # indexed [fold, i_lambda] for this alpha
    std_error = np.std(fold_scores, 0) / np.sqrt(k)
    plt.errorbar( np.log10(λ), np.mean(fold_scores, 0), yerr = std_error,xuplims=True,label = str(alpha))
# Axis labels and legend apply to the whole figure, so set them once
plt.xlabel('Log base lambda')
plt.ylabel('Cross Validation MSE')
plt.legend(title = 'α')
plt.show()

### **Output for Deliverable 3**

In [None]:
# Report the lambda and alpha selected as having the lowest fold-averaged CV score
print('Best lambda: {}; Best alpha: {}'.format(best_λ, best_alpha))

### **Output for Deliverable 4**

In [None]:
# Coefficient vector for the optimal lambda given alpha = 0 (lasso),
# compared against the final elastic-net fit.
# NOTE(review): the original remark was that the coefficients look relatively
# similar to elastic net — re-check once the cells above have been re-run.

# get cv mse given alpha = 0
cv_mse_alpha_0 = np.mean(CV[..., 0], 0)

# find index of lambda with lowest cv mse
lambda_ind = np.argmin(cv_mse_alpha_0)
lambda_optimal = λ[lambda_ind]

# Mean coefficient vector over the folds, taken from the 4-D per-fold store 𝛽
# (𝛽x only holds the single final fit and has no fold/lambda/alpha axes)
B_mean = np.mean(𝛽[:, lambda_ind, 0, :], 0)

# Final coefficients fitted with the best lambda and alpha, flattened to 1-D
B = 𝛽x[:, 0]

# plot against B with optimal lambda and alpha
plt.scatter(B, B_mean)
plt.plot(np.arange(-300, 475), np.arange(-300, 475), '--', color = 'r')
plt.xlabel('Elastic Net (lambda = {}, alpha = {})'.format(best_λ, best_alpha))
plt.ylabel('Lasso (lambda = {})'.format(lambda_optimal))
plt.show()

### **Output for Deliverable 6**

In [None]:
# Coefficient vector for the optimal lambda given alpha = 1 (ridge),
# compared against the final elastic-net fit.
# NOTE(review): the original remark was that the coefficients look similar to
# elastic net and lasso because the best alpha was in the middle — re-check
# once the cells above have been re-run.

# get cv mse given alpha = 1
cv_mse_alpha_1 = np.mean(CV[..., -1], 0)

# find index of lambda with lowest cv mse
lambda_ind = np.argmin(cv_mse_alpha_1)
lambda_optimal = λ[lambda_ind]

# Mean coefficient vector over the folds, taken from the 4-D per-fold store 𝛽
# (𝛽x only holds the single final fit and has no fold/lambda/alpha axes)
B_mean = np.mean(𝛽[:, lambda_ind, -1, :], 0)

# Final coefficients fitted with the best lambda and alpha, flattened to 1-D
B = 𝛽x[:, 0]

# plot against B with optimal lambda and alpha
plt.scatter(B, B_mean)
plt.plot(np.arange(-300, 475), np.arange(-300, 475), '--', color = 'r')
plt.xlabel('Elastic Net (lambda = {}, alpha = {})'.format(best_λ, best_alpha))
plt.ylabel('Ridge (lambda = {})'.format(lambda_optimal))
plt.show()

In [None]:
# Predict responses and compare against actual responses.
# x was already standardized (and y centered) when the final model was fitted,
# so the coefficients are applied directly; calling predict(x) here would
# standardize x a second time with the raw-data mean and std.
y_hat = np.matmul(x, 𝛽x)
plt.scatter(y_hat, y)
plt.xlabel('Predicted Response')
plt.ylabel('Actual Response')
plt.show()