# Synthetic Data Generation

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Synthetic Data Generation
## This part of the starter code generates a synthetic regression dataset that
## will be used for comparing Ridge and Lasso regression. The output is
## governed by a polynomial y = w_1*x + w_2*x^2 + ... + w_7*x^7 (there is no
## intercept term: the w vector defined below holds the seven coefficients
## w_1..w_7, one per feature column X_1..X_7). x is generated randomly from a
## uniform distribution. The dependent variable of the training dataset is
## corrupted by adding noise sampled from a normal distribution; the test
## dataset is left noise-free. You can play around with this code to understand
## the dataset, but this is not the lab's focus.
## The code could also be implemented using the NumPy Polynomial library.

# seed NumPy's global random generator so every run draws the same samples
np.random.seed(2)

# number of samples in the training and the test set
train_size = 25
test_size = 50

## names of the independent variables (X_k holds x**k) and of the dependent
## variable, kept as lists so they can be used to index the data frames later
i_v = [f"X_{power}" for power in range(1, 8)]
d_v = ["Y"]

# polynomial coefficients, one per column of i_v: y = 1*x + 2*x**3
w_opt = np.array([1, 0, 2, 0, 0, 0, 0])

# --- training set -----------------------------------------------------------
# sorted samples drawn uniformly from [-5, 5)
x = np.sort(np.random.uniform(low=-5, high=5, size=train_size))

# one column per power of x, in the same order as i_v
train_data = pd.DataFrame(
    {column: x**power for power, column in enumerate(i_v, start=1)}
)

# Gaussian noise (mean 0, standard deviation 30) added to the training targets only
epislon = np.random.normal(loc=0.0, scale=30, size=train_size)

# noisy polynomial output; this serves as the training ground truth
train_data["Y"] = train_data[i_v].dot(w_opt) + epislon

# --- test set ---------------------------------------------------------------
# sorted samples drawn uniformly from the narrower interval [-4, 4)
x = np.sort(np.random.uniform(low=-4, high=4, size=test_size))

# same feature layout as the training frame
test_data = pd.DataFrame(
    {column: x**power for power, column in enumerate(i_v, start=1)}
)

# noise-free polynomial output; this serves as the test ground truth
test_data["Y"] = test_data[i_v].dot(w_opt)

# plotting the training and test data on a single figure:
# X_1 (i.e. x itself) on the horizontal axis, Y on the vertical axis
plt.figure(1)
for _frame, _colour, _tag in (
    (train_data, "red", 'train data'),
    (test_data, "blue", 'test data'),
):
    plt.scatter(_frame[["X_1"]], _frame[["Y"]], c=_colour, label=_tag)
plt.legend()
plt.xlabel('X')
plt.ylabel('Y')
# kept as the final statement so the notebook cell still displays the title object
plt.title('Train and Test Data')

# Ridge Regression with Cross Validation (1 point)

In [None]:
## We will use the generated data to try Ridge regression and determine the optimal
## alpha value. Read about the RidgeCV class from
## https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
## and describe the following Parameters and attributes of the class
## 
## alphas:
##
## cv:
##
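## store_cv_values (named store_cv_results in scikit-learn 1.5 and later):
##
## cv_values_ (named cv_results_ in scikit-learn 1.5 and later)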
##
## coef_
##
## alpha_

from sklearn.linear_model import RidgeCV

# Define a range of alpha from 1e-2 to 100 increasing in multiples of 10.
# <add your code here>

# Define the number of cross validation folds
# <add your code here>

## Create an instance of the regression model using the parameters defined above
# <add your code here>

## Fit the model to the training set. Remember that we are performing
## multi-variate regression. The number of independent variables is > 1
# <add your code here>

## Obtain the value of the estimated regularization parameter alpha
# <add your code here>

## Display the coefficients of the regressor (weight vector)
# <add your code here>

## Compute the predictions for the training data and save it in a variable
## named y_pred_tr_ridge
# <add your code here>

## Compute the predictions for the test data and save it in a variable
## named y_pred_ridge
# <add your code here>

# Lasso Regression with Cross Validation (1 point)

In [None]:
## We will use the generated data to try Lasso regression and determine the optimal
## alpha value. Read about the LassoCV class from
## https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
## and describe the following Parameters and attributes of the class
## 
## alphas:
##
## cv:
##
## coef_
##
## alpha_

from sklearn.linear_model import LassoCV

# Define a range of alpha from 1e-2 to 100 increasing in multiples of 10.
# <add your code here>

# Define the number of cross validation folds
# <add your code here>

## Create an instance of the regression model using the parameters defined above
# <add your code here>

## Fit the model to the training set. Remember that we are performing
## multi-variate regression. The number of independent variables is > 1
# <add your code here>

## Obtain the value of the estimated regularization parameter alpha
# <add your code here>

## Display the coefficients of the regressor (weight vector)
# <add your code here>

## Compute the predictions for the training data and save it in a variable
## named y_pred_tr_lasso
# <add your code here>

## Compute the predictions for the test data and save it in a variable
## named y_pred_lasso
# <add your code here>

# Visualize the outputs (0.5 point)

In [None]:
## Let us visualize the regressors. We will create a single image containing
## two plots - one each for train and test set. Label the plots appropriately.
# <add your code here>

# Comment on the two models (0.5 point)


*  Is there any qualitative difference between the predictions from the plots?
*  Do you notice any difference in the optimal alpha values?
*  Do the weight vectors (coefficients) reveal anything?






# Cross Validation for Hyper-Parameter Tuning

# Loading a dataset that is part of the scikit-learn package (0.5 point)

In [None]:
## We will use the diabetes dataset included as part of the sklearn package for this
## part of the lab. Write the necessary code to load the dataset into the workspace
## as a tuple consisting of the independent and dependent variable values
##
## Add a text block to describe the dataset
## - Number of data points
## - Number and list of attributes
## - Identify categorical (discrete) and continuous attributes
from sklearn import datasets
# return_X_y=True makes load_diabetes hand back a tuple, unpacked here into
# X (independent variable values) and y (dependent variable values)
X, y = datasets.load_diabetes(return_X_y=True)

# Grid Search CV (1.5 points)

In [None]:
## We will use the grid search strategy along with cross validation to
## select the optimal hyper-parameter values for Ridge regression and LASSO.
## We will also measure the time spent for performing this search.

# import the time package to call functions related to time measurement
import time
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
## Read the documentation for SKlearn model selection class GridSearchCV
## https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
## Describe the following parameters and attributes of the class in your words.
## estimator:
##
## param_grid:
##
## refit:
##
## cv:
##
## scoring:
##
## cv_results_:
##
## best_estimator_:
##
## best_params_:
##
## 

# list of models we want to test, as (display name, estimator instance) pairs
models = [
    ('ridge', Ridge()),
    ('lasso', Lasso()),
]

## define a dictionary with hyper-parameter names as keys and a list of 
## permissible values for the hyper-parameter
# <add your code here>

# loop through the two models
for name, model in models:

    print(name)  # print the model considered for grid search

    # create an instance of the GridSearchCV class for performing a 5-fold CV to estimate the optimal value of alpha
    # scoring function should be negative mean squared error
    # use the default number of cross validation folds
    # set verbose to a level that will enable checking the intermediate outputs
    # <add your code here>

    # note the time at the start of the search.
    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # elapsed intervals; time.time() is wall-clock time and can jump if the
    # system clock is adjusted while the search is running.
    start_time = time.perf_counter()

    # fitting the model for grid search
    # <add your code here>

    # print best parameter after tuning
    # <add your code here>
    print("Time taken for hyper-parameter tuning is: %s seconds" % (time.perf_counter() - start_time))
    print()

ridge
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ...................alpha=1e-05;, score=-2779.958 total time=   0.0s
[CV 2/5] END ...................alpha=1e-05;, score=-3028.874 total time=   0.0s
[CV 3/5] END ...................alpha=1e-05;, score=-3237.588 total time=   0.0s
[CV 4/5] END ...................alpha=1e-05;, score=-3008.659 total time=   0.0s
[CV 5/5] END ...................alpha=1e-05;, score=-2910.266 total time=   0.0s
[CV 1/5] END ..................alpha=0.0001;, score=-2780.281 total time=   0.0s
[CV 2/5] END ..................alpha=0.0001;, score=-3029.146 total time=   0.0s
[CV 3/5] END ..................alpha=0.0001;, score=-3236.588 total time=   0.0s
[CV 4/5] END ..................alpha=0.0001;, score=-3008.383 total time=   0.0s
[CV 5/5] END ..................alpha=0.0001;, score=-2910.791 total time=   0.0s
[CV 1/5] END ...................alpha=0.001;, score=-2783.445 total time=   0.0s
[CV 2/5] END ...................alpha=0.001