In [None]:
!curl http://www.datasciencecourse.org/assignments/hw3_linear.tar.gz --output hw3_linear.tar.gz
!tar -xzf hw3_linear.tar.gz
!mv hw3_linear/* ./
!pip install git+https://github.com/locuslab/mugrade.git

# Linear Regression

In this homework we are going to apply linear regression to the problem of predicting developer satisfaction based upon information about their carrers, from a StackOverflow survey.  The data from this question is based on the [2019 StackOverflow Survey](https://insights.stackoverflow.com/survey/2019); accordingly, the subset bundled with this assignment is also released under the Open Database License (ODbL) v1.0.  For this problem, you should not use Scikit-Learn, but instead implement all the least squares solutions manually.


In [1]:
import csv
import gzip
import math
import hashlib
import numpy as np
from pprint import pprint

from testing.testing import test

### Q1 Data Parsing

The data from this question is based on the [2019 StackOverflow Survey](https://insights.stackoverflow.com/survey/2019); accordingly, the subset bundled with this assignment is also released under the Open Database License (ODbL) v1.0.

The data was made by selecting some columns from the original dataset, only retaining rows from people who described themselves as "a developer by profession", and replaced long responses with shorter strings. Lets begin by examining the data.

In [2]:
def read_csv_test(read_csv):
    headers, rows = read_csv()
    test.equal(len(rows), 65679)
    test.equal(len(headers), 26)

    # Print a row:
    pprint(dict(zip(headers, rows[0])))
    
@test
def read_csv(fn="eggs.csv.gz"):
    """read the GZipped CSV data and split it into headers and newlines.
    
    kwargs:
        fn : str -- .csv.gz file to read
    
    returns: Tuple[headers, body] where
      headers : Tuple[str] -- the CSV headers
      body : List[Tuple[str,...]] -- the CSV body
    """
    with gzip.open(fn, 'rt', newline="", encoding='utf-8') as f:
        csvobj = csv.reader(f)
        headers = next(csvobj)
        return headers, [tuple(row) for row in csvobj]

{'Age': '22',
 'CareerSat': 'vs',
 'CodeRevHrs': 'NA',
 'ConvertedComp': '61000',
 'Country': 'United States',
 'Dependents': 'n',
 'DevEnvironVSC': 'y',
 'DevTypeFullStack': 'n',
 'EdLevel': 'bachelors',
 'EduOtherMOOC': 'y',
 'EduOtherSelf': 'y',
 'Extraversion': 'y',
 'GenderIsMan': 'y',
 'Hobbyist': 'n',
 'MgrIdiot': 'very',
 'MgrWant': 'n',
 'OpSys': 'win',
 'OpenSourcer': 'never',
 'OrgSize': '100-499',
 'Respondent': '4',
 'Student': 'n',
 'UndergradMajorIsComputerScience': 'y',
 'UnitTestsProcess': 'n',
 'WorkWeekHrs': '80',
 'YearsCode': '3',
 'YearsCodePro': '0'}
### TESTING read_csv: PASSED 2/2
###



| Column | Sample | Does/is the respondent... | Type/Values |
| --- |:--- |:--- |:--- |
| **CareerSat** | 'vs' | satisfied with their career? | (`vd`, `sd`, `ne`, `ss`, `vs`) -- corresponding to ({very, slightly}, {satisfied, dissatisfied}) and neutral |
| **MgrWant** | 'n' | ...want to be a manager? | boolean |
| Age    | '22'   | age | integer     |
| CodeRevHrs | '2' | hours a week spent reviewing code | integer |
| ConvertedComp | '61000' | yearly compensation in 2019 USD | integer |
| Country | 'United States' | lives in country | string _(ignore in regression)_ |
| Dependents | 'n' | ...have children or other dependents. | boolean |
| DevEnvironVSC | 'y' | ...use Visual Studio Code | boolean |
| DevTypeFullStack | 'n' | ...identify as a full-stack developer | boolean |
| EdLevel | 'bachelors' | maximum education level | (`other`, `bachelors`, `masters`, `doctoral`) |
| EduOtherMOOC | 'y' | ...ever taken a Massively Open Online Course | boolean |
| EduOtherSelf | 'y' | ...ever taught themselves a new platform | boolean |
| Extraversion | 'y' | ...prefer in-person meetings to online meetings | boolean |
| GenderIsMan | 'y' | ...male | boolean |
| Hobbyist | 'n' | ...write code as a hobby? | boolean |
| MgrIdiot | 'very' | ...think their manager knows what they are doing? | (`NA`, `not`, `some`, `very`), in order of increasing confidence |
| OpSys | 'win' | which OS do they use? | (`win`, `mac`, `tux`, `NA`), for (Windows, Mac OSX, Linux-like, NA) |
| OpenSourcer | 'Never' | ...contribute to open-source projects? | (`never`, `year`, `month-year`, `month`), in increasing order of frequency |
| OrgSize | '100-499' | number of employees in organization? | (`NA`, `1`, `2-9`, `10-19`, `20-99`, `100-499`, `500-999`, `1,000-4,999`, `5,000-9,999`, `10,000+`) |
| Respondent | '4' | respondent ID from original data | integer _(ignore in regression)_ |
| Student | 'n' | ...currently a student? | boolean |
| UndergradMajorIsComputerScience | 'y' | ...majored in CS? | boolean |
| UnitTestsProcess | 'n' | ...use unit tests in their job? | boolean |
| WorkWeekHrs | '80' | hours a week worked | integer |
| YearsCode | 3 | years since first programming | integer |
| YearsCodePro | 0 | years programming professionally | integer |

 - _boolean_ : `y`/`NA`/`n` assigned to `+1.0`/`0.0`/`0.0`
 - _integer_ : convert to `float`, preserving value. `NA` equals `0.0`.
 - _string_ : not included in regression; we'll use it later
 - CareerSat: Map (`vd`, `sd`, `ne`, `NA`, `ss`, `vs`) to (-2.0, -1.0, 0.0, 0.0, 1.0, 2.0)
 - EdLevel: Map (`other`, `bachelors`, `masters`, `doctoral`) to (0.0, 1.0, 1.5, 2.0)
 - MgrIdiot: Map (`NA`, `not`, `some`, `very`) to (-1.0, -1.0, 0.0, 1.0)
 - OpSys: Map (`win`, `mac`, `NA`, `tux`, `BSD`) to (-1.0, 0.0, 0.0, 1.0, 1.0)
 - OpenSourcer : Map (`never`, `year`, `month-year`, `month`) to (0.0, 0.5, 1.0, 2.0)
 - OrgSize: Map each range "$a$-$b$" to the value $ln(a)$. Treat `NA` as `ln(1.0) = 0`. We are converting an exponentially distributed range to a linearly distributed one.

Remove the columns listed above as being ignored.

Some hints:
  1. Load the csv file with `pd.read_csv(filname, dtype=str, keep_default_na=False)` to ensure that you load all columns as text (so you can do your own preprocessing), and ignore pandas's default conversion to NaN values.
  2. Use the `.apply()` function in pandas to convert

In [3]:
def type_boolean_test(type_boolean):
    test.true(isinstance(type_boolean("y"), float))
    test.equal(type_boolean("y"), 1.0)
    test.equal(type_boolean("n"), 0.0)
    test.exception(lambda: type_boolean("5"))

@test
def type_boolean(c):
    if c == "y": return 1.0
    elif c == "n": return 0.0
    elif c == "NA": return 0.0
    raise ValueError(c)


import re

# Integer
def type_integer_test(type_integer):
    test.true(isinstance(type_integer("5"), float))
    test.equal(type_integer("3"), 3.0)
    test.equal(type_integer("0"), 0.0)
    test.equal(type_integer("-4"), -4.0)
    test.equal(type_integer("NA"), 0.0)
    test.exception(lambda: type_integer("yes"))

@test
def type_integer(c):
    if c == 'NA': return 0.0
    return float(c)


# CareerSat
def type_CareerSat_test(type_CareerSat):
    test.true(isinstance(type_CareerSat("vd"), float))
    test.equal(type_CareerSat("sd"), -1.0)
    test.equal(type_CareerSat("ne"), 0.0)
    test.equal(type_CareerSat("ss"), 1.0)
    test.equal(type_CareerSat("vs"), 2.0)
    test.exception(lambda: type_CareerSat("yes"))

@test
def type_CareerSat(c):
    if c == "vd": return -2.0
    elif c == "sd": return -1.0
    elif c == "ne": return 0.0
    elif c == "NA": return 0.0
    elif c == "ss": return 1.0
    elif c == "vs": return 2.0
    raise ValueError(c)


# EdLevel
def type_EdLevel_test(type_EdLevel):
    test.true(isinstance(type_EdLevel("other"), float))
    test.equal(type_EdLevel("bachelors"), 1.0)
    test.equal(type_EdLevel("masters"), 1.5)
    test.equal(type_EdLevel("doctoral"), 2.0)
    test.exception(lambda: type_EdLevel("yes"))

@test
def type_EdLevel(c):
    if c == "other": return 0.0
    elif c == "bachelors": return 1.0
    elif c == "masters": return 1.5
    elif c == "doctoral": return 2.0
    raise ValueError(c)


# MgrIdiot
def type_MgrIdiot_test(type_MgrIdiot):
    test.true(isinstance(type_MgrIdiot("NA"), float))
    test.equal(type_MgrIdiot("not"), -1.0)
    test.equal(type_MgrIdiot("some"), 0.0)
    test.equal(type_MgrIdiot("very"), 1.0)
    test.exception(lambda: type_MgrIdiot("yes"))

@test
def type_MgrIdiot(c):
    if c == "NA": return -1.0
    elif c == "not": return -1.0
    elif c == "some": return 0.0
    elif c == "very": return 1.0
    raise ValueError(c)


# OpenSourcer
def type_OpenSourcer_test(type_OpenSourcer):
    test.true(isinstance(type_OpenSourcer("never"), float))
    test.equal(type_OpenSourcer("year"), 0.5)
    test.equal(type_OpenSourcer("month-year"), 1.0)
    test.equal(type_OpenSourcer("month"), 2.0)
    test.exception(lambda: type_OpenSourcer("yes"))

@test
def type_OpenSourcer(c):
    if c == "never": return 0.0
    elif c == "year": return 0.5
    elif c == "month-year": return 1.0
    elif c == "month": return 2.0
    raise ValueError(c)


# OrgSize
def type_OrgSize_test(type_OrgSize):
    test.true(isinstance(type_OrgSize("1"), float))
    test.equal(type_OrgSize("NA"), 0)
    test.equal(type_OrgSize("2-9"), 0.6931471805599453)
    test.equal(type_OrgSize("100-499"), 4.605170185988092)
    test.equal(type_OrgSize("10,000+"), 9.210340371976184)
    test.exception(lambda: type_OrgSize("yes"))

@test
def type_OrgSize(c):
    if c == "NA": return 0
    c = re.sub(r",","",c)
    if (re.match(r"[0-9]*\-[0-9]*",c)!=None):
        value = (int)(c.split('-')[0])
        return np.log(value)
    elif (re.match(r"[0-9]+\+",c)!=None):
        value = (int)(c.split('+')[0])
        return np.log(value)
    elif (re.match(r"[0-9]+",c)!=None):
        value = (int)(c)
        return np.log(value)
    raise ValueError(c)

### TESTING type_boolean: PASSED 4/4
###



Now we use these to convert the data into floating-point numbers. 

This is also where we deal with `OpSys`; from the one column in the source, create three columns (called `OpSysWin`, `OpSysMac`, and `OpSysTux`, corresponding to the values `win`, `mac`, `tux`. Other values can be ignored.) For each row, at most one of the cells must be 1.0, and the others must be 0.0. If the value in the cell is `NA`, then all the cells must be 0.0.

This is called a [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) and is a common way to handle category variables.

In [5]:
def quick_checksum(st):
    return hashlib.md5((" ".join(sorted(st))).encode()).hexdigest()

def convert_data_stackoverflow_test(convert_data_stackoverflow):
    headers, rows = convert_data_stackoverflow(*read_csv())
    # If this test fails, your headers are incorrect:
    test.equal(quick_checksum(headers), "5f9fc24f9e8dae961c2a778f493940ed")
    test.equal(set(headers), {'CareerSat', 'MgrWant', 'Age', 'CodeRevHrs', 'ConvertedComp', 'Dependents', 
                              'DevEnvironVSC', 'DevTypeFullStack', 'EdLevel', 'EduOtherMOOC', 'EduOtherSelf', 
                              'Extraversion', 'GenderIsMan', 'Hobbyist', 'MgrIdiot', 'OpSysWin', 'OpSysMac', 
                              'OpSysTux', 'OpenSourcer', 'OrgSize', 'Student', 'UndergradMajorIsComputerScience',
                              'UnitTestsProcess', 'WorkWeekHrs', 'YearsCode', 'YearsCodePro'})

    # Type check:
    test.true(all(all(isinstance(v, float) for v in r) for r in rows))
    # Operating System columns:
    for row in rows:
        d = dict(zip(headers, row))
#         print(d["OpSysWin"])
        if sorted([d["OpSysWin"], d["OpSysMac"], d["OpSysTux"]]) not in [[.0, .0, 1.], [0.]*3]:
            test.true(False)
            break
    else:
        test.true("There is correctly at most one OpSys* column set to 1.0")
        
    # More direct tests
    test.equal(dict(zip(headers, rows[-2])), {'CareerSat': -1.0, 'MgrWant': 1.0, 'Age': 0.0, 'CodeRevHrs': 5.0, 'ConvertedComp': 588012.0, 'Dependents': 1.0, 'DevEnvironVSC': 1.0, 'DevTypeFullStack': 1.0, 'EdLevel': 1.5, 'EduOtherMOOC': 0.0, 'EduOtherSelf': 0.0, 'Extraversion': 0.0, 'GenderIsMan': 1.0, 'Hobbyist': 1.0, 'MgrIdiot': -1.0, 'OpSysWin': 0.0, 'OpSysMac': 0.0, 'OpSysTux': 1.0, 'OpenSourcer': 0.0, 'OrgSize': 4.605170185988092, 'Student': 1.0, 'UndergradMajorIsComputerScience': 1.0, 'UnitTestsProcess': 1.0, 'WorkWeekHrs': 40.0, 'YearsCode': 10.0, 'YearsCodePro': 8.0})
    test.equal(dict(zip(headers, rows[-1])), {'CareerSat': -1.0, 'MgrWant': 0.0, 'Age': 33.0, 'CodeRevHrs': 0.0, 'ConvertedComp': 22915.0, 'Dependents': 0.0, 'DevEnvironVSC': 0.0, 'DevTypeFullStack': 0.0, 'EdLevel': 1.0, 'EduOtherMOOC': 0.0, 'EduOtherSelf': 0.0, 'Extraversion': 1.0, 'GenderIsMan': 1.0, 'Hobbyist': 0.0, 'MgrIdiot': -1.0, 'OpSysWin': 0.0, 'OpSysMac': 0.0, 'OpSysTux': 1.0, 'OpenSourcer': 2.0, 'OrgSize': 2.995732273553991, 'Student': 0.0, 'UndergradMajorIsComputerScience': 1.0, 'UnitTestsProcess': 1.0, 'WorkWeekHrs': 48.0, 'YearsCode': 9.0, 'YearsCodePro': 5.0})

    

@test
def convert_data_stackoverflow(headers, data):
    """convert the data into 
    
    args:
        header : List[str] -- the header for each column in the CSV
        data : List[Tuple[str]] -- the CSV data, where each inner list corresponds to a row in the CSV file.
 
    returns: Tuple[headers, body] where
      headers : List[str] -- the new headers, dropping the Country and Respondent headers and expanding 
      body : List[List[str,...]] -- the CSV body
    """
    body = []
#     print(headers)
    i_CareerSat = headers.index('CareerSat')
    i_MgrWant = headers.index('MgrWant')
    i_Hobbyist = headers.index('Hobbyist')
    i_OpenSourcer = headers.index('OpenSourcer')
    i_Student = headers.index('Student')
    i_EdLevel = headers.index('EdLevel')
    i_UndergradMajor = headers.index('UndergradMajorIsComputerScience')
    i_EduOtherMOOC = headers.index('EduOtherMOOC')
    i_EduOtherSelf = headers.index('EduOtherSelf')
    i_OrgSize = headers.index('OrgSize')
    i_DevTypeFullStack = headers.index('DevTypeFullStack')
    i_YearsCode = headers.index('YearsCode')
    i_YearsCodePro = headers.index('YearsCodePro')
    i_MgrIdiot = headers.index('MgrIdiot')
    i_ConvertedComp = headers.index('ConvertedComp')
    i_WorkWeekHrs = headers.index('WorkWeekHrs')
    i_CodeRevHrs = headers.index('CodeRevHrs')
    i_UnitTestsProcess = headers.index('UnitTestsProcess')
    i_DevEnvironVSC = headers.index('DevEnvironVSC')
    i_OpSys = headers.index('OpSys')
    i_Extraversion = headers.index('Extraversion')
    i_Age = headers.index('Age')
    i_GenderIsMan = headers.index('GenderIsMan')
    i_Dependents = headers.index('Dependents')
    for d in data:
        body.append([type_CareerSat(d[i_CareerSat])])
    for i, d in enumerate(data):
        body[i].extend([type_boolean(d[i_MgrWant]),
                        type_boolean(d[i_Hobbyist]),
                        type_OpenSourcer(d[i_OpenSourcer]),
                        type_boolean(d[i_Student]),
                        type_EdLevel(d[i_EdLevel]),
                        type_boolean(d[i_UndergradMajor]),
                        type_boolean(d[i_EduOtherMOOC]),
                        type_boolean(d[i_EduOtherSelf]),
                        type_OrgSize(d[i_OrgSize]),
                        type_boolean(d[i_DevTypeFullStack]),
                        type_integer(d[i_YearsCode]),
                        type_integer(d[i_YearsCodePro]),
                        type_MgrIdiot(d[i_MgrIdiot]),
                        type_integer(d[i_ConvertedComp]),
                        type_integer(d[i_WorkWeekHrs]),
                        type_integer(d[i_CodeRevHrs]),
                        type_boolean(d[i_UnitTestsProcess]),
                        type_boolean(d[i_DevEnvironVSC]),
                        type_boolean(d[i_Extraversion]),
                        type_integer(d[i_Age]),
                        type_boolean(d[i_GenderIsMan]),
                        type_boolean(d[i_Dependents])])
    for i, d in enumerate(data):
        if (d[i_OpSys] == "win"):
            body[i].extend([1.0, 0.0, 0.0])
        elif (d[i_OpSys] == "mac"):
            body[i].extend([0.0, 1.0, 0.0])
        elif (d[i_OpSys] == "tux"):
            body[i].extend([0.0, 0.0, 1.0])
        else:
            body[i].extend([0.0, 0.0, 0.0])
    headers.remove("Country")
    headers.remove("Respondent")
    headers.remove("OpSys")
    headers.append("OpSysWin")
    headers.append("OpSysMac")
    headers.append("OpSysTux")
#     print(headers)

#     print(data[0])
#     print(body[0])
#     print(data[100])
#     print(body[100])
        
    return headers, body

### TESTING convert_data_stackoverflow: PASSED 5/6
# 2	: Assertion failed
###



### Q2 Splitting Data

Now we prepare the converted data for regression. In this step, we:

 1. split this into training and validation sets,
 2. convert it to a Numpy `ndarray` with underlying type `np.float32`,
 3. split each set into the predicted columns and the feature columns.

We will save the first 20% of the dataset (rounded down) as the validation set and keep the remaining as the training set. (Note that it is common practice to randomize the dataset; this has already been done. Don't shuffle the dataset for this assignment.)

Ensure that the underlying type of the `ndarray` is `np.float32`, not the default `np.float64`. We do not need the added precision of 64-bit floating point numbers for this problem, and using the smaller numbers will speed up computation and reduce the amount of memory we need.

In [6]:
def split_data_test(split_data):
    headers, rows = convert_data_stackoverflow(*read_csv())
    l = len(rows)
    
    val, train = split_data(rows)
    test.equal(len(val), l // 5)
    test.true(isinstance(val, np.ndarray))
    test.equal(val.dtype, np.float32)
    test.equal(len(train), l - (l // 5))
    test.true(isinstance(train, np.ndarray))
    test.equal(train.dtype, np.float32)

@test
def split_data(data):
    """split the data into training and validation sets, and convert them to np.ndarray. (Step 1 and 2 above.)

    args:
        data : List[List[str]] -- the CSV data, where each inner list corresponds to a row in the CSV file.

    returns: Tuple[val, train] where
      val  : np.ndarray[num_val_rows, num_features] -- the first 20% of the dataset (rounded down)
      train : np.ndarray[num_train_rows, num_features] -- the remaining rows from data
    
    Ensure that the underlying type of the output is np.float32, not the default np.float64.
    """

    val = np.array([data[i] for i in range(0, len(data)//5)]).astype('float32')
    train = np.array([data[i] for i in range(len(data)//5, len(data))]).astype('float32')
    return val, train

### TESTING split_data: PASSED 6/6
###



In [7]:
def separate_objective_test(separate_objective):
    headers, rows = convert_data_stackoverflow(*read_csv())
    val, train = split_data(rows)

    for subset in [val, train]:
        subset_headers, subset_features, subset_objectives = separate_objective(headers, subset, ["CareerSat", "MgrWant"])

        test.true(isinstance(subset_objectives, tuple))
        test.equal(len(subset_objectives), 2)
        test.true("CareerSat" not in subset_headers)
        test.true("MgrWant" not in subset_headers)
        test.equal(subset_features.shape[1], 24)

@test
def separate_objective(headers, data, objectives):
    """split the objective columns from the headers and data. (Step 1 and 2 above.)

    args:
        headers    : List[str] -- the headers for the data, used to find the objective columns from the data array
        data       : np.ndarray[num_rows, num_columns] -- the data
        objectives : the columns to extract from the data

    returns: Tuple[o_headers, o_features, o_objectives] where
      o_headers  : List[str] -- a list of headers without the objective columns
      o_features : np.ndarray[num_train_rows, num_features] -- the remaining columns from data. (num_features = num_columns - len(objectives))
      o_objectives : Tuple[np.ndarray[num_train_rows], ...] -- a list of objective columns from the data, each element is a 1-dimensional np.ndarray corresponding to the entry in objectives.
     """


    o_headers = headers[:]
    o_features = data[:]
    objs = ["" for i in range(0, len(objectives))]

    for i, o in enumerate(objectives):
        if o in o_headers:
            objs[i] = np.array([f[o_headers.index(o)] for f in o_features])
#             print(objs)
            o_features = np.delete(o_features, o_headers.index(o), axis=1)
            o_headers.remove(o)
    o_objectives = tuple(objs)

    return o_headers, o_features, o_objectives

### TESTING separate_objective: PASSED 10/10
###



### Q3 Linear Regression

Now you will finally implement a linear regression. As a reminder, linear regression models the data as

$$\mathbf y = \mathbf X\mathbf \beta + \mathbf \epsilon$$

where $\mathbf y$ is a vector of outputs, $\mathbf X$ is also known as the design matrix, $\mathbf \beta$ is a vector of parameters, and $\mathbf \epsilon$ is noise. We will be estimating $\mathbf \beta$ using Ordinary Least Squares, and we recommending following the matrix notation for this problem (https://en.wikipedia.org/wiki/Ordinary_least_squares).

You are not allowed to use `scipy` in your submission for this assignment, but you are encouraged to use it to test your solution. Make sure that you only ever `import scipy` inside a `_test` function.

Hints:

 1. You should use `np.linalg.solve` to calculate `beta`.
 2. Feel free to add `1e-4*np.eye(...)` to the coefficient matrix

In [10]:
class LinearRegression():
    """ Perform linear regression and predict the output on unseen examples. 
    
    attributes: 
        beta (np.ndarray) : vector containing parameters for the features
    """

    def train(self, X, y):
        """ Train the linear regression model by computing the estimate of the parameters
        You should store the model parameters in self.beta, overwriting parameters as necessary.

        args: 
            X (np.ndarray[num_examples, num_columns]) : matrix of training data
            y (np.ndarray[num_examples]) : vector of output variables

        return: LinearRegression -- returns itself (for convenience)
        """

        self.beta = np.zeros(X.shape[1])
        self.beta = np.linalg.solve(X.T @ X + 1e-4*np.eye(X.shape[1]), X.T @ y)

        return self

    def predict(self, X_p): 
        """ Use the learned model to predict the output of X_p

        args: 
            X_p (np.ndarray[num_examples, num_columns]) matrix of test/validation data where each row corresponds to an example

        return: 
            (np.ndarray[num_examples]) vector of predicted outputs
        """
        pred = X_p @ self.beta
        return pred

In [11]:
# Don't remove this function; we use it for the auto-grader.
@test
def linear_regression_instance():
    return LinearRegression()


def linear_regression_instance_test(linear_regression_instance):
    lr = linear_regression_instance()


    # If this throws a Singular Matrix error, your smoothing is bad:
    test.equal(lr.train(np.zeros((20, 5)), np.ones((20,))).beta.tolist(), [0.0]*5)
    

    # Basic functionality tests:
    test.equal(lr.train(np.eye(6)*(1-1e-4), np.ones((6,))).beta.round(4).tolist(), [1.0]*6)
    test.equal(lr.train(np.array([[0., 1.], [1., 2.], [2., 3.]]), np.array([1., 2., 3.])).beta.round(4).tolist(), [0.0001, 0.9999])

### TESTING linear_regression_instance: PASSED 3/3
###



## Q4 Evaluation versus baselines

As a final consideration, If you implement this properly, you will see that we get a squared error of around 1.3 on the validation set.  Is this "good"?  This is one of the more subtle points of many data science problems, but we can start to get some sense of this by looking at what the predictions would look like if we _just_ predicted the mean target output on the training set (this is essentially the "simplest" prediction we could make, if we didn't look at the features at all).

Implement the following function to evaluate our linear regression.  Think about what this signifies about the quality of the solution.

In [12]:
def mean_squared_error_test(mean_squared_error):
    test.equal(mean_squared_error(np.ones(10), np.ones(10)), 0)
    test.equal(mean_squared_error(np.ones(10), np.zeros(10)), 1)

@test
def mean_squared_error(pred, ground_truth):
    """ calculate the mean mean-squared-error between pred and ground_truth
    
    args:
      pred : np.ndarray[num_examples] -- the predictions
      ground_truth : np.ndarray[num_examples] -- the ground truth values
      
    returns: float -- the average mean-squared-error between predictions and ground_truth values.
    """

    sum = 0
    for i in range(0, len(pred)):
        sum += np.square(pred[i] - ground_truth[i])
    return sum/len(pred)





### TESTING mean_squared_error: PASSED 2/2
###

