In [None]:
# Unless a problem instructs differently, you may use any functions available from the following library imports
import numpy as np
from scipy import stats
import sklearn
import statsmodels.api as sm
from scipy.special import expit as invlogit
import matplotlib.pyplot as plt

# Problem 2 (5 points)

Provide a model fit that is robust against outliers by defining the function `huber_SLR(x, y, c, K=10, eps=1e-7)` (for simple linear regression) which implements ***M estimation*** for ***Huber loss*** with the ***IRLS*** algorithm where the weight of a data point is $1$ if $|y_i-x_i^t\beta^{(t)}| \leq c$ and $0$ otherwise.

***Hints:*** 

- Your `huber_SLR` function will be tested directly using data similar to the example below.
- Use the `OLS` function rather than the `WLS` function by subsetting the data to only the points with weight $1$.
- This algorithm is specified in the course notes as well as in Keith Knight's STA410 [notes14.pdf document](https://q.utoronto.ca/courses/296804/files?preview=25407629).

In [None]:
# Simulate n points from a line with slope 5 whose noise scale, (1+|x|),
# grows with |x|; the seed makes the draws reproducible.
n = 100
np.random.seed(1)
x = stats.norm.rvs(size=n)
y = 5*x + (1+np.abs(x))*stats.norm().rvs(size=n) 
# Two side-by-side panels: clean data on the left, corrupted data on the right.
fig,ax = plt.subplots(1,2,figsize=(10,5))
ax[0].plot(x,y,'.', label="Clean Data")

# Ordinary least squares (with an intercept column) on the clean data.
# https://www.statsmodels.org/dev/generated/statsmodels.regression.linear_model.OLS.html
X = sm.add_constant(x)
model = sm.OLS(y,X)
fit = model.fit()
# Grid of x values on which the fitted lines are drawn.
support = np.linspace(-3,3,20)
ax[0].plot(support,fit.predict(sm.add_constant(support)),
           label="Clean Data Model Fit")
ax[0].legend()

# Corrupt the data: x[:n_corrupted] is a view, so the shuffle permutes the
# first n_corrupted entries of x IN PLACE while y is left untouched, breaking
# the pairing between those x values and their responses.
# NOTE: x is mutated here, so re-running this part without re-running the
# simulation above shuffles already-corrupted data again.
n_corrupted = 10
np.random.shuffle(x[:n_corrupted])
# `fit` still refers to the clean-data model at this point, so this draws the
# clean fit on the right-hand panel as a reference line.
ax[1].plot(support,fit.predict(sm.add_constant(support)),
           label="Clean Data Model Fit")
ax[1].plot(x,y,'.', label="Corrupted Data")
# Refit OLS on the corrupted data; X, model and fit are rebound to the
# corrupted-data versions from here on.
X = sm.add_constant(x)
model = sm.OLS(y,X)
fit = model.fit()
# Same grid as before (recomputed with identical arguments).
support = np.linspace(-3,3,20)
ax[1].plot(support,fit.predict(sm.add_constant(support)),
           label="Corrupted Data Model Fit")
ax[1].legend()

In [None]:
def huber_SLR(x, y, c, K=10, eps=1e-7):
    '''
        Fits a simple linear regression y=ax+b using 
        huber loss with arbitrary tuning parameter c

        The fit is computed by IRLS with 0/1 weights: starting from the
        ordinary least squares fit on all of the data, each step refits
        OLS on only those points whose absolute residual under the
        current coefficients is at most c.
        
        x:   (np.array)  independent variable
        y:   (np.array)  dependent variable
        c:   (float)     |y_i-yhat_i|>c makes w_i=0; otherwise, w_i=1
        K:   (int)       maximum IRLS steps (refits after the initial OLS fit)
        eps: (float)     stopping criterion returns IRLS fit at step k
                         if ||(a_k,b_k)-(a_{k-1},b_{k-1})||_2^2 < eps
                         
        returns IRLS fit (statsmodels OLS) object
                after K steps or when eps stopping criterion is met

        If fewer points than coefficients have weight 1 at some step, the
        iteration stops and the most recent fit is returned.
    '''

    x = np.asarray(x)
    y = np.asarray(y).ravel()

    # Step 0: OLS on all of the data provides the starting coefficients.
    X = sm.add_constant(x)
    fit = sm.OLS(y, X).fit()
    params = fit.params

    for _ in range(K):
        # Weight 1 (keep) when the residual under the current coefficients
        # is within c, weight 0 (drop) otherwise.
        keep = np.abs(y - X @ params) <= c
        # A refit needs at least as many points as coefficients; otherwise
        # the update is not identified, so stop with the last usable fit.
        if keep.sum() < X.shape[1]:
            break

        # With 0/1 weights, WLS is the same as OLS on the kept points.
        # X already contains the constant column, so subset X directly.
        new_fit = sm.OLS(y[keep], X[keep]).fit()
        change = np.sum((new_fit.params - params) ** 2)
        fit, params = new_fit, new_fit.params

        # Early stopping on the squared L2 change in the coefficients.
        if change < eps:
            break

    return fit

## Problem 2 question 0-2 (2 points)

0-2. Your `huber_SLR` will be tested on the data above for various choices of `c`, `K`, and `eps`.

- You do not need to assign any variables for this problem -- your `huber_SLR` function will be called directly.

## Problem 2 question 3 (1 point)

3. Which of the following is the smallest integer value of $c$ for which the `huber_SLR` simple linear regression fit becomes extremely similar to the "Clean Data Model Fit" for the data?

    1. 1
    2. 2
    3. 3
    4. 4

***Hint:*** if you replace `ax[1].legend()` with 

```python
huber_fit = huber_SLR(x,y,c=2,K=10)
ax[1].plot(support,huber_fit.predict(sm.add_constant(support)),
           label="Huber Loss Corrupted Data Model Fit")
ax[1].legend()
```

in the plotting demonstration code above you can see the simple linear regression line fit with the ***Huber loss*** function. 

In [None]:
# 1.0 point [format: `str` either "A" or "B" or "C" or "D" based on the choices above]
p2q3 = #<"A"|"B"|"C"|"D"> 
# Uncomment the placeholder above and keep exactly one of "A", "B", "C", or "D"

# This cell will produce a runtime error until the `p2q3` variable is assigned a value

## Problem 2 question 4-6 (1 point)

4-6. Define a new function `huber_MLR` which generalizes the simple linear regression function `huber_SLR` to accept a multivariate design matrix `X` rather than a vector `x`. When defining `huber_MLR` follow the specifications given in the starter code below.

- You do not need to assign any variables for this problem -- your `huber_MLR` function will be tested directly for some design matrix `X` and various choices of `c`, `K`, and `eps`.

In [None]:
def huber_MLR(x, y, c, K=10, eps=1e-7):
    '''
        Fits a multivariate linear regression y = X beta using 
        huber loss with arbitrary tuning parameter c

        The fit is computed by IRLS with 0/1 weights: starting from the
        ordinary least squares fit on all of the data, each step refits
        OLS on only those rows whose absolute residual under the current
        coefficients is at most c.
        
        x(n,p): (np.array)  design matrix X (intercept will not be added);
                            a 1-d array is treated as a single column
        y(n,):  (np.array)  dependent variable
        c:      (float)     |y_i-yhat_i|>c makes w_i=0; otherwise, w_i=1
        K:      (int)       maximum IRLS steps (default K=10), i.e. refits
                            after the initial OLS fit
        eps:    (float)     stopping criterion returns IRLS fit at step k
                            if ||beta_k-beta_{k-1}||_2^2 < eps (default eps=1e-7)
                         
        returns IRLS fit (statsmodels OLS) object
                after K steps or when eps stopping criterion is met

        If fewer rows than coefficients have weight 1 at some step, the
        iteration stops and the most recent fit is returned.
    '''

    # Bind the design matrix from the argument itself so the function never
    # falls back on a notebook-level variable named X.
    X = np.asarray(x)
    if X.ndim == 1:
        X = X[:, None]
    y = np.asarray(y).ravel()

    # Step 0: OLS on all of the data provides the starting coefficients.
    fit = sm.OLS(y, X).fit()
    params = fit.params

    for _ in range(K):
        # Weight 1 (keep) when the residual under the current coefficients
        # is within c, weight 0 (drop) otherwise.
        keep = np.abs(y - X @ params) <= c
        # A refit needs at least as many rows as coefficients; otherwise
        # the update is not identified, so stop with the last usable fit.
        if keep.sum() < X.shape[1]:
            break

        # With 0/1 weights, WLS is the same as OLS on the kept rows.
        new_fit = sm.OLS(y[keep], X[keep]).fit()
        change = np.sum((new_fit.params - params) ** 2)
        fit, params = new_fit, new_fit.params

        # Early stopping on the squared L2 change in the coefficients.
        if change < eps:
            break

    return fit

## Problem 2 questions 7-8 (1 point)

7. What is true about the ***M estimation*** for ***Huber loss*** in problem 2 and logistic regression model fitting in problem 1?

    1. They both substitute ***Fisher information*** for the expected value of the ***Hessian***
    2. Problem 2 specifies a minimization problem while problem 1 specifies a maximization problem
    3. Problem 1 is implemented using an ***IRLS*** algorithm while ***M estimation*** for ***Huber loss*** in problem 2 is not
    4. All of the above


8. For ***Huber loss*** fit with ***M estimation*** as above, which of the following is the same as $E[X^TWX]$, where 

   $$W_{ii}=\left\{\begin{array}{ll} 1 & |y_i-x_i^T\beta|\leq c\\0&\text{otherwise} \end{array}\right. \quad \text{ and } \quad W_{ij}=0 \text{ for } i\neq j$$ 

   and $c$ is the ***Huber loss*** function parameter and $y_i$ is assumed to be independently and identically distributed for all $i$?

    1. $\Pr(|y_i-x_i^T\beta|\leq c)X^TX$ 
    2. ***expected Fisher Information***
    3. ***observed Fisher Information***
    4. The inverse of the negative ***Hessian*** matrix
    5. All of the above

In [None]:
# 0.5 points each [format: `str` either "A" or "B" or "C" or "D" or "E" based on the choices above]
p2q7 = #<"A"|"B"|"C"|"D"> 
p2q8 = #<"A"|"B"|"C"|"D"|"E"> 
# Uncomment the placeholders above and keep exactly one letter for each: "A"-"D" for `p2q7`, "A"-"E" for `p2q8`

# This cell will produce a runtime error until the `p2q7` and `p2q8` variables are assigned values